├── physionet ├── auth.py ├── config.py ├── project │ ├── search.py │ ├── __init__.py │ ├── metadata.py │ └── loader.py ├── datathon │ ├── __init__.py │ └── mlhc.py ├── metrics │ ├── __init__.py │ ├── openalex.py │ └── dimensions.py ├── __main__.py ├── __init__.py ├── validate │ ├── __init__.py │ ├── checks │ │ ├── __init__.py │ │ ├── documentation.py │ │ ├── quality.py │ │ ├── integrity.py │ │ ├── filesystem.py │ │ └── privacy.py │ ├── config.py │ ├── validator.py │ └── models.py ├── api │ ├── __init__.py │ ├── exceptions.py │ ├── utils.py │ ├── models.py │ ├── client.py │ └── endpoints.py └── cli.py ├── requirements.txt ├── tests ├── api │ ├── __init__.py │ ├── test_exceptions.py │ ├── test_utils.py │ ├── test_models.py │ ├── test_client.py │ └── test_endpoints.py ├── test_loader.py ├── test_search.py ├── validate │ ├── __init__.py │ ├── test_cli.py │ ├── test_validator.py │ └── test_checks.py └── test_dataset.py ├── .gitattributes ├── LICENSE ├── .gitignore ├── .github └── workflows │ └── run-tests.yml ├── pyproject.toml └── README.md /physionet/auth.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /physionet/config.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/api/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_loader.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_search.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /physionet/project/search.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /physionet/datathon/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /physionet/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /physionet/metrics/openalex.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /physionet/project/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /physionet/project/metadata.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/validate/__init__.py: -------------------------------------------------------------------------------- 1 | """Tests for validation module.""" 2 | 
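The tree above shows the package's two public entry points: the API client under `physionet/api`, which the top-level `physionet/__init__.py` re-exports later in this listing, and the `physionet/validate` subpackage behind the `physionet validate` CLI. A minimal orientation sketch (the dataset path is a placeholder and the API call assumes network access):

```python
from physionet import PhysioNetClient
from physionet.validate import validate_dataset

# Query published projects through the API client; the context manager
# closes the underlying requests session on exit.
with PhysioNetClient() as client:
    projects = client.projects.list_published()
    print(f"{len(projects)} published projects")

# Run the pre-submission validator on a local dataset directory.
result = validate_dataset("/path/to/dataset", show_progress=False)
print(result.summary())
```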
-------------------------------------------------------------------------------- /physionet/metrics/dimensions.py: -------------------------------------------------------------------------------- 1 | # TODO: add tools for getting metrics from dimensions.ai 2 | -------------------------------------------------------------------------------- /tests/test_dataset.py: -------------------------------------------------------------------------------- 1 | def test_hello_world(): 2 | assert "Hello, world!" == "Hello, world!" 3 | -------------------------------------------------------------------------------- /physionet/__main__.py: -------------------------------------------------------------------------------- 1 | """Allow running the CLI as a module: python -m physionet.""" 2 | 3 | import sys 4 | from physionet.cli import main 5 | 6 | if __name__ == "__main__": 7 | sys.exit(main()) 8 | -------------------------------------------------------------------------------- /physionet/__init__.py: -------------------------------------------------------------------------------- 1 | from physionet.api import PhysioNetClient 2 | 3 | try: 4 | from importlib.metadata import version 5 | __version__ = version("physionet") 6 | except Exception: 7 | __version__ = "unknown" 8 | 9 | __all__ = ["PhysioNetClient"] 10 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.py diff=python 2 | 3 | *.anI binary 4 | *.atr binary 5 | *.d[0-9] binary 6 | *.dat binary 7 | *.edf binary 8 | *.gz binary 9 | *.mat binary 10 | *.qrs binary 11 | *.wabp binary 12 | *.wav binary 13 | *.wqrs binary 14 | *.xyz binary 15 | -------------------------------------------------------------------------------- /physionet/validate/__init__.py: -------------------------------------------------------------------------------- 1 | """Dataset validation module for PhysioNet submissions.""" 2 | 3 | from physionet.validate.validator import validate_dataset 4 | from physionet.validate.config import ValidationConfig 5 | from physionet.validate.models import ValidationResult 6 | 7 | __all__ = ["validate_dataset", "ValidationConfig", "ValidationResult"] 8 | -------------------------------------------------------------------------------- /physionet/api/__init__.py: -------------------------------------------------------------------------------- 1 | from .client import PhysioNetClient 2 | from .exceptions import ( 3 | PhysioNetAPIError, 4 | BadRequestError, 5 | ForbiddenError, 6 | NotFoundError, 7 | RateLimitError, 8 | ) 9 | 10 | __all__ = [ 11 | "PhysioNetClient", 12 | "PhysioNetAPIError", 13 | "BadRequestError", 14 | "ForbiddenError", 15 | "NotFoundError", 16 | "RateLimitError", 17 | ] 18 | -------------------------------------------------------------------------------- /physionet/validate/checks/__init__.py: -------------------------------------------------------------------------------- 1 | """Validation check modules.""" 2 | 3 | from physionet.validate.checks.filesystem import check_filesystem 4 | from physionet.validate.checks.documentation import check_documentation 5 | from physionet.validate.checks.integrity import check_integrity 6 | from physionet.validate.checks.quality import check_quality 7 | from physionet.validate.checks.privacy import check_privacy 8 | 9 | __all__ = [ 10 | "check_filesystem", 11 | "check_documentation", 12 | "check_integrity", 13 | "check_quality", 14 | "check_privacy", 15 | ] 16 | 
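Every check exported above shares the same contract: it takes the dataset path and a `ValidationConfig` and returns a `CheckResult` whose `issues` list holds any findings (the validator additionally probes each check for an optional `progress_callback` keyword). A minimal sketch of running a single check in isolation, with a placeholder directory:

```python
from pathlib import Path

from physionet.validate.checks import check_documentation
from physionet.validate.config import ValidationConfig

config = ValidationConfig()  # default config requires README.md to be present
result = check_documentation(Path("/path/to/dataset"), config)

# Each ValidationIssue carries a severity, the offending file, a message,
# and a suggested fix.
for issue in result.issues:
    print(issue.severity, issue.file, issue.message)
```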
--------------------------------------------------------------------------------
/physionet/api/exceptions.py:
--------------------------------------------------------------------------------
1 | class PhysioNetAPIError(Exception):
2 |     """Base exception for PhysioNet API errors."""
3 | 
4 |     pass
5 | 
6 | 
7 | class BadRequestError(PhysioNetAPIError):
8 |     """Raised when API returns 400 Bad Request."""
9 | 
10 |     pass
11 | 
12 | 
13 | class ForbiddenError(PhysioNetAPIError):
14 |     """Raised when API returns 403 Forbidden."""
15 | 
16 |     pass
17 | 
18 | 
19 | class NotFoundError(PhysioNetAPIError):
20 |     """Raised when API returns 404 Not Found."""
21 | 
22 |     pass
23 | 
24 | 
25 | class RateLimitError(PhysioNetAPIError):
26 |     """Raised when API returns 429 Too Many Requests."""
27 | 
28 |     pass
29 | 
--------------------------------------------------------------------------------
/physionet/project/loader.py:
--------------------------------------------------------------------------------
1 | """
2 | This module contains code for loading and processing PhysioNet data.
3 | """
4 | import requests
5 | 
6 | 
7 | def hello():
8 |     print("Hello world!")
9 | 
10 | 
11 | def _get_request(root='https://physionet.org/api/v1/',
12 |                  endpoint='project/published/'):
13 |     """
14 |     Make a GET request to the PhysioNet API.
15 | 
16 |     Returns:
17 |         response (requests.models.Response): Response object from the API call.
18 |     """
19 |     url = root + endpoint
20 |     response = requests.get(url)
21 | 
22 |     if response.status_code != 200:
23 |         raise Exception(f'Error: request to {url} failed with status {response.status_code}')
24 | 
25 |     return response
26 | 
--------------------------------------------------------------------------------
/physionet/api/utils.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 | import os
3 | 
4 | 
5 | def get_credentials_from_env() -> tuple[Optional[str], Optional[str]]:
6 |     """
7 |     Get PhysioNet credentials from environment variables.
8 | 
9 |     Returns:
10 |         Tuple of (username, password) or (None, None)
11 |     """
12 |     username = os.getenv("PHYSIONET_USERNAME")
13 |     password = os.getenv("PHYSIONET_PASSWORD")
14 |     return username, password
15 | 
16 | 
17 | def format_size(size_bytes: int) -> str:
18 |     """
19 |     Format bytes to human-readable size.
20 | 
21 |     Args:
22 |         size_bytes: Size in bytes
23 | 
24 |     Returns:
25 |         Formatted string (e.g., "1.5 GB")
26 |     """
27 |     for unit in ["B", "KB", "MB", "GB", "TB"]:
28 |         if size_bytes < 1024.0:
29 |             return f"{size_bytes:.2f} {unit}"
30 |         size_bytes /= 1024.0
31 |     return f"{size_bytes:.2f} PB"
32 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright (c) Tom Pollard
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software. 
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .env
11 | .Python
12 | env/
13 | venv/
14 | .venv/
15 | build/
16 | develop-eggs/
17 | dist/
18 | downloads/
19 | eggs/
20 | .eggs/
21 | lib/
22 | lib64/
23 | parts/
24 | sdist/
25 | var/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | 
30 | # PyPI credentials
31 | .pypirc
32 | 
33 | # PyInstaller
34 | # Usually these files are written by a python script from a template
35 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
36 | *.manifest
37 | *.spec
38 | 
39 | # Installer logs
40 | pip-log.txt
41 | pip-delete-this-directory.txt
42 | 
43 | # Unit test / coverage reports
44 | htmlcov/
45 | .tox/
46 | .coverage
47 | .coverage.*
48 | .cache
49 | nosetests.xml
50 | coverage.xml
51 | *.cover
52 | .hypothesis/
53 | 
54 | # Translations
55 | *.mo
56 | *.pot
57 | 
58 | # Django stuff:
59 | *.log
60 | 
61 | # Sphinx documentation
62 | docs/_build/
63 | 
64 | # PyBuilder
65 | target/
66 | 
67 | # Ipython Notebook
68 | .ipynb_checkpoints
69 | 
70 | # DS_Store files
71 | .DS_Store
72 | 
73 | # Local config
74 | .vscode
75 | 
--------------------------------------------------------------------------------
/.github/workflows/run-tests.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 | 
4 | name: Python package
5 | 
6 | on:
7 |   push:
8 |     branches: [ "main" ]
9 |   pull_request:
10 |     branches: [ "main" ]
11 | 
12 | jobs:
13 |   build:
14 |     runs-on: ubuntu-latest
15 |     strategy:
16 |       fail-fast: false
17 |       matrix:
18 |         python-version: ["3.9", "3.10"]
19 |         include:
20 |           - python-version: "3.10"
21 |             coverage: 1
22 |     steps:
23 |     - uses: actions/checkout@v4
24 |     - name: Set up Python ${{ matrix.python-version }}
25 |       uses: actions/setup-python@v5
26 |       with:
27 |         python-version: ${{ matrix.python-version }}
28 |     - name: Install dependencies
29 |       run: |
30 |         python -m pip install --upgrade pip
31 |         python -m pip install flake8 pyright
32 |         python -m pip install -e ".[dev]"
33 |     - name: Lint with flake8
34 |       run: |
35 |         # stop the build if there are Python syntax errors or undefined names
36 |         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
37 |         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
38 |         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
39 |     - name: Test with pytest
40 |       run: |
41 |         pytest .
42 | -------------------------------------------------------------------------------- /tests/api/test_exceptions.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from physionet.api.exceptions import ( 3 | PhysioNetAPIError, 4 | BadRequestError, 5 | ForbiddenError, 6 | NotFoundError, 7 | RateLimitError, 8 | ) 9 | 10 | 11 | def test_base_exception(): 12 | """Test base PhysioNetAPIError exception.""" 13 | with pytest.raises(PhysioNetAPIError): 14 | raise PhysioNetAPIError("Test error") 15 | 16 | 17 | def test_bad_request_error(): 18 | """Test BadRequestError is a subclass of PhysioNetAPIError.""" 19 | with pytest.raises(PhysioNetAPIError): 20 | raise BadRequestError("Bad request") 21 | 22 | 23 | def test_forbidden_error(): 24 | """Test ForbiddenError is a subclass of PhysioNetAPIError.""" 25 | with pytest.raises(PhysioNetAPIError): 26 | raise ForbiddenError("Forbidden") 27 | 28 | 29 | def test_not_found_error(): 30 | """Test NotFoundError is a subclass of PhysioNetAPIError.""" 31 | with pytest.raises(PhysioNetAPIError): 32 | raise NotFoundError("Not found") 33 | 34 | 35 | def test_rate_limit_error(): 36 | """Test RateLimitError is a subclass of PhysioNetAPIError.""" 37 | with pytest.raises(PhysioNetAPIError): 38 | raise RateLimitError("Rate limit exceeded") 39 | 40 | 41 | def test_exception_messages(): 42 | """Test that exception messages are preserved.""" 43 | error_msg = "Custom error message" 44 | 45 | try: 46 | raise BadRequestError(error_msg) 47 | except BadRequestError as e: 48 | assert str(e) == error_msg 49 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "physionet" 7 | version = "0.1.6" 8 | authors = [ 9 | { name="Tom Pollard", email="tpollard@mit.edu" }, 10 | ] 11 | license = {file = "LICENSE"} 12 | description = "A collection of tools for working with the PhysioNet repository." 
13 | readme = "README.md" 14 | requires-python = ">=3.9" 15 | keywords=["physionet", "mimic", "medical", "dataset"] 16 | classifiers = [ 17 | "Programming Language :: Python :: 3", 18 | "License :: OSI Approved :: MIT License", 19 | "Operating System :: OS Independent", 20 | ] 21 | dependencies = [ 22 | "numpy", 23 | "pandas", 24 | "openpyxl", 25 | "requests", 26 | "tqdm", 27 | ] 28 | 29 | [project.optional-dependencies] 30 | dev = [ 31 | "pytest>=7.0.0", 32 | "requests-mock>=1.9.0", 33 | ] 34 | build = [ 35 | "build>=0.10.0", 36 | "twine>=4.0.0", 37 | ] 38 | 39 | [tool.black] 40 | line-length = 119 41 | 42 | [tool.pyright] 43 | reportMissingImports = true 44 | 45 | [project.scripts] 46 | physionet = "physionet.cli:main" 47 | 48 | [project.urls] 49 | homepage = "https://github.com/MIT-LCP/physionet" 50 | repository = "https://github.com/MIT-LCP/physionet" 51 | 52 | [tool.hatch.build.targets.sdist] 53 | exclude = [ 54 | "venv/", 55 | "env/", 56 | ".venv/", 57 | "*.egg-info/", 58 | "dist/", 59 | "build/", 60 | ".pytest_cache/", 61 | ".git/", 62 | ".github/", 63 | "__pycache__/", 64 | "*.pyc", 65 | ".DS_Store", 66 | ] 67 | 68 | [tool.hatch.build.targets.wheel] 69 | packages = ["physionet"] 70 | -------------------------------------------------------------------------------- /physionet/validate/checks/documentation.py: -------------------------------------------------------------------------------- 1 | """Documentation validation checks.""" 2 | 3 | from pathlib import Path 4 | 5 | from physionet.validate.models import CheckResult, ValidationIssue, CheckCategory, Severity 6 | from physionet.validate.config import ValidationConfig 7 | 8 | 9 | def check_documentation(path: Path, config: ValidationConfig) -> CheckResult: 10 | """ 11 | Check documentation completeness. 12 | 13 | Validates: 14 | - Required files exist (if any are specified in config) 15 | 16 | Args: 17 | path: Path to dataset directory 18 | config: Validation configuration 19 | 20 | Returns: 21 | CheckResult with any documentation issues found 22 | """ 23 | result = CheckResult(category=CheckCategory.DOCUMENTATION) 24 | 25 | # Check for required files 26 | for required_file in config.required_files: 27 | file_path = path / required_file 28 | if not file_path.exists(): 29 | # Customize suggestion for README.md 30 | if required_file == "README.md": 31 | suggestion = ( 32 | "Add README.md to your dataset. At minimum, the file should include " 33 | "a title and a brief description of the package content." 34 | ) 35 | else: 36 | suggestion = f"Add {required_file} to your dataset" 37 | 38 | result.issues.append( 39 | ValidationIssue( 40 | severity=Severity.ERROR, 41 | category=CheckCategory.DOCUMENTATION, 42 | file=required_file, 43 | message=f"Required file not found: {required_file}", 44 | suggestion=suggestion, 45 | ) 46 | ) 47 | 48 | return result 49 | -------------------------------------------------------------------------------- /physionet/datathon/mlhc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Temporary module for the MLHC Professional Studies Class. 3 | """ 4 | from collections import Counter 5 | 6 | from google.colab import widgets 7 | import numpy as np 8 | 9 | 10 | def visualize_notes(notes, hadm_id): 11 | """ 12 | Temporary function for visualizing notes. 
13 |     """
14 |     # When did this patient arrive? Admission time anchors each note's time offset.
15 |     admittime = notes[notes.hadm_id == hadm_id].admittime.values[0]
16 | 
17 |     # Get the notes for this patient
18 |     notes_subject = notes.loc[notes.hadm_id == hadm_id]
19 | 
20 |     # How many notes for each category?
21 |     category_counts = Counter(notes_subject.category.values)
22 |     category_sorted = sorted(category_counts.keys(), key=lambda t: category_counts[t], reverse=True)
23 | 
24 |     # Outer tab is for different categories of notes
25 |     outer_tab = widgets.TabBar(category_sorted, location="top")
26 |     for category in category_sorted:
27 |         with outer_tab.output_to(category):
28 |             notes_cat = notes_subject.loc[notes_subject.category == category]
29 |             titles = []
30 |             for num, (i, row) in enumerate(notes_cat.iterrows()):
31 |                 # Hours elapsed between admission and this note's charttime
32 |                 time_offset = (row.charttime - admittime).total_seconds() / 3600.0
33 |                 time_offset = int(time_offset) if not np.isnan(time_offset) else "n/a"
34 | 
35 |                 # Tab title includes the note's offset (in hours) from admission
36 |                 titles += ["%s Note #%d (%s Hours)" % (category, num, time_offset)]
37 | 
38 |             # Inner tab is for each note in a category
39 |             inner_tab = widgets.TabBar(titles, location="start")
40 |             for i in range(len(titles)):
41 |                 with inner_tab.output_to(titles[i]):
42 |                     print(notes_cat.iloc[i]["text"])
43 | 
--------------------------------------------------------------------------------
/physionet/validate/config.py:
--------------------------------------------------------------------------------
1 | """Configuration for validation checks."""
2 | 
3 | from dataclasses import dataclass, field
4 | from typing import Dict, List, Optional, Tuple
5 | 
6 | 
7 | @dataclass
8 | class ValidationConfig:
9 |     """Configuration for dataset validation."""
10 | 
11 |     # General settings
12 |     check_filesystem: bool = True
13 |     check_documentation: bool = True
14 |     check_integrity: bool = True
15 |     check_quality: bool = True
16 |     check_phi: bool = True
17 | 
18 |     # File system settings
19 |     max_file_size_bytes: Optional[int] = None  # None = no limit
20 |     warn_small_files_threshold: int = 100  # Warn if more than this many small files
21 |     ignore_patterns: List[str] = field(default_factory=lambda: [
22 |         ".git", ".gitignore", ".DS_Store", "__pycache__", "*.pyc", ".pytest_cache"
23 |     ])
24 | 
25 |     # Documentation settings
26 |     required_files: List[str] = field(default_factory=lambda: ["README.md"])
27 |     recommended_readme_sections: List[str] = field(default_factory=list)
28 | 
29 |     # Performance settings
30 |     max_rows_to_scan: Optional[int] = 10000  # Max rows to scan per CSV for privacy/quality checks (None = all rows)
31 |     sample_large_files: bool = True  # If True, sample rows from large files instead of scanning all
32 | 
33 |     # Quality settings
34 |     missing_value_threshold: float = 1.0  # Warn if column has 100% missing values
35 |     value_ranges: Dict[str, Tuple[float, float]] = field(default_factory=dict)
36 |     # Example: {"heart_rate": (20, 300), "temperature": (32, 43)}
37 | 
38 |     # Privacy settings
39 |     allowed_age_max: int = 89
40 |     phi_patterns: List[str] = field(default_factory=lambda: [
41 |         r"\b\d{3}-\d{2}-\d{4}\b",  # SSN pattern
42 |         r"\b[\w\.-]+@[\w\.-]+\.\w+\b",  # Email pattern
43 |         r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b",  # Phone pattern
44 |     ])
45 | 
--------------------------------------------------------------------------------
/tests/api/test_utils.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import os
3 | from physionet.api.utils 
import get_credentials_from_env, format_size 4 | 5 | 6 | def test_get_credentials_from_env_with_credentials(monkeypatch): 7 | """Test getting credentials from environment variables.""" 8 | monkeypatch.setenv("PHYSIONET_USERNAME", "testuser") 9 | monkeypatch.setenv("PHYSIONET_PASSWORD", "testpass") 10 | 11 | username, password = get_credentials_from_env() 12 | 13 | assert username == "testuser" 14 | assert password == "testpass" 15 | 16 | 17 | def test_get_credentials_from_env_without_credentials(monkeypatch): 18 | """Test getting credentials when environment variables are not set.""" 19 | monkeypatch.delenv("PHYSIONET_USERNAME", raising=False) 20 | monkeypatch.delenv("PHYSIONET_PASSWORD", raising=False) 21 | 22 | username, password = get_credentials_from_env() 23 | 24 | assert username is None 25 | assert password is None 26 | 27 | 28 | def test_get_credentials_from_env_partial(monkeypatch): 29 | """Test getting credentials when only one variable is set.""" 30 | monkeypatch.setenv("PHYSIONET_USERNAME", "testuser") 31 | monkeypatch.delenv("PHYSIONET_PASSWORD", raising=False) 32 | 33 | username, password = get_credentials_from_env() 34 | 35 | assert username == "testuser" 36 | assert password is None 37 | 38 | 39 | def test_format_size_bytes(): 40 | """Test formatting bytes.""" 41 | assert format_size(100) == "100.00 B" 42 | assert format_size(512) == "512.00 B" 43 | 44 | 45 | def test_format_size_kilobytes(): 46 | """Test formatting kilobytes.""" 47 | assert format_size(1024) == "1.00 KB" 48 | assert format_size(1536) == "1.50 KB" 49 | assert format_size(2048) == "2.00 KB" 50 | 51 | 52 | def test_format_size_megabytes(): 53 | """Test formatting megabytes.""" 54 | assert format_size(1024 * 1024) == "1.00 MB" 55 | assert format_size(1024 * 1024 * 5) == "5.00 MB" 56 | assert format_size(1024 * 1024 * 1.5) == "1.50 MB" 57 | 58 | 59 | def test_format_size_gigabytes(): 60 | """Test formatting gigabytes.""" 61 | assert format_size(1024 * 1024 * 1024) == "1.00 GB" 62 | assert format_size(1024 * 1024 * 1024 * 2.5) == "2.50 GB" 63 | 64 | 65 | def test_format_size_terabytes(): 66 | """Test formatting terabytes.""" 67 | assert format_size(1024 * 1024 * 1024 * 1024) == "1.00 TB" 68 | assert format_size(1024 * 1024 * 1024 * 1024 * 3) == "3.00 TB" 69 | 70 | 71 | def test_format_size_petabytes(): 72 | """Test formatting petabytes.""" 73 | assert format_size(1024 * 1024 * 1024 * 1024 * 1024) == "1.00 PB" 74 | assert format_size(1024 * 1024 * 1024 * 1024 * 1024 * 2) == "2.00 PB" 75 | 76 | 77 | def test_format_size_edge_cases(): 78 | """Test edge cases for size formatting.""" 79 | assert format_size(0) == "0.00 B" 80 | assert format_size(1) == "1.00 B" 81 | assert format_size(1023) == "1023.00 B" 82 | -------------------------------------------------------------------------------- /physionet/api/models.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Optional, List, Any 3 | 4 | 5 | @dataclass 6 | class ProjectVersion: 7 | """Represents a project version.""" 8 | 9 | slug: str 10 | title: str 11 | version: str 12 | abstract: str 13 | citation: str 14 | 15 | 16 | @dataclass 17 | class PublishedProject: 18 | """Represents a published project.""" 19 | 20 | slug: str 21 | version: str 22 | title: str 23 | short_description: str 24 | abstract: str 25 | core_doi: Optional[str] 26 | version_doi: Optional[str] 27 | is_latest_version: bool 28 | publish_date: str 29 | license: Optional[dict] 30 | dua: Optional[dict] 31 | 
main_storage_size: int 32 | compressed_storage_size: int 33 | 34 | @classmethod 35 | def from_dict(cls, data: dict) -> "PublishedProject": 36 | """Create instance from API response dictionary.""" 37 | return cls( 38 | slug=data["slug"], 39 | version=data["version"], 40 | title=data["title"], 41 | short_description=data.get("short_description", ""), 42 | abstract=data.get("abstract", ""), 43 | core_doi=data.get("core_doi"), 44 | version_doi=data.get("version_doi"), 45 | is_latest_version=data.get("is_latest_version", False), 46 | publish_date=data.get("publish_date", ""), 47 | license=data.get("license"), 48 | dua=data.get("dua"), 49 | main_storage_size=data.get("main_storage_size", 0), 50 | compressed_storage_size=data.get("compressed_storage_size", 0), 51 | ) 52 | 53 | 54 | @dataclass 55 | class ProjectDetail: 56 | """Detailed project information.""" 57 | 58 | slug: str 59 | title: str 60 | version: str 61 | abstract: str 62 | license: Optional[dict] 63 | short_description: str 64 | project_home_page: Optional[str] 65 | publish_datetime: str 66 | doi: str 67 | main_storage_size: int 68 | compressed_storage_size: int 69 | 70 | @classmethod 71 | def from_dict(cls, data: dict) -> "ProjectDetail": 72 | """Create instance from API response dictionary.""" 73 | return cls( 74 | slug=data["slug"], 75 | title=data["title"], 76 | version=data["version"], 77 | abstract=data.get("abstract", ""), 78 | license=data.get("license"), 79 | short_description=data.get("short_description", ""), 80 | project_home_page=data.get("project_home_page"), 81 | publish_datetime=data.get("publish_datetime", ""), 82 | doi=data.get("doi", ""), 83 | main_storage_size=data.get("main_storage_size", 0), 84 | compressed_storage_size=data.get("compressed_storage_size", 0), 85 | ) 86 | 87 | 88 | @dataclass 89 | class PaginatedResponse: 90 | """Paginated API response.""" 91 | 92 | count: int 93 | next: Optional[str] 94 | previous: Optional[str] 95 | results: List[Any] 96 | -------------------------------------------------------------------------------- /physionet/api/client.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from typing import Optional, Dict, Any 3 | from urllib.parse import urljoin 4 | 5 | from .exceptions import ( 6 | PhysioNetAPIError, 7 | BadRequestError, 8 | ForbiddenError, 9 | NotFoundError, 10 | RateLimitError, 11 | ) 12 | from .endpoints import ProjectsAPI 13 | 14 | 15 | class PhysioNetClient: 16 | """Main client for interacting with PhysioNet API v1.""" 17 | 18 | def __init__( 19 | self, 20 | base_url: str = "https://physionet.org", 21 | username: Optional[str] = None, 22 | password: Optional[str] = None, 23 | timeout: int = 30, 24 | ): 25 | """ 26 | Initialize PhysioNet API client. 
27 | 28 | Args: 29 | base_url: Base URL for PhysioNet (default: https://physionet.org) 30 | username: Optional username for authenticated requests 31 | password: Optional password for authenticated requests 32 | timeout: Request timeout in seconds 33 | """ 34 | self.base_url = base_url.rstrip("/") 35 | self.api_base = f"{self.base_url}/api/v1/" 36 | self.timeout = timeout 37 | self.session = requests.Session() 38 | 39 | if username and password: 40 | self.session.auth = (username, password) 41 | 42 | self.session.headers.update({"User-Agent": "PhysioNet-Python-Client/1.0", "Accept": "application/json"}) 43 | 44 | self.projects = ProjectsAPI(self) 45 | 46 | def _make_request( 47 | self, method: str, endpoint: str, params: Optional[Dict[str, Any]] = None, **kwargs 48 | ) -> requests.Response: 49 | """ 50 | Make HTTP request to API. 51 | 52 | Args: 53 | method: HTTP method (GET, POST, etc.) 54 | endpoint: API endpoint path 55 | params: Query parameters 56 | **kwargs: Additional arguments for requests 57 | 58 | Returns: 59 | Response object 60 | 61 | Raises: 62 | PhysioNetAPIError: On API errors 63 | requests.RequestException: On network errors 64 | """ 65 | url = urljoin(self.api_base, endpoint) 66 | 67 | response = self.session.request(method=method, url=url, params=params, timeout=self.timeout, **kwargs) 68 | 69 | if response.status_code >= 400: 70 | self._handle_error(response) 71 | 72 | return response 73 | 74 | def _handle_error(self, response: requests.Response): 75 | """Handle API error responses.""" 76 | try: 77 | error_data = response.json() 78 | error_msg = error_data.get("error", str(error_data)) 79 | except Exception: 80 | error_msg = response.text or response.reason 81 | 82 | if response.status_code == 400: 83 | raise BadRequestError(error_msg) 84 | elif response.status_code == 403: 85 | raise ForbiddenError(error_msg) 86 | elif response.status_code == 404: 87 | raise NotFoundError(error_msg) 88 | elif response.status_code == 429: 89 | raise RateLimitError(error_msg) 90 | else: 91 | raise PhysioNetAPIError(f"HTTP {response.status_code}: {error_msg}") 92 | 93 | def close(self): 94 | """Close the session.""" 95 | self.session.close() 96 | 97 | def __enter__(self): 98 | return self 99 | 100 | def __exit__(self, exc_type, exc_val, exc_tb): 101 | self.close() 102 | -------------------------------------------------------------------------------- /physionet/api/endpoints.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Iterator 2 | from physionet.api.models import PublishedProject, ProjectVersion, ProjectDetail 3 | 4 | 5 | class ProjectsAPI: 6 | """API methods for interacting with projects.""" 7 | 8 | def __init__(self, client): 9 | self.client = client 10 | 11 | def list_published(self) -> List[PublishedProject]: 12 | """ 13 | List all published projects. 14 | 15 | Returns: 16 | List of PublishedProject objects 17 | 18 | Note: 19 | The API returns all projects in a single response (no pagination). 20 | """ 21 | response = self.client._make_request("GET", "projects/published/") 22 | data = response.json() 23 | 24 | return [PublishedProject.from_dict(p) for p in data] 25 | 26 | def iter_published(self) -> Iterator[PublishedProject]: 27 | """ 28 | Iterator that yields all published projects. 29 | 30 | Yields: 31 | PublishedProject objects 32 | 33 | Note: 34 | This is a convenience method that iterates over list_published() results. 
35 | """ 36 | for project in self.list_published(): 37 | yield project 38 | 39 | def search(self, search_term: str, resource_type: Optional[List[str]] = None) -> List[PublishedProject]: 40 | """ 41 | Search published projects. 42 | 43 | Args: 44 | search_term: Search keywords 45 | resource_type: Filter by resource type(s), or ['all'] for all types 46 | 47 | Returns: 48 | List of matching PublishedProject objects 49 | """ 50 | params = {"search_term": search_term} 51 | 52 | if resource_type: 53 | params["resource_type"] = resource_type 54 | 55 | response = self.client._make_request("GET", "projects/search/", params=params) 56 | data = response.json() 57 | 58 | return [PublishedProject.from_dict(p) for p in data] 59 | 60 | def list_versions(self, project_slug: str) -> List[ProjectVersion]: 61 | """ 62 | List all versions of a project. 63 | 64 | Args: 65 | project_slug: Project identifier 66 | 67 | Returns: 68 | List of ProjectVersion objects 69 | """ 70 | endpoint = f"projects/{project_slug}/versions/" 71 | response = self.client._make_request("GET", endpoint) 72 | data = response.json() 73 | 74 | return [ 75 | ProjectVersion( 76 | slug=v["slug"], 77 | title=v["title"], 78 | version=v["version"], 79 | abstract=v["abstract"], 80 | citation=v["citation"], 81 | ) 82 | for v in data 83 | ] 84 | 85 | def get_details(self, project_slug: str, version: str) -> ProjectDetail: 86 | """ 87 | Get detailed information about a specific project version. 88 | 89 | Args: 90 | project_slug: Project identifier 91 | version: Version number 92 | 93 | Returns: 94 | ProjectDetail object 95 | """ 96 | endpoint = f"projects/{project_slug}/versions/{version}/" 97 | response = self.client._make_request("GET", endpoint) 98 | data = response.json() 99 | 100 | return ProjectDetail.from_dict(data) 101 | 102 | def download_checksums(self, project_slug: str, version: str, output_path: str): 103 | """ 104 | Download SHA256 checksums file for a project. 105 | 106 | Args: 107 | project_slug: Project identifier 108 | version: Version number 109 | output_path: Path to save the checksums file 110 | 111 | Note: 112 | Requires authentication and project access permissions. 113 | """ 114 | endpoint = f"projects/published/{project_slug}/{version}/sha256sums/" 115 | response = self.client._make_request("GET", endpoint) 116 | 117 | with open(output_path, "wb") as f: 118 | f.write(response.content) 119 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PhysioNet 2 | 3 | A collection of tools for working with the [PhysioNet](http://physionet.org/) repository. 
4 | 5 | ## Installation 6 | 7 | ```bash 8 | pip install physionet 9 | ``` 10 | 11 | ## Usage 12 | 13 | ### PhysioNet "Preflight" 14 | 15 | Validate your dataset before submission to PhysioNet: 16 | 17 | ```bash 18 | # Validate a dataset 19 | physionet validate /path/to/dataset 20 | 21 | # Run specific checks only 22 | physionet validate /path/to/dataset --checks filesystem,privacy 23 | 24 | # Disable sampling for complete validation (slower) 25 | physionet validate /path/to/dataset --no-sampling 26 | ``` 27 | 28 | The validator checks for: 29 | 30 | - File naming issues (spaces, special characters, long names) 31 | - Proprietary formats (suggests open alternatives) 32 | - Missing documentation (README.md) 33 | - CSV integrity (structure, encoding, duplicate columns) 34 | - Data quality (missing values, out-of-range data) 35 | - Privacy concerns (PHI patterns, sensitive files) 36 | 37 | A validation report (PHYSIONET_REPORT.md) is automatically saved in your dataset folder. 38 | 39 | ### API Client 40 | 41 | Interact with the PhysioNet API to explore and search published projects: 42 | 43 | ```python 44 | from physionet import PhysioNetClient 45 | 46 | # Create a client instance 47 | client = PhysioNetClient() 48 | 49 | # List all published projects 50 | projects = client.projects.list_published() 51 | print(f"Total projects: {len(projects)}") 52 | 53 | # Display first few projects 54 | for project in projects[:5]: 55 | print(f"{project.slug} v{project.version}: {project.title}") 56 | 57 | # Search for projects 58 | ecg_projects = client.projects.search('ECG') 59 | print(f"Found {len(ecg_projects)} ECG-related projects") 60 | 61 | # Get all versions of a project 62 | versions = client.projects.list_versions('mimic-iv-demo') 63 | for version in versions: 64 | print(f"Version {version.version}: {version.title}") 65 | 66 | # Get detailed information about a specific version 67 | details = client.projects.get_details('mimic-iv-demo', '2.2') 68 | print(f"Title: {details.title}") 69 | print(f"DOI: {details.doi}") 70 | print(f"Published: {details.publish_datetime}") 71 | print(f"Size: {details.main_storage_size} bytes") 72 | ``` 73 | 74 | ### Authenticated Requests 75 | 76 | For endpoints that require authentication (e.g., downloading checksums): 77 | 78 | ```python 79 | from physionet import PhysioNetClient 80 | 81 | # Create client with authentication 82 | client = PhysioNetClient( 83 | username='your_username', 84 | password='your_password' 85 | ) 86 | 87 | # Download checksums file 88 | client.projects.download_checksums( 89 | 'mimic-iv-demo', 90 | '2.2', 91 | 'checksums.txt' 92 | ) 93 | 94 | # Or use environment variables 95 | # Set PHYSIONET_USERNAME and PHYSIONET_PASSWORD 96 | from physionet.api.utils import get_credentials_from_env 97 | 98 | username, password = get_credentials_from_env() 99 | client = PhysioNetClient(username=username, password=password) 100 | ``` 101 | 102 | ### Using Context Manager 103 | 104 | ```python 105 | from physionet import PhysioNetClient 106 | 107 | # Automatically close session when done 108 | with PhysioNetClient() as client: 109 | projects = client.projects.list_published() 110 | print(f"Found {len(projects)} projects") 111 | ``` 112 | 113 | ### Utility Functions 114 | 115 | ```python 116 | from physionet.api.utils import format_size 117 | 118 | # Format bytes to human-readable size 119 | size = format_size(16224447) 120 | print(size) # "15.47 MB" 121 | ``` 122 | 123 | ## Error Handling 124 | 125 | ```python 126 | from physionet import PhysioNetClient 127 | 
from physionet.api.exceptions import NotFoundError, RateLimitError, ForbiddenError 128 | 129 | client = PhysioNetClient() 130 | 131 | try: 132 | details = client.projects.get_details('nonexistent-project', '1.0') 133 | except NotFoundError: 134 | print("Project not found") 135 | except RateLimitError: 136 | print("Rate limit exceeded, please wait before retrying") 137 | except ForbiddenError: 138 | print("Access denied - check credentials or project permissions") 139 | ``` 140 | 141 | ## Contributing 142 | 143 | Contributions are welcome! 144 | 145 | ## License 146 | 147 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 148 | -------------------------------------------------------------------------------- /tests/api/test_models.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from physionet.api.models import ( 3 | ProjectVersion, 4 | PublishedProject, 5 | ProjectDetail, 6 | PaginatedResponse, 7 | ) 8 | 9 | 10 | def test_project_version_creation(): 11 | """Test ProjectVersion dataclass creation.""" 12 | pv = ProjectVersion( 13 | slug="test-project", 14 | title="Test Project", 15 | version="1.0", 16 | abstract="Test abstract", 17 | citation="Test citation", 18 | ) 19 | 20 | assert pv.slug == "test-project" 21 | assert pv.title == "Test Project" 22 | assert pv.version == "1.0" 23 | assert pv.abstract == "Test abstract" 24 | assert pv.citation == "Test citation" 25 | 26 | 27 | def test_published_project_from_dict(): 28 | """Test PublishedProject creation from API response dict.""" 29 | data = { 30 | "slug": "mimic-iv-demo", 31 | "version": "2.2", 32 | "title": "MIMIC-IV Demo", 33 | "short_description": "Demo dataset", 34 | "abstract": "Abstract text", 35 | "core_doi": "10.1234/test", 36 | "version_doi": "10.1234/test.v2.2", 37 | "is_latest_version": True, 38 | "publish_date": "2023-01-01", 39 | "license": {"name": "MIT"}, 40 | "dua": None, 41 | "main_storage_size": 1000000, 42 | "compressed_storage_size": 500000, 43 | } 44 | 45 | project = PublishedProject.from_dict(data) 46 | 47 | assert project.slug == "mimic-iv-demo" 48 | assert project.version == "2.2" 49 | assert project.title == "MIMIC-IV Demo" 50 | assert project.short_description == "Demo dataset" 51 | assert project.core_doi == "10.1234/test" 52 | assert project.is_latest_version is True 53 | assert project.main_storage_size == 1000000 54 | 55 | 56 | def test_published_project_from_dict_with_missing_fields(): 57 | """Test PublishedProject handles missing optional fields.""" 58 | data = { 59 | "slug": "test-project", 60 | "version": "1.0", 61 | "title": "Test", 62 | } 63 | 64 | project = PublishedProject.from_dict(data) 65 | 66 | assert project.slug == "test-project" 67 | assert project.version == "1.0" 68 | assert project.title == "Test" 69 | assert project.short_description == "" 70 | assert project.abstract == "" 71 | assert project.core_doi is None 72 | assert project.is_latest_version is False 73 | assert project.main_storage_size == 0 74 | 75 | 76 | def test_project_detail_from_dict(): 77 | """Test ProjectDetail creation from API response dict.""" 78 | data = { 79 | "slug": "test-project", 80 | "title": "Test Project", 81 | "version": "1.0", 82 | "abstract": "Test abstract", 83 | "license": {"name": "MIT"}, 84 | "short_description": "Short desc", 85 | "project_home_page": "https://example.com", 86 | "publish_datetime": "2023-01-01T00:00:00", 87 | "doi": "10.1234/test", 88 | "main_storage_size": 1000000, 89 | 
"compressed_storage_size": 500000, 90 | } 91 | 92 | detail = ProjectDetail.from_dict(data) 93 | 94 | assert detail.slug == "test-project" 95 | assert detail.title == "Test Project" 96 | assert detail.version == "1.0" 97 | assert detail.doi == "10.1234/test" 98 | assert detail.project_home_page == "https://example.com" 99 | 100 | 101 | def test_project_detail_from_dict_with_missing_fields(): 102 | """Test ProjectDetail handles missing optional fields.""" 103 | data = { 104 | "slug": "test-project", 105 | "title": "Test", 106 | "version": "1.0", 107 | } 108 | 109 | detail = ProjectDetail.from_dict(data) 110 | 111 | assert detail.slug == "test-project" 112 | assert detail.abstract == "" 113 | assert detail.license is None 114 | assert detail.project_home_page is None 115 | assert detail.doi == "" 116 | assert detail.main_storage_size == 0 117 | 118 | 119 | def test_paginated_response_creation(): 120 | """Test PaginatedResponse creation.""" 121 | response = PaginatedResponse( 122 | count=100, 123 | next="https://api.example.com/page2", 124 | previous=None, 125 | results=["item1", "item2", "item3"], 126 | ) 127 | 128 | assert response.count == 100 129 | assert response.next == "https://api.example.com/page2" 130 | assert response.previous is None 131 | assert len(response.results) == 3 132 | -------------------------------------------------------------------------------- /tests/validate/test_cli.py: -------------------------------------------------------------------------------- 1 | """Tests for CLI interface.""" 2 | 3 | import pytest 4 | import json 5 | import subprocess 6 | import sys 7 | from pathlib import Path 8 | 9 | 10 | class TestValidateCLI: 11 | """Tests for the validate CLI command.""" 12 | 13 | def test_cli_validates_directory(self, tmp_path): 14 | """Test that CLI can validate a directory.""" 15 | # Create a minimal dataset 16 | readme = tmp_path / "README.md" 17 | readme.write_text("""# Test Dataset 18 | 19 | ## Background 20 | Test background. 21 | 22 | ## Methods 23 | Test methods. 24 | 25 | ## Data Description 26 | Test data. 27 | 28 | ## Usage Notes 29 | Test usage. 30 | 31 | ## References 32 | Test references. 
33 | """) 34 | 35 | # Run CLI 36 | result = subprocess.run( 37 | [sys.executable, "-m", "physionet", "validate", str(tmp_path)], 38 | capture_output=True, 39 | text=True, 40 | ) 41 | 42 | assert result.returncode == 0 43 | assert "PhysioNet Dataset Validation Report" in result.stdout 44 | 45 | def test_cli_handles_nonexistent_path(self): 46 | """Test that CLI handles nonexistent paths gracefully.""" 47 | result = subprocess.run( 48 | [sys.executable, "-m", "physionet", "validate", "/nonexistent/path"], 49 | capture_output=True, 50 | text=True, 51 | ) 52 | 53 | assert result.returncode == 1 54 | assert "does not exist" in result.stderr 55 | 56 | def test_cli_generates_json_report(self, tmp_path): 57 | """Test that CLI can generate JSON report.""" 58 | # Create dataset 59 | readme = tmp_path / "README.md" 60 | readme.write_text("# Test") 61 | 62 | # Run CLI with --report 63 | report_file = tmp_path / "report.json" 64 | result = subprocess.run( 65 | [sys.executable, "-m", "physionet", "validate", str(tmp_path), "--report", str(report_file)], 66 | capture_output=True, 67 | text=True, 68 | ) 69 | 70 | # Check that report was created 71 | assert report_file.exists() 72 | 73 | # Validate JSON structure 74 | with open(report_file) as f: 75 | report = json.load(f) 76 | 77 | assert "dataset_path" in report 78 | assert "timestamp" in report 79 | assert "summary" in report 80 | assert "checks" in report 81 | 82 | def test_cli_filters_by_check_category(self, tmp_path): 83 | """Test that CLI can filter checks by category.""" 84 | readme = tmp_path / "README.md" 85 | readme.write_text("# Test") 86 | 87 | result = subprocess.run( 88 | [sys.executable, "-m", "physionet", "validate", str(tmp_path), "--checks", "filesystem"], 89 | capture_output=True, 90 | text=True, 91 | ) 92 | 93 | assert result.returncode == 0 94 | # Should only show filesystem checks 95 | assert "Filesystem" in result.stdout or "filesystem" in result.stdout.lower() 96 | 97 | def test_cli_exits_with_error_on_validation_failure(self, tmp_path): 98 | """Test that CLI exits with error code when validation fails.""" 99 | # Create dataset with PHI 100 | csv_file = tmp_path / "data.csv" 101 | csv_file.write_text("patient_id,email\n1,test@example.com\n") 102 | 103 | result = subprocess.run( 104 | [sys.executable, "-m", "physionet", "validate", str(tmp_path)], 105 | capture_output=True, 106 | text=True, 107 | ) 108 | 109 | # Should exit with error code due to validation errors 110 | assert result.returncode == 1 111 | 112 | def test_cli_shows_help(self): 113 | """Test that CLI shows help message.""" 114 | result = subprocess.run( 115 | [sys.executable, "-m", "physionet", "--help"], 116 | capture_output=True, 117 | text=True, 118 | ) 119 | 120 | assert result.returncode == 0 121 | assert "validate" in result.stdout 122 | 123 | def test_validate_subcommand_help(self): 124 | """Test that validate subcommand shows help.""" 125 | result = subprocess.run( 126 | [sys.executable, "-m", "physionet", "validate", "--help"], 127 | capture_output=True, 128 | text=True, 129 | ) 130 | 131 | assert result.returncode == 0 132 | assert "path" in result.stdout 133 | assert "--report" in result.stdout 134 | assert "--checks" in result.stdout 135 | -------------------------------------------------------------------------------- /physionet/cli.py: -------------------------------------------------------------------------------- 1 | """Command-line interface for physionet package.""" 2 | 3 | import argparse 4 | import json 5 | import sys 6 | from pathlib import Path 7 | 
8 | from physionet.validate import validate_dataset, ValidationConfig 9 | 10 | 11 | def main(): 12 | """Main entry point for the CLI.""" 13 | parser = argparse.ArgumentParser( 14 | prog="physionet", 15 | description="Tools for working with PhysioNet datasets", 16 | ) 17 | 18 | subparsers = parser.add_subparsers(dest="command", help="Available commands") 19 | 20 | # Validate subcommand 21 | validate_parser = subparsers.add_parser( 22 | "validate", 23 | help="Validate a dataset before submission to PhysioNet", 24 | ) 25 | validate_parser.add_argument( 26 | "path", 27 | help="Path to the dataset directory to validate", 28 | ) 29 | validate_parser.add_argument( 30 | "--report", 31 | metavar="FILE", 32 | help="Generate detailed JSON report and save to FILE", 33 | ) 34 | validate_parser.add_argument( 35 | "--checks", 36 | metavar="CATEGORIES", 37 | help="Comma-separated list of check categories to run (filesystem,documentation,integrity,quality,privacy)", 38 | ) 39 | validate_parser.add_argument( 40 | "--level", 41 | choices=["error", "warning", "info"], 42 | default="info", 43 | help="Minimum severity level to display (default: info)", 44 | ) 45 | validate_parser.add_argument( 46 | "--no-sampling", 47 | action="store_true", 48 | help="Disable sampling for large files (scan all rows, slower but more thorough)", 49 | ) 50 | validate_parser.add_argument( 51 | "--max-rows", 52 | type=int, 53 | metavar="N", 54 | help="Maximum rows to scan per CSV file (default: 10000)", 55 | ) 56 | 57 | args = parser.parse_args() 58 | 59 | if args.command == "validate": 60 | return _handle_validate(args) 61 | elif args.command is None: 62 | parser.print_help() 63 | return 0 64 | else: 65 | print(f"Unknown command: {args.command}", file=sys.stderr) 66 | return 1 67 | 68 | 69 | def _handle_validate(args): 70 | """Handle the validate subcommand.""" 71 | # Validate path 72 | dataset_path = Path(args.path) 73 | if not dataset_path.exists(): 74 | print(f"Error: Path does not exist: {args.path}", file=sys.stderr) 75 | return 1 76 | 77 | if not dataset_path.is_dir(): 78 | print(f"Error: Path is not a directory: {args.path}", file=sys.stderr) 79 | return 1 80 | 81 | # Configure validation 82 | config = ValidationConfig() 83 | 84 | # Parse check categories if specified 85 | if args.checks: 86 | categories = [c.strip().lower() for c in args.checks.split(",")] 87 | config.check_filesystem = "filesystem" in categories 88 | config.check_documentation = "documentation" in categories 89 | config.check_integrity = "integrity" in categories 90 | config.check_quality = "quality" in categories 91 | config.check_phi = "privacy" in categories 92 | 93 | # Configure sampling options 94 | if args.no_sampling: 95 | config.sample_large_files = False 96 | if args.max_rows: 97 | config.max_rows_to_scan = args.max_rows 98 | 99 | # Run validation 100 | try: 101 | print(f"Validating dataset: {dataset_path}") 102 | result = validate_dataset(str(dataset_path), config, show_progress=True) 103 | print() 104 | 105 | print(result.summary()) 106 | 107 | # Save validation report - either to specified path or default location 108 | if args.report: 109 | report_path = Path(args.report) 110 | # Determine format based on file extension 111 | if report_path.suffix.lower() == '.json': 112 | # Save as JSON 113 | with open(report_path, "w", encoding="utf-8") as f: 114 | json.dump(result.to_dict(), f, indent=2) 115 | else: 116 | # Save as Markdown 117 | with open(report_path, "w", encoding="utf-8") as f: 118 | f.write(result.summary()) 119 | else: 120 | # Default: 
save as Markdown in the root of the dataset folder 121 | report_path = dataset_path / "PHYSIONET_REPORT.md" 122 | with open(report_path, "w", encoding="utf-8") as f: 123 | f.write(result.summary()) 124 | 125 | print() 126 | print(f"Validation report saved to: {report_path}") 127 | 128 | if result.status == "error": 129 | return 1 130 | elif result.status == "warning" and args.level == "error": 131 | return 0 # Warnings don't fail if level is error 132 | return 0 133 | 134 | except Exception as e: 135 | print(f"Error during validation: {str(e)}", file=sys.stderr) 136 | import traceback 137 | traceback.print_exc() 138 | return 1 139 | 140 | 141 | if __name__ == "__main__": 142 | sys.exit(main()) 143 | -------------------------------------------------------------------------------- /physionet/validate/validator.py: -------------------------------------------------------------------------------- 1 | """Main validation logic.""" 2 | 3 | import os 4 | from datetime import datetime, timezone 5 | from pathlib import Path 6 | from typing import Optional 7 | 8 | from tqdm import tqdm 9 | 10 | from physionet.validate.config import ValidationConfig 11 | from physionet.validate.models import ( 12 | ValidationResult, 13 | CheckResult, 14 | ValidationIssue, 15 | CheckCategory, 16 | Severity, 17 | DatasetStats, 18 | ) 19 | from physionet.validate.checks import ( 20 | check_filesystem, 21 | check_documentation, 22 | check_integrity, 23 | check_quality, 24 | check_privacy, 25 | ) 26 | 27 | 28 | def validate_dataset( 29 | dataset_path: str, 30 | config: Optional[ValidationConfig] = None, 31 | show_progress: bool = True 32 | ) -> ValidationResult: 33 | """ 34 | Validate a PhysioNet dataset before submission. 35 | 36 | Args: 37 | dataset_path: Path to the dataset directory 38 | config: Optional validation configuration. If None, uses defaults. 39 | show_progress: Whether to show progress bar. Default True. 
40 | 41 | Returns: 42 | ValidationResult containing all validation issues and statistics 43 | 44 | Raises: 45 | ValueError: If dataset_path doesn't exist or isn't a directory 46 | """ 47 | path = Path(dataset_path) 48 | if not path.exists(): 49 | raise ValueError(f"Dataset path does not exist: {dataset_path}") 50 | if not path.is_dir(): 51 | raise ValueError(f"Dataset path is not a directory: {dataset_path}") 52 | 53 | if config is None: 54 | config = ValidationConfig() 55 | 56 | # Initialize result 57 | result = ValidationResult( 58 | dataset_path=path.name, # Use just the dataset folder name, not full path 59 | timestamp=datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"), 60 | ) 61 | 62 | # Calculate dataset statistics 63 | result.dataset_stats = _calculate_stats(path, config) 64 | 65 | # Determine which checks to run 66 | checks_to_run = [] 67 | if config.check_filesystem: 68 | checks_to_run.append(("Filesystem", CheckCategory.FILESYSTEM, check_filesystem)) 69 | if config.check_documentation: 70 | checks_to_run.append(("Documentation", CheckCategory.DOCUMENTATION, check_documentation)) 71 | if config.check_integrity: 72 | checks_to_run.append(("Integrity", CheckCategory.INTEGRITY, check_integrity)) 73 | if config.check_quality: 74 | checks_to_run.append(("Quality", CheckCategory.QUALITY, check_quality)) 75 | if config.check_phi: 76 | checks_to_run.append(("Privacy", CheckCategory.PRIVACY, check_privacy)) 77 | 78 | # Run validation checks with progress bar 79 | if show_progress: 80 | progress_bar = tqdm( 81 | total=100, 82 | desc="Running validation checks", 83 | unit="%", 84 | leave=False, 85 | ncols=100, 86 | bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt}%" 87 | ) 88 | 89 | steps_per_check = 100 // len(checks_to_run) if checks_to_run else 100 90 | 91 | for i, (name, category, check_func) in enumerate(checks_to_run): 92 | # Create a callback to update progress during this check 93 | def update_progress(msg: str): 94 | progress_bar.set_description(f"{name}: {msg}"[:80]) 95 | 96 | progress_bar.set_description(f"{name}"[:80]) 97 | 98 | # Call check function with progress callback if it supports it 99 | try: 100 | result.check_results[category] = check_func(path, config, progress_callback=update_progress) 101 | except TypeError: 102 | # Function doesn't support progress_callback parameter 103 | result.check_results[category] = check_func(path, config) 104 | 105 | # Update progress 106 | progress_bar.update(steps_per_check) 107 | 108 | progress_bar.close() 109 | else: 110 | for name, category, check_func in checks_to_run: 111 | # Try with progress_callback first, fall back to without 112 | try: 113 | result.check_results[category] = check_func(path, config, progress_callback=None) 114 | except TypeError: 115 | result.check_results[category] = check_func(path, config) 116 | 117 | return result 118 | 119 | 120 | def _calculate_stats(path: Path, config: ValidationConfig) -> DatasetStats: 121 | """Calculate statistics about the dataset.""" 122 | stats = DatasetStats() 123 | 124 | for root, dirs, files in os.walk(path): 125 | # Filter out ignored directories 126 | dirs[:] = [d for d in dirs if not _should_ignore(d, config.ignore_patterns)] 127 | 128 | stats.directory_count += len(dirs) 129 | 130 | for file in files: 131 | if _should_ignore(file, config.ignore_patterns): 132 | continue 133 | 134 | file_path = Path(root) / file 135 | try: 136 | stats.file_count += 1 137 | stats.total_size_bytes += file_path.stat().st_size 138 | except (OSError, PermissionError): 139 | # Skip files we 
can't access 140 | pass 141 | 142 | return stats 143 | 144 | 145 | def _should_ignore(name: str, patterns: list) -> bool: 146 | """Check if a file or directory should be ignored.""" 147 | for pattern in patterns: 148 | if pattern.startswith("*"): 149 | if name.endswith(pattern[1:]): 150 | return True 151 | elif pattern in name: 152 | return True 153 | return False 154 | -------------------------------------------------------------------------------- /tests/api/test_client.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import requests_mock 3 | from physionet.api.client import PhysioNetClient 4 | from physionet.api.exceptions import ( 5 | BadRequestError, 6 | ForbiddenError, 7 | NotFoundError, 8 | RateLimitError, 9 | PhysioNetAPIError, 10 | ) 11 | 12 | 13 | def test_client_initialization(): 14 | """Test client initializes with correct defaults.""" 15 | client = PhysioNetClient() 16 | 17 | assert client.base_url == "https://physionet.org" 18 | assert client.api_base == "https://physionet.org/api/v1/" 19 | assert client.timeout == 30 20 | assert "User-Agent" in client.session.headers 21 | assert client.session.headers["Accept"] == "application/json" 22 | 23 | 24 | def test_client_initialization_with_custom_base_url(): 25 | """Test client with custom base URL.""" 26 | client = PhysioNetClient(base_url="https://test.example.com") 27 | 28 | assert client.base_url == "https://test.example.com" 29 | assert client.api_base == "https://test.example.com/api/v1/" 30 | 31 | 32 | def test_client_initialization_with_trailing_slash(): 33 | """Test that trailing slash is removed from base URL.""" 34 | client = PhysioNetClient(base_url="https://physionet.org/") 35 | 36 | assert client.base_url == "https://physionet.org" 37 | 38 | 39 | def test_client_initialization_with_auth(): 40 | """Test client initializes with authentication.""" 41 | client = PhysioNetClient(username="testuser", password="testpass") 42 | 43 | assert client.session.auth == ("testuser", "testpass") 44 | 45 | 46 | def test_client_initialization_without_auth(): 47 | """Test client initializes without authentication.""" 48 | client = PhysioNetClient() 49 | 50 | assert client.session.auth is None 51 | 52 | 53 | def test_client_has_projects_api(): 54 | """Test client has projects API endpoint.""" 55 | client = PhysioNetClient() 56 | 57 | assert hasattr(client, "projects") 58 | assert client.projects.client is client 59 | 60 | 61 | def test_make_request_success(): 62 | """Test successful API request.""" 63 | client = PhysioNetClient() 64 | 65 | with requests_mock.Mocker() as m: 66 | m.get("https://physionet.org/api/v1/test/", json={"status": "ok"}) 67 | 68 | response = client._make_request("GET", "test/") 69 | 70 | assert response.json() == {"status": "ok"} 71 | 72 | 73 | def test_make_request_with_params(): 74 | """Test API request with query parameters.""" 75 | client = PhysioNetClient() 76 | 77 | with requests_mock.Mocker() as m: 78 | m.get("https://physionet.org/api/v1/test/", json={"status": "ok"}) 79 | 80 | response = client._make_request("GET", "test/", params={"page": 1, "size": 10}) 81 | 82 | assert "page=1" in m.last_request.url 83 | assert "size=10" in m.last_request.url 84 | 85 | 86 | def test_error_handling_400(): 87 | """Test 400 Bad Request error handling.""" 88 | client = PhysioNetClient() 89 | 90 | with requests_mock.Mocker() as m: 91 | m.get("https://physionet.org/api/v1/test/", status_code=400, json={"error": "Bad request"}) 92 | 93 | with 
pytest.raises(BadRequestError) as exc_info: 94 | client._make_request("GET", "test/") 95 | 96 | assert "Bad request" in str(exc_info.value) 97 | 98 | 99 | def test_error_handling_403(): 100 | """Test 403 Forbidden error handling.""" 101 | client = PhysioNetClient() 102 | 103 | with requests_mock.Mocker() as m: 104 | m.get("https://physionet.org/api/v1/test/", status_code=403, json={"error": "Forbidden"}) 105 | 106 | with pytest.raises(ForbiddenError) as exc_info: 107 | client._make_request("GET", "test/") 108 | 109 | assert "Forbidden" in str(exc_info.value) 110 | 111 | 112 | def test_error_handling_404(): 113 | """Test 404 Not Found error handling.""" 114 | client = PhysioNetClient() 115 | 116 | with requests_mock.Mocker() as m: 117 | m.get("https://physionet.org/api/v1/test/", status_code=404, json={"error": "Not found"}) 118 | 119 | with pytest.raises(NotFoundError) as exc_info: 120 | client._make_request("GET", "test/") 121 | 122 | assert "Not found" in str(exc_info.value) 123 | 124 | 125 | def test_error_handling_429(): 126 | """Test 429 Rate Limit error handling.""" 127 | client = PhysioNetClient() 128 | 129 | with requests_mock.Mocker() as m: 130 | m.get("https://physionet.org/api/v1/test/", status_code=429, json={"error": "Rate limit exceeded"}) 131 | 132 | with pytest.raises(RateLimitError) as exc_info: 133 | client._make_request("GET", "test/") 134 | 135 | assert "Rate limit exceeded" in str(exc_info.value) 136 | 137 | 138 | def test_error_handling_500(): 139 | """Test 500 Server Error handling.""" 140 | client = PhysioNetClient() 141 | 142 | with requests_mock.Mocker() as m: 143 | m.get("https://physionet.org/api/v1/test/", status_code=500, json={"error": "Server error"}) 144 | 145 | with pytest.raises(PhysioNetAPIError) as exc_info: 146 | client._make_request("GET", "test/") 147 | 148 | assert "HTTP 500" in str(exc_info.value) 149 | 150 | 151 | def test_error_handling_non_json_response(): 152 | """Test error handling with non-JSON error response.""" 153 | client = PhysioNetClient() 154 | 155 | with requests_mock.Mocker() as m: 156 | m.get("https://physionet.org/api/v1/test/", status_code=500, text="Internal Server Error") 157 | 158 | with pytest.raises(PhysioNetAPIError) as exc_info: 159 | client._make_request("GET", "test/") 160 | 161 | assert "Internal Server Error" in str(exc_info.value) 162 | 163 | 164 | def test_context_manager(): 165 | """Test client works as context manager.""" 166 | with requests_mock.Mocker() as m: 167 | m.get("https://physionet.org/api/v1/test/", json={"status": "ok"}) 168 | 169 | with PhysioNetClient() as client: 170 | assert client.session is not None 171 | response = client._make_request("GET", "test/") 172 | assert response.json() == {"status": "ok"} 173 | 174 | 175 | def test_close_method(): 176 | """Test close method closes session.""" 177 | client = PhysioNetClient() 178 | assert client.session is not None 179 | client.close() 180 | -------------------------------------------------------------------------------- /physionet/validate/checks/quality.py: -------------------------------------------------------------------------------- 1 | """Data quality validation checks.""" 2 | 3 | import csv 4 | from pathlib import Path 5 | from typing import Optional, Callable 6 | 7 | from physionet.validate.models import CheckResult, ValidationIssue, CheckCategory, Severity 8 | from physionet.validate.config import ValidationConfig 9 | 10 | 11 | def check_quality(path: Path, config: ValidationConfig, progress_callback: Optional[Callable[[str], None]] = None) -> 
CheckResult: 12 | """ 13 | Check data quality. 14 | 15 | Validates: 16 | - Missing value thresholds 17 | - Value range plausibility 18 | - Data type consistency 19 | 20 | Args: 21 | path: Path to dataset directory 22 | config: Validation configuration 23 | progress_callback: Optional callback to report progress 24 | 25 | Returns: 26 | CheckResult with any quality issues found 27 | """ 28 | result = CheckResult(category=CheckCategory.QUALITY) 29 | 30 | # Find and validate CSV files 31 | csv_files = list(path.rglob("*.csv")) 32 | for i, csv_file in enumerate(csv_files): 33 | if progress_callback: 34 | progress_callback(f"Checking {csv_file.name} ({i+1}/{len(csv_files)} CSV files)") 35 | 36 | if any(p in str(csv_file) for p in config.ignore_patterns): 37 | continue 38 | 39 | _check_csv_quality(csv_file, path, config, result) 40 | 41 | return result 42 | 43 | 44 | def _check_csv_quality(csv_file: Path, base_path: Path, config: ValidationConfig, result: CheckResult) -> None: 45 | """Check quality metrics for a CSV file.""" 46 | try: 47 | with open(csv_file, "r", encoding="utf-8") as f: 48 | reader = csv.DictReader(f) 49 | 50 | # Track column statistics 51 | column_stats = {col: {"total": 0, "missing": 0, "values": []} for col in reader.fieldnames or []} 52 | 53 | # Determine if we should sample this file 54 | rows_scanned = 0 55 | max_rows = config.max_rows_to_scan 56 | 57 | # Sample if enabled and file is large 58 | if config.sample_large_files and max_rows: 59 | all_rows = list(reader) 60 | total_rows = len(all_rows) 61 | 62 | if total_rows > max_rows: 63 | # Sample evenly distributed rows 64 | # (deterministic: evenly spaced indices, so results 65 | # are reproducible across runs without an RNG) 66 | step = total_rows / max_rows 67 | sampled_indices = [int(i * step) for i in range(max_rows)] 68 | rows_to_scan = [all_rows[i] for i in sampled_indices] 69 | else: 70 | rows_to_scan = all_rows 71 | else: 72 | rows_to_scan = reader 73 | 74 | for row in rows_to_scan: 75 | # Stop if we've hit the limit (when not sampling) 76 | if max_rows and not config.sample_large_files and rows_scanned >= max_rows: 77 | break 78 | rows_scanned += 1 79 | 80 | for col, value in row.items(): 81 | column_stats[col]["total"] += 1 82 | 83 | # Check for missing values 84 | if not value or value.strip() in ("", "NA", "N/A", "NULL", "null", "None", "NaN"): 85 | column_stats[col]["missing"] += 1 86 | else: 87 | # Store value for range checking if configured 88 | if col.lower().replace("_", " ") in [k.lower().replace("_", " ") for k in config.value_ranges]: 89 | try: 90 | numeric_value = float(value.strip()) 91 | column_stats[col]["values"].append(numeric_value) 92 | except ValueError: 93 | pass 94 | 95 | # Analyze results 96 | for col, stats in column_stats.items(): 97 | if stats["total"] == 0: 98 | continue 99 | 100 | # Check missing value threshold 101 | missing_ratio = stats["missing"] / stats["total"] 102 | if missing_ratio >= config.missing_value_threshold: 103 | result.issues.append( 104 | ValidationIssue( 105 | severity=Severity.WARNING, 106 | category=CheckCategory.QUALITY, 107 | file=str(csv_file.relative_to(base_path)), 108 | column=col, 109 | message=f"Column '{col}' has {missing_ratio:.0%} missing values", 110 | suggestion=f"Consider removing column '{col}' or documenting why its values are missing", 111 | ) 112 | ) 113 | 114 | # Check value ranges 115 | for range_key, (min_val, max_val) in config.value_ranges.items(): 116 | if col.lower().replace("_", " ") == range_key.lower().replace("_", " "): 117 | for value in stats["values"]: 118 | if value < min_val or value >
max_val: 119 | result.issues.append( 120 | ValidationIssue( 121 | severity=Severity.WARNING, 122 | category=CheckCategory.QUALITY, 123 | file=str(csv_file.relative_to(base_path)), 124 | column=col, 125 | value=str(value), 126 | message=f"Value {value} in '{col}' outside expected range [{min_val}, {max_val}]", 127 | suggestion="Verify data accuracy or adjust validation ranges", 128 | ) 129 | ) 130 | # Limit warnings per column 131 | break 132 | 133 | except Exception as e: 134 | result.issues.append( 135 | ValidationIssue( 136 | severity=Severity.WARNING, 137 | category=CheckCategory.QUALITY, 138 | file=str(csv_file.relative_to(base_path)), 139 | message=f"Could not perform quality checks: {str(e)}", 140 | ) 141 | ) 142 | -------------------------------------------------------------------------------- /physionet/validate/checks/integrity.py: -------------------------------------------------------------------------------- 1 | """Data integrity validation checks.""" 2 | 3 | import csv 4 | from pathlib import Path 5 | from typing import Optional, Callable 6 | 7 | from physionet.validate.models import CheckResult, ValidationIssue, CheckCategory, Severity 8 | from physionet.validate.config import ValidationConfig 9 | 10 | 11 | def check_integrity(path: Path, config: ValidationConfig, progress_callback: Optional[Callable[[str], None]] = None) -> CheckResult: 12 | """ 13 | Check data integrity and format validation. 14 | 15 | Validates: 16 | - CSV file structure 17 | - File format validity 18 | - Basic structural consistency 19 | 20 | Args: 21 | path: Path to dataset directory 22 | config: Validation configuration 23 | progress_callback: Optional callback to report progress 24 | 25 | Returns: 26 | CheckResult with any integrity issues found 27 | """ 28 | result = CheckResult(category=CheckCategory.INTEGRITY) 29 | 30 | # Find and validate CSV files 31 | csv_files = list(path.rglob("*.csv")) 32 | for i, csv_file in enumerate(csv_files): 33 | if progress_callback: 34 | progress_callback(f"Checking {csv_file.name} ({i+1}/{len(csv_files)} CSV files)") 35 | 36 | if any(p in str(csv_file) for p in config.ignore_patterns): 37 | continue 38 | 39 | _validate_csv_structure(csv_file, path, result) 40 | 41 | return result 42 | 43 | 44 | def _validate_csv_structure(csv_file: Path, base_path: Path, result: CheckResult) -> None: 45 | """Validate CSV file structure.""" 46 | try: 47 | with open(csv_file, "r", encoding="utf-8") as f: 48 | # Try to detect dialect 49 | sample = f.read(1024) 50 | f.seek(0) 51 | 52 | try: 53 | dialect = csv.Sniffer().sniff(sample) 54 | except csv.Error: 55 | # Use default dialect if detection fails 56 | dialect = csv.excel 57 | 58 | reader = csv.reader(f, dialect) 59 | 60 | # Read header 61 | try: 62 | header = next(reader) 63 | except StopIteration: 64 | result.issues.append( 65 | ValidationIssue( 66 | severity=Severity.ERROR, 67 | category=CheckCategory.INTEGRITY, 68 | file=str(csv_file.relative_to(base_path)), 69 | message="CSV file is empty", 70 | ) 71 | ) 72 | return 73 | 74 | if not header: 75 | result.issues.append( 76 | ValidationIssue( 77 | severity=Severity.ERROR, 78 | category=CheckCategory.INTEGRITY, 79 | file=str(csv_file.relative_to(base_path)), 80 | message="CSV file has no header row", 81 | ) 82 | ) 83 | return 84 | 85 | # Check for duplicate column names 86 | if len(header) != len(set(header)): 87 | duplicates = [col for col in header if header.count(col) > 1] 88 | result.issues.append( 89 | ValidationIssue( 90 | severity=Severity.ERROR, 91 | 
category=CheckCategory.INTEGRITY, 92 | file=str(csv_file.relative_to(base_path)), 93 | message=f"Duplicate column names found: {', '.join(set(duplicates))}", 94 | ) 95 | ) 96 | 97 | # Check for empty column names 98 | if any(not col.strip() for col in header): 99 | result.issues.append( 100 | ValidationIssue( 101 | severity=Severity.ERROR, 102 | category=CheckCategory.INTEGRITY, 103 | file=str(csv_file.relative_to(base_path)), 104 | message="CSV contains empty column names", 105 | ) 106 | ) 107 | 108 | # Validate row consistency 109 | expected_cols = len(header) 110 | row_count = 0 111 | for line_num, row in enumerate(reader, start=2): # Start at 2 (after header) 112 | row_count += 1 113 | if len(row) != expected_cols: 114 | result.issues.append( 115 | ValidationIssue( 116 | severity=Severity.ERROR, 117 | category=CheckCategory.INTEGRITY, 118 | file=str(csv_file.relative_to(base_path)), 119 | line=line_num, 120 | message=f"Row has {len(row)} columns, expected {expected_cols}", 121 | ) 122 | ) 123 | # Only report first few inconsistencies to avoid spam 124 | if len([i for i in result.issues if i.file == str(csv_file.relative_to(base_path))]) >= 5: 125 | result.issues.append( 126 | ValidationIssue( 127 | severity=Severity.INFO, 128 | category=CheckCategory.INTEGRITY, 129 | file=str(csv_file.relative_to(base_path)), 130 | message=f"Additional row inconsistencies may exist (showing first 5)", 131 | ) 132 | ) 133 | break 134 | 135 | if row_count == 0: 136 | result.issues.append( 137 | ValidationIssue( 138 | severity=Severity.WARNING, 139 | category=CheckCategory.INTEGRITY, 140 | file=str(csv_file.relative_to(base_path)), 141 | message="CSV file contains only header row (no data)", 142 | ) 143 | ) 144 | 145 | except UnicodeDecodeError: 146 | result.issues.append( 147 | ValidationIssue( 148 | severity=Severity.ERROR, 149 | category=CheckCategory.INTEGRITY, 150 | file=str(csv_file.relative_to(base_path)), 151 | message="CSV file has encoding issues (not valid UTF-8)", 152 | suggestion="Convert file to UTF-8 encoding", 153 | ) 154 | ) 155 | except Exception as e: 156 | result.issues.append( 157 | ValidationIssue( 158 | severity=Severity.WARNING, 159 | category=CheckCategory.INTEGRITY, 160 | file=str(csv_file.relative_to(base_path)), 161 | message=f"Could not validate CSV file: {str(e)}", 162 | ) 163 | ) 164 | -------------------------------------------------------------------------------- /tests/api/test_endpoints.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import requests_mock 3 | from physionet.api.client import PhysioNetClient 4 | from physionet.api.models import PublishedProject, ProjectVersion, ProjectDetail 5 | 6 | 7 | @pytest.fixture 8 | def client(): 9 | """Fixture providing a PhysioNetClient instance.""" 10 | return PhysioNetClient() 11 | 12 | 13 | def test_list_published_basic(client): 14 | """Test listing published projects.""" 15 | mock_response = [ 16 | { 17 | "slug": "project-1", 18 | "version": "1.0", 19 | "title": "Project 1", 20 | "short_description": "Description 1", 21 | "abstract": "Abstract 1", 22 | "core_doi": "10.1234/p1", 23 | "version_doi": "10.1234/p1.v1", 24 | "is_latest_version": True, 25 | "publish_date": "2023-01-01", 26 | "license": {"name": "MIT"}, 27 | "dua": None, 28 | "main_storage_size": 1000, 29 | "compressed_storage_size": 500, 30 | }, 31 | { 32 | "slug": "project-2", 33 | "version": "2.0", 34 | "title": "Project 2", 35 | "short_description": "Description 2", 36 | "abstract": "Abstract 2", 37 | 
"core_doi": "10.1234/p2", 38 | "version_doi": "10.1234/p2.v2", 39 | "is_latest_version": True, 40 | "publish_date": "2023-02-01", 41 | "license": {"name": "GPL"}, 42 | "dua": None, 43 | "main_storage_size": 2000, 44 | "compressed_storage_size": 1000, 45 | }, 46 | ] 47 | 48 | with requests_mock.Mocker() as m: 49 | m.get("https://physionet.org/api/v1/projects/published/", json=mock_response) 50 | 51 | result = client.projects.list_published() 52 | 53 | assert len(result) == 2 54 | assert isinstance(result[0], PublishedProject) 55 | assert result[0].slug == "project-1" 56 | assert result[1].slug == "project-2" 57 | 58 | 59 | def test_iter_published(client): 60 | """Test iterating through published projects.""" 61 | mock_response = [ 62 | { 63 | "slug": "project-1", 64 | "version": "1.0", 65 | "title": "Project 1", 66 | "short_description": "", 67 | "abstract": "", 68 | "core_doi": None, 69 | "version_doi": None, 70 | "is_latest_version": True, 71 | "publish_date": "", 72 | "license": None, 73 | "dua": None, 74 | "main_storage_size": 0, 75 | "compressed_storage_size": 0, 76 | }, 77 | { 78 | "slug": "project-2", 79 | "version": "1.0", 80 | "title": "Project 2", 81 | "short_description": "", 82 | "abstract": "", 83 | "core_doi": None, 84 | "version_doi": None, 85 | "is_latest_version": True, 86 | "publish_date": "", 87 | "license": None, 88 | "dua": None, 89 | "main_storage_size": 0, 90 | "compressed_storage_size": 0, 91 | }, 92 | ] 93 | 94 | with requests_mock.Mocker() as m: 95 | m.get("https://physionet.org/api/v1/projects/published/", json=mock_response) 96 | 97 | projects = list(client.projects.iter_published()) 98 | 99 | assert len(projects) == 2 100 | assert projects[0].slug == "project-1" 101 | assert projects[1].slug == "project-2" 102 | 103 | 104 | def test_search_projects(client): 105 | """Test searching for projects.""" 106 | mock_response = [ 107 | { 108 | "slug": "ecg-project", 109 | "version": "1.0", 110 | "title": "ECG Database", 111 | "short_description": "ECG data", 112 | "abstract": "ECG abstract", 113 | "core_doi": None, 114 | "version_doi": None, 115 | "is_latest_version": True, 116 | "publish_date": "", 117 | "license": None, 118 | "dua": None, 119 | "main_storage_size": 0, 120 | "compressed_storage_size": 0, 121 | } 122 | ] 123 | 124 | with requests_mock.Mocker() as m: 125 | m.get("https://physionet.org/api/v1/projects/search/", json=mock_response) 126 | 127 | results = client.projects.search(search_term="ECG", resource_type=["all"]) 128 | 129 | assert "search_term=ECG" in m.last_request.url 130 | assert len(results) == 1 131 | assert isinstance(results[0], PublishedProject) 132 | assert results[0].slug == "ecg-project" 133 | 134 | 135 | def test_list_versions(client): 136 | """Test listing all versions of a project.""" 137 | mock_response = [ 138 | { 139 | "slug": "test-project", 140 | "title": "Test Project", 141 | "version": "1.0", 142 | "abstract": "Version 1.0", 143 | "citation": "Citation v1.0", 144 | }, 145 | { 146 | "slug": "test-project", 147 | "title": "Test Project", 148 | "version": "2.0", 149 | "abstract": "Version 2.0", 150 | "citation": "Citation v2.0", 151 | }, 152 | ] 153 | 154 | with requests_mock.Mocker() as m: 155 | m.get("https://physionet.org/api/v1/projects/test-project/versions/", json=mock_response) 156 | 157 | versions = client.projects.list_versions("test-project") 158 | 159 | assert len(versions) == 2 160 | assert isinstance(versions[0], ProjectVersion) 161 | assert versions[0].version == "1.0" 162 | assert versions[1].version == "2.0" 163 | 164 
| 165 | def test_get_details(client): 166 | """Test getting project details.""" 167 | mock_response = { 168 | "slug": "test-project", 169 | "title": "Test Project", 170 | "version": "1.0", 171 | "abstract": "Test abstract", 172 | "license": {"name": "MIT"}, 173 | "short_description": "Short desc", 174 | "project_home_page": "https://example.com", 175 | "publish_datetime": "2023-01-01T00:00:00", 176 | "doi": "10.1234/test", 177 | "main_storage_size": 1000, 178 | "compressed_storage_size": 500, 179 | } 180 | 181 | with requests_mock.Mocker() as m: 182 | m.get("https://physionet.org/api/v1/projects/test-project/versions/1.0/", json=mock_response) 183 | 184 | detail = client.projects.get_details("test-project", "1.0") 185 | 186 | assert isinstance(detail, ProjectDetail) 187 | assert detail.slug == "test-project" 188 | assert detail.version == "1.0" 189 | assert detail.doi == "10.1234/test" 190 | 191 | 192 | def test_download_checksums(client, tmp_path): 193 | """Test downloading checksums file.""" 194 | checksum_content = b"abc123 file1.txt\ndef456 file2.txt\n" 195 | output_file = tmp_path / "checksums.txt" 196 | 197 | with requests_mock.Mocker() as m: 198 | m.get( 199 | "https://physionet.org/api/v1/projects/published/test-project/1.0/sha256sums/", content=checksum_content 200 | ) 201 | 202 | client.projects.download_checksums("test-project", "1.0", str(output_file)) 203 | 204 | assert output_file.exists() 205 | assert output_file.read_bytes() == checksum_content 206 | -------------------------------------------------------------------------------- /tests/validate/test_validator.py: -------------------------------------------------------------------------------- 1 | """Tests for main validation functionality.""" 2 | 3 | import pytest 4 | import tempfile 5 | from pathlib import Path 6 | 7 | from physionet.validate import validate_dataset, ValidationConfig 8 | from physionet.validate.models import Severity, CheckCategory 9 | 10 | 11 | class TestValidateDataset: 12 | """Tests for validate_dataset function.""" 13 | 14 | def test_nonexistent_path_raises_error(self): 15 | """Test that validating a nonexistent path raises ValueError.""" 16 | with pytest.raises(ValueError, match="does not exist"): 17 | validate_dataset("/nonexistent/path") 18 | 19 | def test_file_instead_of_directory_raises_error(self, tmp_path): 20 | """Test that validating a file instead of directory raises ValueError.""" 21 | test_file = tmp_path / "test.txt" 22 | test_file.write_text("test") 23 | 24 | with pytest.raises(ValueError, match="not a directory"): 25 | validate_dataset(str(test_file)) 26 | 27 | def test_empty_directory_validation(self, tmp_path): 28 | """Test validation of an empty directory.""" 29 | result = validate_dataset(str(tmp_path)) 30 | 31 | assert result.dataset_path == tmp_path.name 32 | assert result.timestamp is not None 33 | assert result.dataset_stats.file_count == 0 34 | assert result.dataset_stats.total_size_bytes == 0 35 | 36 | # Should have error for missing README.md 37 | assert result.total_errors == 1 38 | assert any("README.md" in str(issue.message) for issue in result.check_results[CheckCategory.DOCUMENTATION].issues) 39 | 40 | def test_minimal_valid_dataset(self, tmp_path): 41 | """Test validation of a minimal valid dataset.""" 42 | # Create README and a simple CSV file 43 | (tmp_path / "README.md").write_text("# Test Dataset") 44 | csv_file = tmp_path / "data.csv" 45 | csv_file.write_text("id,value\n1,100\n2,200\n") 46 | 47 | result = validate_dataset(str(tmp_path)) 48 | 49 | assert 
result.dataset_stats.file_count == 2 50 | assert result.total_errors == 0 51 | 52 | def test_validation_with_custom_config(self, tmp_path): 53 | """Test validation with custom configuration.""" 54 | # Create a dataset with custom requirements 55 | readme = tmp_path / "README.md" 56 | readme.write_text("# Test") 57 | 58 | config = ValidationConfig( 59 | check_filesystem=True, 60 | check_documentation=False, # Disable documentation checks 61 | check_integrity=False, 62 | check_quality=False, 63 | check_phi=False, 64 | ) 65 | 66 | result = validate_dataset(str(tmp_path), config) 67 | 68 | # Should only have filesystem checks 69 | assert CheckCategory.FILESYSTEM in result.check_results 70 | assert CheckCategory.DOCUMENTATION not in result.check_results 71 | 72 | def test_validation_without_progress_bar(self, tmp_path): 73 | """Test validation with progress bar disabled.""" 74 | readme = tmp_path / "README.md" 75 | readme.write_text("# Test") 76 | 77 | # Should not raise any errors with show_progress=False 78 | result = validate_dataset(str(tmp_path), show_progress=False) 79 | assert result.total_errors == 0 80 | 81 | 82 | class TestValidationStats: 83 | """Tests for dataset statistics calculation.""" 84 | 85 | def test_calculates_file_count(self, tmp_path): 86 | """Test that file count is calculated correctly.""" 87 | (tmp_path / "README.md").write_text("# Test") 88 | (tmp_path / "data.csv").write_text("col1,col2\n1,2\n") 89 | (tmp_path / "subdir").mkdir() 90 | (tmp_path / "subdir" / "data2.csv").write_text("col1\n1\n") 91 | 92 | result = validate_dataset(str(tmp_path)) 93 | 94 | assert result.dataset_stats.file_count == 3 95 | assert result.dataset_stats.directory_count == 1 96 | 97 | def test_calculates_total_size(self, tmp_path): 98 | """Test that total size is calculated correctly.""" 99 | content = "x" * 1000 100 | (tmp_path / "README.md").write_text(content) 101 | 102 | result = validate_dataset(str(tmp_path)) 103 | 104 | assert result.dataset_stats.total_size_bytes >= 1000 105 | 106 | def test_ignores_specified_patterns(self, tmp_path): 107 | """Test that ignored patterns are not counted in stats.""" 108 | (tmp_path / "README.md").write_text("# Test") 109 | (tmp_path / ".git").mkdir() 110 | (tmp_path / ".git" / "config").write_text("test") 111 | 112 | result = validate_dataset(str(tmp_path)) 113 | 114 | # .git directory and its contents should be ignored 115 | assert result.dataset_stats.file_count == 1 116 | 117 | 118 | class TestValidationResult: 119 | """Tests for ValidationResult model.""" 120 | 121 | def test_summary_format(self, tmp_path): 122 | """Test that summary is properly formatted.""" 123 | (tmp_path / "README.md").write_text("# Test") 124 | 125 | result = validate_dataset(str(tmp_path)) 126 | summary = result.summary() 127 | 128 | assert "PhysioNet Dataset Validation Report" in summary 129 | assert tmp_path.name in summary 130 | assert "Summary:" in summary 131 | assert "Metadata:" in summary 132 | assert "Validation Results:" in summary 133 | 134 | def test_to_dict_format(self, tmp_path): 135 | """Test that to_dict produces valid structure.""" 136 | (tmp_path / "README.md").write_text("# Test") 137 | 138 | result = validate_dataset(str(tmp_path)) 139 | result_dict = result.to_dict() 140 | 141 | assert "dataset_path" in result_dict 142 | assert "timestamp" in result_dict 143 | assert "dataset_stats" in result_dict 144 | assert "summary" in result_dict 145 | assert "checks" in result_dict 146 | 147 | assert result_dict["summary"]["total_errors"] == result.total_errors 148 | 
assert result_dict["summary"]["total_warnings"] == result.total_warnings 149 | 150 | def test_recommendations_section(self, tmp_path): 151 | """Test that recommendations section is included when there are issues.""" 152 | # Create files with issues to trigger recommendations 153 | (tmp_path / "file with spaces.csv").write_text("col1,col2\n1,2\n") 154 | (tmp_path / ".env").write_text("API_KEY=secret") 155 | (tmp_path / "empty.txt").write_text("") 156 | 157 | result = validate_dataset(str(tmp_path)) 158 | summary = result.summary() 159 | 160 | # Should include recommendations section 161 | assert "Recommendations:" in summary 162 | assert "Replace spaces with underscores or hyphens" in summary 163 | assert "Remove" in summary # Various remove recommendations 164 | 165 | def test_large_dataset_recommendation(self, tmp_path): 166 | """Test that large datasets get upload assistance recommendation.""" 167 | # Create README to avoid documentation errors 168 | (tmp_path / "README.md").write_text("# Large Dataset") 169 | 170 | # Create a large file (simulated - we'll modify the stats) 171 | (tmp_path / "data.csv").write_text("col1,col2\n1,2\n") 172 | 173 | result = validate_dataset(str(tmp_path)) 174 | 175 | # Manually set large size for testing (>200GB) 176 | result.dataset_stats.total_size_bytes = 250 * 1024 ** 3 # 250 GB 177 | 178 | summary = result.summary() 179 | 180 | # Should include contact recommendation for large datasets 181 | assert "contact@physionet.org" in summary 182 | assert "very large" in summary.lower() 183 | assert "250" in summary # Should show the size 184 | -------------------------------------------------------------------------------- /physionet/validate/checks/filesystem.py: -------------------------------------------------------------------------------- 1 | """File system validation checks.""" 2 | 3 | import os 4 | from pathlib import Path 5 | 6 | from physionet.validate.models import CheckResult, ValidationIssue, CheckCategory, Severity 7 | from physionet.validate.config import ValidationConfig 8 | 9 | # Proprietary file formats and their recommended open alternatives 10 | PROPRIETARY_FORMATS = { 11 | '.mat': 'MATLAB format; consider .csv, .zarr, .parquet, or .npy instead', 12 | '.sas7bdat': 'SAS format; consider .csv or .parquet instead', 13 | '.dta': 'Stata format; consider .csv or .parquet instead', 14 | '.sav': 'SPSS format; consider .csv or .parquet instead', 15 | '.xlsx': 'Excel format; consider .csv instead', 16 | '.xls': 'Excel format; consider .csv instead', 17 | '.rds': 'R binary format; consider .csv or .parquet instead', 18 | '.rdata': 'R binary format; consider .csv or .parquet instead', 19 | '.ppt': 'PowerPoint format; consider .pdf instead', 20 | '.pptx': 'PowerPoint format; consider .pdf instead', 21 | } 22 | 23 | 24 | def check_filesystem(path: Path, config: ValidationConfig) -> CheckResult: 25 | """ 26 | Check file system organization and structure. 
27 | 28 | Validates: 29 | - File naming conventions 30 | - Presence of version control artifacts 31 | - File sizes 32 | - Small file count 33 | 34 | Args: 35 | path: Path to dataset directory 36 | config: Validation configuration 37 | 38 | Returns: 39 | CheckResult with any filesystem issues found 40 | """ 41 | result = CheckResult(category=CheckCategory.FILESYSTEM) 42 | 43 | # Check for version control artifacts 44 | for pattern in [".git", ".svn", ".hg", "__pycache__", ".pytest_cache"]: 45 | found_paths = list(path.rglob(pattern)) 46 | if found_paths: 47 | result.issues.append( 48 | ValidationIssue( 49 | severity=Severity.WARNING, 50 | category=CheckCategory.FILESYSTEM, 51 | message=f"Found version control/build artifacts: {pattern}", 52 | suggestion=f"Remove {pattern} directories before submission", 53 | ) 54 | ) 55 | 56 | # Check for hidden and temp files 57 | for root, dirs, files in os.walk(path): 58 | # Filter ignored directories 59 | dirs[:] = [d for d in dirs if not any(p in d for p in config.ignore_patterns)] 60 | 61 | for file in files: 62 | file_path = Path(root) / file 63 | 64 | # Skip ignored files 65 | if any(p in file for p in config.ignore_patterns): 66 | continue 67 | 68 | # Check for hidden files (starting with .) 69 | if file.startswith(".") and file not in [".gitignore", ".gitattributes"]: 70 | result.issues.append( 71 | ValidationIssue( 72 | severity=Severity.WARNING, 73 | category=CheckCategory.FILESYSTEM, 74 | file=str(file_path.relative_to(path)), 75 | message=f"Hidden file found: {file}", 76 | suggestion="Remove hidden files before submission", 77 | ) 78 | ) 79 | 80 | # Check for temp files 81 | if file.endswith(("~", ".tmp", ".bak", ".swp")): 82 | result.issues.append( 83 | ValidationIssue( 84 | severity=Severity.WARNING, 85 | category=CheckCategory.FILESYSTEM, 86 | file=str(file_path.relative_to(path)), 87 | message=f"Temporary file found: {file}", 88 | suggestion="Remove temporary files before submission", 89 | ) 90 | ) 91 | 92 | # Check file size 93 | try: 94 | size = file_path.stat().st_size 95 | if size == 0: 96 | result.issues.append( 97 | ValidationIssue( 98 | severity=Severity.WARNING, 99 | category=CheckCategory.FILESYSTEM, 100 | file=str(file_path.relative_to(path)), 101 | message="Empty file (0 bytes)", 102 | suggestion="Remove empty files or add content", 103 | ) 104 | ) 105 | elif config.max_file_size_bytes and size > config.max_file_size_bytes: 106 | result.issues.append( 107 | ValidationIssue( 108 | severity=Severity.INFO, 109 | category=CheckCategory.FILESYSTEM, 110 | file=str(file_path.relative_to(path)), 111 | message=f"Large file: {_format_size(size)}", 112 | suggestion="Consider splitting or compressing large files", 113 | ) 114 | ) 115 | except (OSError, PermissionError): 116 | pass 117 | 118 | # Check for excessively long filenames 119 | # Most filesystems support 255 characters, but recommend shorter for compatibility 120 | if len(file) > 255: 121 | result.issues.append( 122 | ValidationIssue( 123 | severity=Severity.ERROR, 124 | category=CheckCategory.FILESYSTEM, 125 | file=str(file_path.relative_to(path)), 126 | message=f"Filename exceeds maximum length ({len(file)} characters): {file[:50]}...", 127 | suggestion="Shorten filename to 255 characters or less", 128 | ) 129 | ) 130 | elif len(file) > 100: 131 | result.issues.append( 132 | ValidationIssue( 133 | severity=Severity.WARNING, 134 | category=CheckCategory.FILESYSTEM, 135 | file=str(file_path.relative_to(path)), 136 | message=f"Filename is very long ({len(file)} characters): 
{file[:50]}...", 137 | suggestion="Consider shortening filename for better compatibility (recommended: under 100 characters)", 138 | ) 139 | ) 140 | 141 | # Check for spaces in filename 142 | if " " in file: 143 | result.issues.append( 144 | ValidationIssue( 145 | severity=Severity.WARNING, 146 | category=CheckCategory.FILESYSTEM, 147 | file=str(file_path.relative_to(path)), 148 | message=f"Filename contains spaces: {file}", 149 | suggestion="Replace spaces with underscores or hyphens", 150 | ) 151 | ) 152 | 153 | # Check for invalid/awkward characters in filename 154 | # Include path separators, quotes, and other problematic characters 155 | invalid_chars = set('<>:"|?*/\\\'') 156 | found_invalid = [char for char in invalid_chars if char in file] 157 | 158 | if found_invalid: 159 | char_list = ", ".join(f"'{char}'" for char in found_invalid) 160 | result.issues.append( 161 | ValidationIssue( 162 | severity=Severity.ERROR, 163 | category=CheckCategory.FILESYSTEM, 164 | file=str(file_path.relative_to(path)), 165 | message=f"Filename contains invalid characters ({char_list}): {file}", 166 | suggestion="Remove special characters from filename (use only letters, numbers, underscores, hyphens, and periods)", 167 | ) 168 | ) 169 | 170 | # Check for proprietary file formats 171 | file_ext = "." + file.split(".")[-1] if "." in file else "" 172 | file_ext_lower = file_ext.lower() 173 | 174 | if file_ext_lower in PROPRIETARY_FORMATS: 175 | result.issues.append( 176 | ValidationIssue( 177 | severity=Severity.WARNING, 178 | category=CheckCategory.FILESYSTEM, 179 | file=str(file_path.relative_to(path)), 180 | message=f"Proprietary file format detected: {file}", 181 | suggestion=f"{PROPRIETARY_FORMATS[file_ext_lower]}", 182 | ) 183 | ) 184 | 185 | return result 186 | 187 | 188 | def _format_size(size_bytes: int) -> str: 189 | """Format byte size as human-readable string.""" 190 | for unit in ["B", "KB", "MB", "GB", "TB"]: 191 | if size_bytes < 1024.0: 192 | return f"{size_bytes:.1f} {unit}" 193 | size_bytes /= 1024.0 194 | return f"{size_bytes:.1f} PB" 195 | -------------------------------------------------------------------------------- /physionet/validate/models.py: -------------------------------------------------------------------------------- 1 | """Data models for validation results.""" 2 | 3 | from dataclasses import dataclass, field 4 | from typing import List, Optional, Dict, Any 5 | from enum import Enum 6 | from datetime import datetime 7 | import textwrap 8 | 9 | 10 | class Severity(Enum): 11 | """Severity levels for validation issues.""" 12 | ERROR = "error" 13 | WARNING = "warning" 14 | INFO = "info" 15 | 16 | 17 | class CheckCategory(Enum): 18 | """Categories of validation checks.""" 19 | FILESYSTEM = "filesystem" 20 | DOCUMENTATION = "documentation" 21 | INTEGRITY = "integrity" 22 | QUALITY = "quality" 23 | PRIVACY = "privacy" 24 | 25 | 26 | @dataclass 27 | class ValidationIssue: 28 | """Represents a single validation issue.""" 29 | severity: Severity 30 | category: CheckCategory 31 | message: str 32 | file: Optional[str] = None 33 | line: Optional[int] = None 34 | column: Optional[str] = None 35 | value: Optional[str] = None 36 | suggestion: Optional[str] = None 37 | 38 | def to_dict(self) -> Dict[str, Any]: 39 | """Convert issue to dictionary format.""" 40 | result = { 41 | "severity": self.severity.value, 42 | "category": self.category.value, 43 | "message": self.message, 44 | } 45 | if self.file: 46 | result["file"] = self.file 47 | if self.line is not None: 48 | result["line"] = 
self.line 49 | if self.column: 50 | result["column"] = self.column 51 | if self.value: 52 | result["value"] = self.value 53 | if self.suggestion: 54 | result["suggestion"] = self.suggestion 55 | return result 56 | 57 | 58 | @dataclass 59 | class CheckResult: 60 | """Results from a specific category of checks.""" 61 | category: CheckCategory 62 | issues: List[ValidationIssue] = field(default_factory=list) 63 | 64 | @property 65 | def status(self) -> str: 66 | """Get overall status for this check category.""" 67 | if any(issue.severity == Severity.ERROR for issue in self.issues): 68 | return "error" 69 | elif any(issue.severity == Severity.WARNING for issue in self.issues): 70 | return "warning" 71 | return "pass" 72 | 73 | @property 74 | def error_count(self) -> int: 75 | """Count of errors in this category.""" 76 | return sum(1 for issue in self.issues if issue.severity == Severity.ERROR) 77 | 78 | @property 79 | def warning_count(self) -> int: 80 | """Count of warnings in this category.""" 81 | return sum(1 for issue in self.issues if issue.severity == Severity.WARNING) 82 | 83 | @property 84 | def info_count(self) -> int: 85 | """Count of info messages in this category.""" 86 | return sum(1 for issue in self.issues if issue.severity == Severity.INFO) 87 | 88 | 89 | @dataclass 90 | class DatasetStats: 91 | """Statistics about the dataset being validated.""" 92 | total_size_bytes: int = 0 93 | file_count: int = 0 94 | directory_count: int = 0 95 | 96 | 97 | @dataclass 98 | class ValidationResult: 99 | """Complete validation results for a dataset.""" 100 | dataset_path: str 101 | timestamp: str 102 | check_results: Dict[CheckCategory, CheckResult] = field(default_factory=dict) 103 | dataset_stats: DatasetStats = field(default_factory=DatasetStats) 104 | 105 | @property 106 | def total_errors(self) -> int: 107 | """Total count of errors across all checks.""" 108 | return sum(result.error_count for result in self.check_results.values()) 109 | 110 | @property 111 | def total_warnings(self) -> int: 112 | """Total count of warnings across all checks.""" 113 | return sum(result.warning_count for result in self.check_results.values()) 114 | 115 | @property 116 | def total_info(self) -> int: 117 | """Total count of info messages across all checks.""" 118 | return sum(result.info_count for result in self.check_results.values()) 119 | 120 | @property 121 | def status(self) -> str: 122 | """Overall validation status.""" 123 | if self.total_errors > 0: 124 | return "error" 125 | elif self.total_warnings > 0: 126 | return "warning" 127 | return "pass" 128 | 129 | def summary(self) -> str: 130 | """Generate a human-readable summary.""" 131 | # Format timestamp as human-readable 132 | try: 133 | dt = datetime.fromisoformat(self.timestamp.replace('Z', '+00:00')) 134 | formatted_timestamp = dt.strftime("%Y-%m-%d %H:%M:%S UTC") 135 | except (ValueError, AttributeError): 136 | formatted_timestamp = self.timestamp 137 | 138 | # Get package version 139 | try: 140 | import physionet 141 | validator_version = physionet.__version__ 142 | except (ImportError, AttributeError): 143 | validator_version = "unknown" 144 | 145 | lines = [] 146 | 147 | # Section 1: Metadata 148 | lines.extend([ 149 | "PhysioNet Dataset Validation Report", 150 | "=" * 50, 151 | "", 152 | "Metadata:", 153 | f" Dataset: {self.dataset_path}", 154 | f" Validator version: {validator_version}", 155 | f" Timestamp: {formatted_timestamp}", 156 | f" Total size: {self._format_size(self.dataset_stats.total_size_bytes)} " 157 | 
f"({self.dataset_stats.file_count} files)", 158 | "", 159 | ]) 160 | 161 | # Section 2: Validation Results 162 | lines.extend([ 163 | "Validation Results:", 164 | "=" * 50, 165 | ]) 166 | 167 | first_category = True 168 | for category, result in self.check_results.items(): 169 | # Add blank line before each category (except first) 170 | if not first_category: 171 | lines.append("") 172 | first_category = False 173 | 174 | # Only show ✗ for errors, ✓ for pass or warnings-only 175 | status_icon = "✗" if result.error_count > 0 else "✓" 176 | issue_summary = "" 177 | if result.error_count or result.warning_count: 178 | parts = [] 179 | if result.error_count: 180 | parts.append(f"{result.error_count} error{'s' if result.error_count != 1 else ''}") 181 | if result.warning_count: 182 | parts.append(f"{result.warning_count} warning{'s' if result.warning_count != 1 else ''}") 183 | issue_summary = f" ({', '.join(parts)})" 184 | 185 | lines.append(f"{status_icon} {category.value.replace('_', ' ').title()}{issue_summary}") 186 | 187 | for issue in result.issues: 188 | icon = "✗" if issue.severity == Severity.ERROR else "⚠" 189 | location = f" {issue.file}" 190 | if issue.line: 191 | location += f":{issue.line}" 192 | lines.append(f" {icon}{location} - {issue.message}") 193 | 194 | lines.append("") 195 | 196 | # Section 3: Summary 197 | lines.extend([ 198 | "Summary:", 199 | "=" * 50, 200 | f"{self.total_errors} error{'s' if self.total_errors != 1 else ''}, " 201 | f"{self.total_warnings} warning{'s' if self.total_warnings != 1 else ''}", 202 | "", 203 | ]) 204 | 205 | if self.status == "error": 206 | lines.append("✗ Dataset has errors that must be fixed before submission") 207 | elif self.status == "warning": 208 | lines.append("⚠ Dataset has warnings that should be addressed before submission") 209 | else: 210 | lines.append("✓ Dataset passed validation") 211 | 212 | # Add recommendations section if there are issues 213 | recommendations = self._generate_recommendations() 214 | if recommendations: 215 | lines.extend([ 216 | "", 217 | "Recommendations:", 218 | "=" * 50, 219 | ]) 220 | lines.extend(recommendations) 221 | 222 | # Add note about including validation report in submission 223 | note_text = "Note: A validation report (PHYSIONET_REPORT.md) has been saved in your dataset folder. Please include this file in your final submission." 
224 | lines.append("") 225 | lines.extend(self._wrap_text(note_text)) 226 | 227 | # Add footer with package information 228 | lines.extend([ 229 | "", 230 | "=" * 50, 231 | "This report was generated by the PhysioNet Python package.", 232 | "Install: pip install physionet", 233 | "Learn more: https://github.com/MIT-LCP/physionet", 234 | ]) 235 | 236 | return "\n".join(lines) + "\n" 237 | 238 | def to_dict(self) -> Dict[str, Any]: 239 | """Convert validation result to dictionary format.""" 240 | return { 241 | "dataset_path": self.dataset_path, 242 | "timestamp": self.timestamp, 243 | "dataset_stats": { 244 | "total_size_bytes": self.dataset_stats.total_size_bytes, 245 | "file_count": self.dataset_stats.file_count, 246 | "directory_count": self.dataset_stats.directory_count, 247 | }, 248 | "summary": { 249 | "total_errors": self.total_errors, 250 | "total_warnings": self.total_warnings, 251 | "total_info": self.total_info, 252 | "status": self.status, 253 | }, 254 | "checks": { 255 | category.value: { 256 | "status": result.status, 257 | "issues": [issue.to_dict() for issue in result.issues], 258 | } 259 | for category, result in self.check_results.items() 260 | }, 261 | } 262 | 263 | def _generate_recommendations(self) -> List[str]: 264 | """Generate actionable recommendations based on issues found.""" 265 | recommendations = [] 266 | 267 | # Check for very large datasets (>200GB) 268 | size_gb = self.dataset_stats.total_size_bytes / (1024 ** 3) 269 | if size_gb > 200: 270 | recommendations.append("\nDataset Size:") 271 | large_dataset_text = ( 272 | f" ℹ Your dataset is very large ({self._format_size(self.dataset_stats.total_size_bytes)}). " 273 | "If you need assistance uploading large files, please contact the PhysioNet team at contact@physionet.org" 274 | ) 275 | recommendations.extend(self._wrap_text(large_dataset_text, indent=" ")) 276 | 277 | # Collect unique suggestions from all issues 278 | suggestions_by_category = {} 279 | 280 | for category, result in self.check_results.items(): 281 | category_suggestions = {} 282 | 283 | for issue in result.issues: 284 | if issue.suggestion: 285 | # Group by suggestion to avoid duplicates 286 | if issue.suggestion not in category_suggestions: 287 | category_suggestions[issue.suggestion] = { 288 | 'severity': issue.severity, 289 | 'count': 0 290 | } 291 | category_suggestions[issue.suggestion]['count'] += 1 292 | 293 | if category_suggestions: 294 | suggestions_by_category[category] = category_suggestions 295 | 296 | # Generate recommendations by category 297 | for category, suggestions in suggestions_by_category.items(): 298 | if not suggestions: 299 | continue 300 | 301 | recommendations.append(f"\n{category.value.replace('_', ' ').title()}:") 302 | 303 | # Sort by severity (errors first) and then by count 304 | sorted_suggestions = sorted( 305 | suggestions.items(), 306 | key=lambda x: (x[1]['severity'] != Severity.ERROR, -x[1]['count']) 307 | ) 308 | 309 | for suggestion, info in sorted_suggestions: 310 | count = info['count'] 311 | icon = "✗" if info['severity'] == Severity.ERROR else "⚠" 312 | count_str = f" ({count} file{'s' if count != 1 else ''})" if count > 1 else "" 313 | suggestion_text = f" {icon} {suggestion}{count_str}" 314 | # Wrap long suggestions 315 | wrapped = self._wrap_text(suggestion_text, indent=" ") 316 | recommendations.extend(wrapped) 317 | 318 | return recommendations 319 | 320 | @staticmethod 321 | def _format_size(size_bytes: int) -> str: 322 | """Format byte size as human-readable string.""" 323 | for unit in 
["B", "KB", "MB", "GB", "TB"]: 324 | if size_bytes < 1024.0: 325 | return f"{size_bytes:.1f} {unit}" 326 | size_bytes /= 1024.0 327 | return f"{size_bytes:.1f} PB" 328 | 329 | @staticmethod 330 | def _wrap_text(text: str, width: int = 80, indent: str = " ") -> List[str]: 331 | """Wrap text to specified width with continuation indent.""" 332 | # Use textwrap to wrap the text 333 | wrapped = textwrap.fill(text, width=width, subsequent_indent=indent) 334 | return wrapped.split('\n') 335 | -------------------------------------------------------------------------------- /physionet/validate/checks/privacy.py: -------------------------------------------------------------------------------- 1 | """Privacy and PHI validation checks.""" 2 | 3 | import csv 4 | import os 5 | import re 6 | from pathlib import Path 7 | from typing import Optional, Callable 8 | 9 | from physionet.validate.models import CheckResult, ValidationIssue, CheckCategory, Severity 10 | from physionet.validate.config import ValidationConfig 11 | 12 | # Pattern names for better error messages 13 | PHI_PATTERN_NAMES = { 14 | r"\b\d{3}-\d{2}-\d{4}\b": "SSN", 15 | r"\b[\w\.-]+@[\w\.-]+\.\w+\b": "email address", 16 | r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b": "phone number", 17 | } 18 | 19 | # Sensitive configuration files that should not be included in datasets 20 | SENSITIVE_FILES = { 21 | # API keys and credentials 22 | ".env": "environment variables (may contain API keys)", 23 | ".env.local": "local environment variables", 24 | ".env.production": "production environment variables", 25 | "credentials.json": "credential file", 26 | "secrets.json": "secrets file", 27 | "config.json": "configuration file (may contain credentials)", 28 | ".aws/credentials": "AWS credentials", 29 | ".aws/config": "AWS configuration", 30 | 31 | # SSH and certificates 32 | "id_rsa": "SSH private key", 33 | "id_dsa": "SSH private key", 34 | "id_ecdsa": "SSH private key", 35 | "id_ed25519": "SSH private key", 36 | ".pem": "private certificate/key", 37 | ".key": "private key", 38 | ".p12": "certificate file", 39 | ".pfx": "certificate file", 40 | 41 | # Database 42 | ".pgpass": "PostgreSQL password file", 43 | ".my.cnf": "MySQL configuration (may contain passwords)", 44 | 45 | # Other sensitive files 46 | ".netrc": "authentication credentials", 47 | ".htpasswd": "HTTP authentication", 48 | "docker-compose.override.yml": "Docker override (may contain secrets)", 49 | } 50 | 51 | 52 | def check_privacy(path: Path, config: ValidationConfig, progress_callback: Optional[Callable[[str], None]] = None) -> CheckResult: 53 | """ 54 | Check for potential privacy issues and PHI. 
55 | 56 | Validates: 57 | - PHI pattern detection 58 | - Age de-identification 59 | - Sensitive configuration files (keys, credentials) 60 | - Date patterns 61 | 62 | Args: 63 | path: Path to dataset directory 64 | config: Validation configuration 65 | progress_callback: Optional callback to report progress 66 | 67 | Returns: 68 | CheckResult with any privacy issues found 69 | """ 70 | result = CheckResult(category=CheckCategory.PRIVACY) 71 | 72 | # Check for sensitive configuration files 73 | if progress_callback: 74 | progress_callback("Checking for sensitive configuration files") 75 | _check_sensitive_files(path, config, result) 76 | 77 | # Compile PHI patterns with names 78 | pattern_info = [(re.compile(pattern), PHI_PATTERN_NAMES.get(pattern, "unknown pattern")) 79 | for pattern in config.phi_patterns] 80 | 81 | # Check CSV files 82 | csv_files = list(path.rglob("*.csv")) 83 | for i, csv_file in enumerate(csv_files): 84 | if progress_callback: 85 | progress_callback(f"Checking {csv_file.name} ({i+1}/{len(csv_files)} CSV files)") 86 | 87 | if any(p in str(csv_file) for p in config.ignore_patterns): 88 | continue 89 | 90 | _check_csv_privacy(csv_file, path, config, pattern_info, result) 91 | 92 | # Check text files for PHI 93 | text_files = list(path.rglob("*.txt")) 94 | for i, text_file in enumerate(text_files): 95 | if progress_callback: 96 | progress_callback(f"Checking {text_file.name} ({i+1}/{len(text_files)} text files)") 97 | 98 | if any(p in str(text_file) for p in config.ignore_patterns): 99 | continue 100 | 101 | _check_text_file_privacy(text_file, path, pattern_info, result, config) 102 | 103 | return result 104 | 105 | 106 | def _check_sensitive_files(path: Path, config: ValidationConfig, result: CheckResult) -> None: 107 | """Check for sensitive configuration files that shouldn't be in the dataset.""" 108 | for root, dirs, files in os.walk(path): 109 | # Filter out ignored directories 110 | dirs[:] = [d for d in dirs if not any(p in d for p in config.ignore_patterns)] 111 | 112 | for file in files: 113 | file_path = Path(root) / file 114 | relative_path = str(file_path.relative_to(path)) 115 | 116 | # Skip ignored files 117 | if any(p in str(file_path) for p in config.ignore_patterns): 118 | continue 119 | 120 | # Check exact filename matches 121 | if file in SENSITIVE_FILES: 122 | result.issues.append( 123 | ValidationIssue( 124 | severity=Severity.ERROR, 125 | category=CheckCategory.PRIVACY, 126 | file=relative_path, 127 | message=f"Sensitive file detected: {SENSITIVE_FILES[file]}", 128 | suggestion=f"Remove '{file}' from the dataset before submission", 129 | ) 130 | ) 131 | continue 132 | 133 | # Check file extensions for sensitive files 134 | for sensitive_name, description in SENSITIVE_FILES.items(): 135 | # Check if it's an extension pattern (starts with .) 136 | if sensitive_name.startswith(".") and "." in file: 137 | ext = "." 
+ file.split(".")[-1] 138 | if ext == sensitive_name: 139 | result.issues.append( 140 | ValidationIssue( 141 | severity=Severity.ERROR, 142 | category=CheckCategory.PRIVACY, 143 | file=relative_path, 144 | message=f"Sensitive file detected: {description}", 145 | suggestion=f"Remove '{file}' from the dataset before submission", 146 | ) 147 | ) 148 | break 149 | 150 | # Check for common patterns in filenames 151 | lower_file = file.lower() 152 | if any(keyword in lower_file for keyword in ["password", "secret", "token", "apikey", "api_key"]): 153 | result.issues.append( 154 | ValidationIssue( 155 | severity=Severity.WARNING, 156 | category=CheckCategory.PRIVACY, 157 | file=relative_path, 158 | message=f"File name suggests sensitive content: '{file}'", 159 | suggestion="Review file contents and remove if it contains credentials or keys", 160 | ) 161 | ) 162 | 163 | 164 | def _check_csv_privacy( 165 | csv_file: Path, 166 | base_path: Path, 167 | config: ValidationConfig, 168 | pattern_info: list, 169 | result: CheckResult 170 | ) -> None: 171 | """Check a CSV file for privacy issues.""" 172 | relative_path = str(csv_file.relative_to(base_path)) 173 | 174 | # Track which columns have which types of issues (to report only once per column) 175 | # Maps column name to the pattern name that matched 176 | phi_columns = {} # {column: pattern_name} 177 | age_columns = set() # Columns with age violations 178 | 179 | try: 180 | with open(csv_file, "r", encoding="utf-8") as f: 181 | reader = csv.DictReader(f) 182 | 183 | # Determine if we should sample this file 184 | rows_scanned = 0 185 | max_rows = config.max_rows_to_scan 186 | 187 | # Count total rows first if we're sampling 188 | if config.sample_large_files and max_rows: 189 | # Read all rows into list to enable sampling 190 | all_rows = list(reader) 191 | total_rows = len(all_rows) 192 | 193 | if total_rows > max_rows: 194 | # Sample evenly distributed rows 195 | import random 196 | random.seed(42) # Deterministic sampling 197 | step = total_rows / max_rows 198 | sampled_indices = [int(i * step) for i in range(max_rows)] 199 | rows_to_scan = [all_rows[i] for i in sampled_indices] 200 | is_sampled = True 201 | else: 202 | rows_to_scan = all_rows 203 | is_sampled = False 204 | else: 205 | # No sampling, but still respect max_rows limit 206 | rows_to_scan = reader 207 | is_sampled = False 208 | 209 | for line_num, row in enumerate(rows_to_scan, start=2): # Start at 2 (after header) 210 | # Stop if we've hit the limit (when not sampling) 211 | if max_rows and not is_sampled and rows_scanned >= max_rows: 212 | break 213 | rows_scanned += 1 214 | 215 | for col, value in row.items(): 216 | if not value: 217 | continue 218 | 219 | value_str = str(value).strip() 220 | 221 | # Check for PHI patterns (only track if not already found in this column) 222 | if col not in phi_columns: 223 | for pattern, pattern_name in pattern_info: 224 | if pattern.search(value_str): 225 | phi_columns[col] = pattern_name 226 | break 227 | 228 | # Check for age violations (only track if not already found in this column) 229 | if col not in age_columns and "age" in col.lower(): 230 | try: 231 | age_value = float(value_str) 232 | if age_value > config.allowed_age_max: 233 | age_columns.add(col) 234 | except ValueError: 235 | pass 236 | 237 | # Report one issue per column type with specific pattern info 238 | for col, pattern_name in phi_columns.items(): 239 | result.issues.append( 240 | ValidationIssue( 241 | severity=Severity.WARNING, 242 | category=CheckCategory.PRIVACY, 243 | 
243 |                     file=relative_path,
244 |                     column=col,
245 |                     message=f"Potential private information detected in column '{col}' (pattern: {pattern_name})",
246 |                     suggestion="Review and remove or de-identify sensitive information",
247 |                 )
248 |             )
249 | 
250 |         for col in age_columns:
251 |             result.issues.append(
252 |                 ValidationIssue(
253 |                     severity=Severity.WARNING,
254 |                     category=CheckCategory.PRIVACY,
255 |                     file=relative_path,
256 |                     column=col,
257 |                     message=f"Ages exceeding HIPAA limit of {config.allowed_age_max} found in column '{col}'",
258 |                     suggestion=f"De-identify ages >{config.allowed_age_max} (e.g., set to {config.allowed_age_max}+)",
259 |                 )
260 |             )
261 | 
262 |     except Exception as e:
263 |         result.issues.append(
264 |             ValidationIssue(
265 |                 severity=Severity.WARNING,
266 |                 category=CheckCategory.PRIVACY,
267 |                 file=relative_path,
268 |                 message=f"Could not perform privacy checks: {e}",
269 |             )
270 |         )
271 | 
272 | 
273 | def _check_text_file_privacy(text_file: Path, base_path: Path, pattern_info: list, result: CheckResult, config: ValidationConfig) -> None:
274 |     """Check a text file for privacy issues."""
275 |     relative_path = str(text_file.relative_to(base_path))
276 |     detected_patterns = set()
277 | 
278 |     try:
279 |         with open(text_file, "r", encoding="utf-8") as f:
280 |             content = f.read()
281 | 
282 |         # Check each line for PHI patterns and track which ones are found
283 |         for line in content.splitlines():
284 |             for pattern, pattern_name in pattern_info:
285 |                 if pattern.search(line):
286 |                     detected_patterns.add(pattern_name)
287 | 
288 |         # Report once per file with specific patterns found
289 |         if detected_patterns:
290 |             patterns_str = ", ".join(sorted(detected_patterns))
291 |             result.issues.append(
292 |                 ValidationIssue(
293 |                     severity=Severity.WARNING,
294 |                     category=CheckCategory.PRIVACY,
295 |                     file=relative_path,
296 |                     message=f"Potential private information detected ({patterns_str})",
297 |                     suggestion="Review and remove or de-identify sensitive information",
298 |                 )
299 |             )
300 | 
301 |     except UnicodeDecodeError:
302 |         # Skip binary files
303 |         pass
304 |     except Exception as e:
305 |         result.issues.append(
306 |             ValidationIssue(
307 |                 severity=Severity.WARNING,
308 |                 category=CheckCategory.PRIVACY,
309 |                 file=relative_path,
310 |                 message=f"Could not perform privacy checks: {e}",
311 |             )
312 |         )
313 | 
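314 | 
315 | # Usage sketch (illustrative; commented out so the module stays import-only).
316 | # It relies only on the surface visible in this module: check_privacy takes a
317 | # dataset path, a ValidationConfig, and an optional progress callback, and
318 | # returns a CheckResult whose .issues holds ValidationIssue records. The
319 | # dataset path "my-dataset" is a placeholder.
320 | #
321 | #     from pathlib import Path
322 | #     from physionet.validate import ValidationConfig
323 | #     from physionet.validate.checks import check_privacy
324 | #
325 | #     result = check_privacy(Path("my-dataset"), ValidationConfig(),
326 | #                            progress_callback=print)
327 | #     for issue in result.issues:
328 | #         print(issue.severity, issue.file, issue.message)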
".hidden").write_text("test") 34 | 35 | config = ValidationConfig() 36 | result = check_filesystem(tmp_path, config) 37 | 38 | assert any(issue.file and ".hidden" in issue.file for issue in result.issues) 39 | 40 | def test_detects_temp_files(self, tmp_path): 41 | """Test that temporary files are detected.""" 42 | (tmp_path / "file.txt~").write_text("test") 43 | (tmp_path / "temp.tmp").write_text("test") 44 | 45 | config = ValidationConfig() 46 | result = check_filesystem(tmp_path, config) 47 | 48 | assert len(result.issues) >= 2 49 | 50 | def test_detects_empty_files(self, tmp_path): 51 | """Test that empty files are detected.""" 52 | (tmp_path / "empty.txt").write_text("") 53 | 54 | config = ValidationConfig() 55 | result = check_filesystem(tmp_path, config) 56 | 57 | assert any("Empty file" in issue.message for issue in result.issues) 58 | 59 | def test_detects_invalid_filename_characters(self, tmp_path): 60 | """Test that invalid filename characters are detected.""" 61 | # Note: This test might not work on all filesystems 62 | try: 63 | (tmp_path / "file.txt").write_text("test") 64 | config = ValidationConfig() 65 | result = check_filesystem(tmp_path, config) 66 | assert any("invalid characters" in issue.message.lower() for issue in result.issues) 67 | # Should show which character was found 68 | assert any("<" in issue.message for issue in result.issues) 69 | except OSError: 70 | # Skip test if filesystem doesn't allow these characters 71 | pytest.skip("Filesystem doesn't support invalid characters in filenames") 72 | 73 | def test_detects_path_separators_in_filenames(self, tmp_path): 74 | """Test that path separators and other awkward characters are flagged.""" 75 | # These characters should be caught even though they can't actually be in filenames on most systems 76 | # We test the validation logic by checking the character set 77 | from physionet.validate.checks.filesystem import check_filesystem 78 | 79 | # Create a file with a valid name for the actual test 80 | (tmp_path / "normalfile.txt").write_text("test") 81 | 82 | config = ValidationConfig() 83 | result = check_filesystem(tmp_path, config) 84 | 85 | # The check should flag files with /, \, quotes, etc if they could exist 86 | # Since we can't create such files, we verify the character set in the code includes them 87 | # This is tested indirectly through the previous test 88 | 89 | def test_detects_spaces_in_filenames(self, tmp_path): 90 | """Test that filenames with spaces are flagged.""" 91 | (tmp_path / "my data file.csv").write_text("col1,col2\n1,2\n") 92 | (tmp_path / "analysis results.txt").write_text("test") 93 | 94 | config = ValidationConfig() 95 | result = check_filesystem(tmp_path, config) 96 | 97 | # Should warn about both files with spaces 98 | space_warnings = [ 99 | issue for issue in result.issues 100 | if "spaces" in issue.message.lower() 101 | ] 102 | assert len(space_warnings) == 2 103 | assert any("my data file.csv" in issue.file for issue in space_warnings) 104 | assert any("analysis results.txt" in issue.file for issue in space_warnings) 105 | 106 | def test_detects_long_filenames(self, tmp_path): 107 | """Test that excessively long filenames are flagged.""" 108 | # Create a file with a very long name (120 characters total) 109 | long_name = "a" * 116 + ".csv" # 116 + 4 = 120 characters 110 | (tmp_path / long_name).write_text("col1,col2\n1,2\n") 111 | 112 | config = ValidationConfig() 113 | result = check_filesystem(tmp_path, config) 114 | 115 | # Should warn about long filename 116 | long_warnings = [ 
117 |             issue for issue in result.issues
118 |             if "very long" in issue.message.lower()
119 |         ]
120 |         assert len(long_warnings) == 1
121 |         assert "120 characters" in long_warnings[0].message
122 | 
123 |     def test_detects_extremely_long_filenames(self, tmp_path):
124 |         """Test that filenames exceeding maximum length are errors."""
125 |         # Create a file with a name exceeding 255 characters
126 |         extreme_name = "b" * 256 + ".csv"  # 256 + 4 = 260 characters
127 |         try:
128 |             (tmp_path / extreme_name).write_text("col1,col2\n1,2\n")
129 | 
130 |             config = ValidationConfig()
131 |             result = check_filesystem(tmp_path, config)
132 | 
133 |             # Should error about exceeding maximum length
134 |             length_errors = [
135 |                 issue for issue in result.issues
136 |                 if "exceeds maximum length" in issue.message.lower()
137 |             ]
138 |             assert len(length_errors) == 1
139 |             assert "260 characters" in length_errors[0].message
140 |         except OSError:
141 |             # Skip test if filesystem doesn't support such long names
142 |             pytest.skip("Filesystem doesn't support filenames over 255 characters")
143 | 
144 |     def test_detects_proprietary_formats(self, tmp_path):
145 |         """Test that proprietary file formats are flagged."""
146 |         # Create files with proprietary formats
147 |         (tmp_path / "data.xlsx").write_text("test")
148 |         (tmp_path / "analysis.mat").write_text("test")
149 |         (tmp_path / "results.sas7bdat").write_text("test")
150 | 
151 |         config = ValidationConfig()
152 |         result = check_filesystem(tmp_path, config)
153 | 
154 |         # Should warn about proprietary data formats (not .docx which is allowed)
155 |         proprietary_warnings = [
156 |             issue for issue in result.issues
157 |             if "proprietary file format" in issue.message.lower()
158 |         ]
159 |         assert len(proprietary_warnings) == 3
160 | 
161 |         # Check that suggestions include alternatives
162 |         suggestions = [issue.suggestion for issue in proprietary_warnings]
163 |         assert any(".csv" in s or ".parquet" in s for s in suggestions)
164 |         assert any(".zarr" in s for s in suggestions)
165 | 
166 |     def test_allows_open_formats(self, tmp_path):
167 |         """Test that open file formats are not flagged."""
168 |         # Create files with open formats (including .docx which is now allowed)
169 |         (tmp_path / "README.md").write_text("# Test")
170 |         (tmp_path / "data.csv").write_text("col1,col2\n1,2\n")
171 |         (tmp_path / "signal.hdf5").write_text("test")
172 |         (tmp_path / "record.json").write_text("{}")
173 |         (tmp_path / "notes.txt").write_text("notes")
174 |         (tmp_path / "protocol.docx").write_text("test")  # .docx is now allowed
175 | 
176 |         config = ValidationConfig()
177 |         result = check_filesystem(tmp_path, config)
178 | 
179 |         # Should not warn about proprietary formats
180 |         proprietary_warnings = [
181 |             issue for issue in result.issues
182 |             if "proprietary file format" in issue.message.lower()
183 |         ]
184 |         assert len(proprietary_warnings) == 0
185 | 
186 | 
187 | class TestDocumentationChecks:
188 |     """Tests for documentation validation checks."""
189 | 
190 |     def test_readme_required_by_default(self, tmp_path):
191 |         """Test that README.md is required by default."""
192 |         config = ValidationConfig()
193 |         result = check_documentation(tmp_path, config)
194 | 
195 |         # Should have error for missing README.md
196 |         assert result.error_count == 1
197 |         assert any("README.md" in issue.message for issue in result.issues)
198 | 
199 |         # Should have helpful suggestion about minimum content
200 |         readme_issue = [issue for issue in result.issues if "README.md" in issue.message][0]
201 |         assert "title and a brief description" in readme_issue.suggestion
202 
| 203 | def test_custom_required_files(self, tmp_path): 204 | """Test that custom required files are validated.""" 205 | config = ValidationConfig(required_files=["README.md", "LICENSE"]) 206 | result = check_documentation(tmp_path, config) 207 | 208 | # Should have errors for both missing files 209 | assert result.error_count == 2 210 | assert any("README.md" in issue.message for issue in result.issues) 211 | assert any("LICENSE" in issue.message for issue in result.issues) 212 | 213 | def test_required_file_exists(self, tmp_path): 214 | """Test that existing required file passes validation.""" 215 | readme = tmp_path / "README.md" 216 | readme.write_text("# Title\n\nSome content.") 217 | 218 | config = ValidationConfig(required_files=["README.md"]) 219 | result = check_documentation(tmp_path, config) 220 | 221 | # Should have no errors since README exists 222 | assert result.error_count == 0 223 | 224 | 225 | class TestIntegrityChecks: 226 | """Tests for data integrity validation checks.""" 227 | 228 | def test_validates_valid_csv(self, tmp_path): 229 | """Test that valid CSV passes validation.""" 230 | csv_file = tmp_path / "data.csv" 231 | csv_file.write_text("col1,col2,col3\n1,2,3\n4,5,6\n") 232 | 233 | config = ValidationConfig() 234 | result = check_integrity(tmp_path, config) 235 | 236 | assert result.error_count == 0 237 | 238 | def test_detects_empty_csv(self, tmp_path): 239 | """Test that empty CSV is detected.""" 240 | csv_file = tmp_path / "data.csv" 241 | csv_file.write_text("") 242 | 243 | config = ValidationConfig() 244 | result = check_integrity(tmp_path, config) 245 | 246 | assert any("empty" in issue.message.lower() for issue in result.issues) 247 | 248 | def test_detects_duplicate_column_names(self, tmp_path): 249 | """Test that duplicate column names are detected.""" 250 | csv_file = tmp_path / "data.csv" 251 | csv_file.write_text("col1,col2,col1\n1,2,3\n") 252 | 253 | config = ValidationConfig() 254 | result = check_integrity(tmp_path, config) 255 | 256 | assert any("Duplicate" in issue.message for issue in result.issues) 257 | 258 | def test_detects_inconsistent_row_length(self, tmp_path): 259 | """Test that inconsistent row lengths are detected.""" 260 | csv_file = tmp_path / "data.csv" 261 | csv_file.write_text("col1,col2,col3\n1,2,3\n4,5\n6,7,8,9\n") 262 | 263 | config = ValidationConfig() 264 | result = check_integrity(tmp_path, config) 265 | 266 | # Should detect both short and long rows 267 | assert result.error_count >= 2 268 | 269 | def test_detects_encoding_issues(self, tmp_path): 270 | """Test that encoding issues are detected.""" 271 | csv_file = tmp_path / "data.csv" 272 | # Write invalid UTF-8 273 | csv_file.write_bytes(b"col1,col2\n1,\xff\xfe\n") 274 | 275 | config = ValidationConfig() 276 | result = check_integrity(tmp_path, config) 277 | 278 | assert any("encoding" in issue.message.lower() for issue in result.issues) 279 | 280 | 281 | class TestQualityChecks: 282 | """Tests for data quality validation checks.""" 283 | 284 | def test_detects_completely_empty_columns(self, tmp_path): 285 | """Test that completely empty columns (100% missing) are detected.""" 286 | csv_file = tmp_path / "data.csv" 287 | # Create CSV with one column that's 100% empty 288 | rows = ["col1,col2,col3\n"] 289 | for i in range(10): 290 | rows.append(f"{i},data,\n") 291 | csv_file.write_text("".join(rows)) 292 | 293 | config = ValidationConfig() 294 | result = check_quality(tmp_path, config) 295 | 296 | # Should detect the empty column 297 | assert any("empty" in 
issue.message.lower() and "col3" in str(issue.column) for issue in result.issues)
298 | 
299 |     def test_partial_missing_values_not_flagged(self, tmp_path):
300 |         """Test that partially missing columns (e.g., 75%) are not flagged."""
301 |         csv_file = tmp_path / "data.csv"
302 |         # Create CSV with 75% missing values in a column
303 |         rows = ["col1,col2\n"]
304 |         for i in range(100):
305 |             if i < 75:
306 |                 rows.append("1,\n")
307 |             else:
308 |                 rows.append("1,2\n")
309 |         csv_file.write_text("".join(rows))
310 | 
311 |         config = ValidationConfig()
312 |         result = check_quality(tmp_path, config)
313 | 
314 |         # Should NOT flag col2 since it has some data (25%)
315 |         assert not any("col2" in str(issue.column) for issue in result.issues)
316 | 
317 |     def test_detects_out_of_range_values(self, tmp_path):
318 |         """Test that out-of-range values are detected."""
319 |         csv_file = tmp_path / "data.csv"
320 |         csv_file.write_text("heart_rate\n80\n350\n75\n")
321 | 
322 |         config = ValidationConfig(value_ranges={"heart_rate": (20, 300)})
323 |         result = check_quality(tmp_path, config)
324 | 
325 |         assert any("outside expected range" in issue.message for issue in result.issues)
326 | 
327 | 
328 | class TestPrivacyChecks:
329 |     """Tests for privacy validation checks."""
330 | 
331 |     def test_date_format_not_flagged(self, tmp_path):
332 |         """Test that date formats (YYYY-MM-DD) are not automatically flagged as PHI.
333 | 
334 |         Dates are commonly used in medical datasets as de-identified timestamps.
335 |         They should not be flagged without additional context.
336 |         """
337 |         csv_file = tmp_path / "data.csv"
338 |         csv_file.write_text("patient_id,admission_date\n1,2023-05-15\n2,2023-06-20\n")
339 | 
340 |         config = ValidationConfig()
341 |         result = check_privacy(tmp_path, config)
342 | 
343 |         # Dates alone should not be flagged (PHI findings surface as warnings)
344 |         assert result.warning_count == 0 and result.error_count == 0
345 | 
346 |     def test_detects_email_addresses(self, tmp_path):
347 |         """Test that email addresses are detected as PHI."""
348 |         csv_file = tmp_path / "data.csv"
349 |         csv_file.write_text("patient_id,contact\n1,patient@example.com\n2,test@test.com\n")
350 | 
351 |         config = ValidationConfig()
352 |         result = check_privacy(tmp_path, config)
353 | 
354 |         # Should have one warning for the 'contact' column with pattern type
355 |         assert result.warning_count == 1
356 |         assert any(
357 |             issue.severity == Severity.WARNING
358 |             and "contact" in str(issue.column)
359 |             and "email address" in issue.message
360 |             for issue in result.issues
361 |         )
362 | 
363 |     def test_detects_age_violations(self, tmp_path):
364 |         """Test that ages over limit are detected."""
365 |         csv_file = tmp_path / "data.csv"
366 |         csv_file.write_text("patient_id,age\n1,92\n2,95\n3,85\n")
367 | 
368 |         config = ValidationConfig(allowed_age_max=89)
369 |         result = check_privacy(tmp_path, config)
370 | 
371 |         # Should have one warning for the age column (consolidated)
372 |         age_violations = [
373 |             issue for issue in result.issues
374 |             if "age" in issue.message.lower() and issue.severity == Severity.WARNING
375 |         ]
376 |         assert len(age_violations) == 1
377 |         assert "age" in age_violations[0].column.lower()
378 | 
379 |     def test_text_files_checked_for_phi(self, tmp_path):
380 |         """Test that text files are checked for PHI patterns."""
381 |         text_file = tmp_path / "notes.txt"
382 |         text_file.write_text("Contact: test@example.com\nPhone: 555-123-4567")
383 | 
384 |         config = ValidationConfig()
385 |         result = check_privacy(tmp_path, config)
386 | 
387 |         # Should detect private information patterns in text files as a single consolidated warning with pattern types
388 |         assert result.warning_count >= 1
389 |         assert any(
390 |             "private information detected" in issue.message
391 |             and ("email address" in issue.message or "phone number" in issue.message)
392 |             for issue in result.issues
393 |         )
394 | 
395 |     def test_allows_year_only_dates(self, tmp_path):
396 |         """Test that year-only dates are allowed."""
397 |         csv_file = tmp_path / "data.csv"
398 |         csv_file.write_text("patient_id,year\n1,2023\n2,2024\n")
399 | 
400 |         config = ValidationConfig()
401 |         result = check_privacy(tmp_path, config)
402 | 
403 |         # Should not flag year-only values (PHI findings surface as warnings)
404 |         phi_issues = [
405 |             issue for issue in result.issues
406 |             if issue.severity == Severity.WARNING
407 |         ]
408 |         assert len(phi_issues) == 0
409 | 
410 |     def test_detects_sensitive_config_files(self, tmp_path):
411 |         """Test that sensitive configuration files are detected."""
412 |         # Create some sensitive files
413 |         (tmp_path / ".env").write_text("API_KEY=secret123")
414 |         (tmp_path / "credentials.json").write_text('{"key": "value"}')
415 |         (tmp_path / "id_rsa").write_text("-----BEGIN RSA PRIVATE KEY-----")
416 | 
417 |         config = ValidationConfig()
418 |         result = check_privacy(tmp_path, config)
419 | 
420 |         # Should detect all three sensitive files as errors
421 |         sensitive_file_errors = [
422 |             issue for issue in result.issues
423 |             if issue.severity == Severity.ERROR and "Sensitive file detected" in issue.message
424 |         ]
425 |         assert len(sensitive_file_errors) == 3
426 | 
427 |     def test_detects_files_with_sensitive_names(self, tmp_path):
428 |         """Test that files with sensitive keywords in names are flagged."""
429 |         (tmp_path / "my_api_key.txt").write_text("some data")
430 |         (tmp_path / "database_password.csv").write_text("col1\nval1")
431 | 
432 |         config = ValidationConfig()
433 |         result = check_privacy(tmp_path, config)
434 | 
435 |         # Should warn about files with sensitive keywords in names
436 |         keyword_warnings = [
437 |             issue for issue in result.issues
438 |             if issue.severity == Severity.WARNING and "name suggests sensitive content" in issue.message
439 |         ]
440 |         assert len(keyword_warnings) >= 2
441 | 
442 |     def test_detects_key_file_extensions(self, tmp_path):
443 |         """Test that private key file extensions are detected."""
444 |         (tmp_path / "server.pem").write_text("certificate")
445 |         (tmp_path / "private.key").write_text("key data")
446 | 
447 |         config = ValidationConfig()
448 |         result = check_privacy(tmp_path, config)
449 | 
450 |         # Should detect both key files
451 |         key_errors = [
452 |             issue for issue in result.issues
453 |             if issue.severity == Severity.ERROR
454 |         ]
455 |         assert len(key_errors) >= 2
456 | 
457 |     def test_sampling_large_files(self, tmp_path):
458 |         """Test that large files are sampled for performance."""
459 |         csv_file = tmp_path / "large.csv"
460 | 
461 |         # Create a file with more rows than the sampling limit
462 |         rows = ["patient_id,email\n"]
463 |         for i in range(15000):  # More than default max_rows_to_scan (10000)
464 |             rows.append(f"{i},test{i}@example.com\n")
465 |         csv_file.write_text("".join(rows))
466 | 
467 |         config = ValidationConfig(max_rows_to_scan=1000, sample_large_files=True)
468 |         result = check_privacy(tmp_path, config)
469 | 
470 |         # Should still detect the email pattern even with sampling
471 |         assert result.warning_count >= 1
472 |         assert any("email" in str(issue.column) for issue in result.issues)
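473 | 
474 |     # The two tests below are illustrative sketches, not part of the original
475 |     # suite: they exercise behavior visible in check_privacy's source (the
476 |     # progress callback and config.ignore_patterns) but assume ValidationConfig
477 |     # accepts the keyword arguments named in their docstrings.
478 |     def test_progress_callback_reports_status(self, tmp_path):
479 |         """Sketch: the optional progress callback receives status strings.
480 | 
481 |         Assumes check_privacy accepts a progress_callback keyword, as its
482 |         docstring describes.
483 |         """
484 |         (tmp_path / "data.csv").write_text("patient_id\n1\n")
485 | 
486 |         messages = []
487 |         config = ValidationConfig()
488 |         check_privacy(tmp_path, config, progress_callback=messages.append)
489 | 
490 |         # The sensitive-configuration-file scan reports progress first
491 |         assert any("sensitive configuration files" in m for m in messages)
492 | 
493 |     def test_ignore_patterns_skip_matching_csv_files(self, tmp_path):
494 |         """Sketch: CSV files matching ignore_patterns are not scanned.
495 | 
496 |         Assumes ValidationConfig accepts an ignore_patterns kwarg, mirroring
497 |         the config.ignore_patterns substring checks in check_privacy.
498 |         """
499 |         (tmp_path / "skipme.csv").write_text("contact\npatient@example.com\n")
500 | 
501 |         config = ValidationConfig(ignore_patterns=["skipme"])
502 |         result = check_privacy(tmp_path, config)
503 | 
504 |         # The email would be flagged if the file were scanned
505 |         assert result.warning_count == 0
--------------------------------------------------------------------------------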