├── .flake8 ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── PULL_REQUEST_TEMPLATE.md ├── dependabot.yml └── workflows │ ├── code_coverage.yaml │ ├── docs.yml │ ├── linter.yml │ ├── pypi_upload.yml │ └── tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── benchmarking ├── README.md ├── airfoil │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── benchmarking_utils.py ├── breastcancer │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── checkerboard2x2 │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── checkerboard4x4 │ ├── data_manager.py │ ├── run.py │ └── visualisation.ipynb ├── classification_experiment_utils.py ├── concrete │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── creditcardfraud │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── diabetes │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── digitdataset │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── energy │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── fashionmnist │ ├── README.md │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── gaussianclouds │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── glass │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── mnist │ ├── data_manager.py │ ├── model.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── parkinsons │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── power │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── regression_experiment_utils.py ├── results_grid_analysis_ray_local.ipynb ├── seeds │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── striatum │ ├── data_manager.py │ ├── run.py │ └── visualisation.ipynb ├── synthclass1 │ ├── data_manager.py │ ├── run.py │ └── visualisation.ipynb ├── synthclass2 │ ├── data_manager.py │ ├── run.py │ └── visualisation.ipynb ├── synthclass3 │ ├── data_manager.py │ ├── run.py │ └── visualisation.ipynb ├── synthreg1 │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── synthreg2 │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── wine │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb └── yacht │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── default_configs └── pyl_trainer_base_config.json ├── docs ├── Makefile ├── images │ └── pyrelational_overview.png ├── make.bat └── source │ ├── _static │ ├── data_indices_diagram.png │ └── theme.css │ ├── conf.py │ ├── index.rst │ ├── notes │ ├── activelearning.rst │ ├── al_pipeline.png │ ├── benchmark_datasets.rst │ ├── eval.png │ ├── installation.rst │ ├── performance_comparison.png │ ├── quick_start.rst │ ├── training.png │ ├── using_the_model_api.rst │ ├── using_your_own_data.rst │ └── using_your_own_strategy.rst │ └── reference │ ├── data.rst │ ├── datasets.rst │ ├── informativeness.rst │ ├── models.rst │ ├── oracles.rst │ ├── pipeline.rst │ └── strategies.rst ├── examples ├── demo │ ├── ensemble_uncertainty_classification.py │ ├── lightning_diversity_classification.py │ ├── 
lightning_diversity_regression.py │ ├── lightning_mixed_regression.py │ ├── lightning_representative_classification.py │ ├── mcdropout_uncertainty_classification.py │ ├── mcdropout_uncertainty_regression.py │ ├── model_badge.py │ ├── model_gaussianprocesses.py │ ├── scikit_estimator.py │ └── utils │ │ ├── datasets.py │ │ └── ml_models.py └── notebooks │ ├── gaussian_processes.ipynb │ └── introduction.ipynb ├── pyproject.toml ├── pyrelational ├── __init__.py ├── batch_mode_samplers │ ├── __init__.py │ └── _batch_mode_samplers.py ├── data_managers │ ├── __init__.py │ └── data_manager.py ├── datasets │ ├── __init__.py │ ├── base.py │ ├── benchmark_datamanager.py │ ├── classification │ │ ├── __init__.py │ │ ├── andrea_et_al.py │ │ ├── fashion_mnist.py │ │ ├── ksenia_et_al.py │ │ ├── mnist.py │ │ ├── scikit_learn.py │ │ ├── synthetic.py │ │ ├── uci.py │ │ └── utils.py │ ├── download_utils.py │ ├── drugcomb.py │ ├── regression │ │ ├── __init__.py │ │ ├── scikit_learn.py │ │ ├── synthetic.py │ │ ├── uci.py │ │ └── utils.py │ └── uci_datasets.py ├── informativeness │ ├── __init__.py │ ├── abstract_scorers.py │ ├── classification_scorers.py │ ├── decorators.py │ ├── regression_scorers.py │ └── task_agnostic_scorers.py ├── model_managers │ ├── __init__.py │ ├── abstract_model_manager.py │ ├── ensemble_model_manager.py │ ├── lightning_model_manager.py │ ├── mcdropout_model_manager.py │ └── model_utils.py ├── oracles │ ├── __init__.py │ ├── abstract_oracle.py │ └── benchmark_oracle.py ├── pipeline │ ├── __init__.py │ └── pipeline.py ├── strategies │ ├── __init__.py │ ├── abstract_strategy.py │ ├── classification │ │ ├── __init__.py │ │ ├── classification_strategy.py │ │ ├── entropy_classification_strategy.py │ │ ├── least_confidence_strategy.py │ │ ├── marginal_confidence_strategy.py │ │ └── ratio_confidence_strategy.py │ ├── regression │ │ ├── __init__.py │ │ ├── bald_strategy.py │ │ ├── expected_improvement_strategy.py │ │ ├── greedy_strategy.py │ │ ├── regression_strategy.py │ │ ├── thompson_sampling_strategy.py │ │ ├── upper_confidence_bound_strategy.py │ │ └── variance_reduction_strategy.py │ └── task_agnostic │ │ ├── __init__.py │ │ ├── random_acquisition_strategy.py │ │ ├── relative_distance_strategy.py │ │ └── representative_sampling_strategy.py ├── types.py └── version.py ├── requirements ├── base_requirements.txt ├── dev_requirements.txt └── doc_requirements.txt ├── setup.py └── tests ├── __init__.py ├── data_managers ├── __init__.py └── test_data_manager.py ├── datasets ├── __init__.py ├── test_benchmark_datamanager.py ├── test_classification_datasets.py ├── test_regression_datasets.py └── test_uci_datasets.py ├── informativeness ├── __init__.py └── test_informativeness_scores.py ├── model_managers ├── __init__.py ├── test_ensemble_model_manager.py ├── test_mc_dropout_model_manager.py └── test_model_managers.py ├── oracles ├── __init__.py └── test_oracles.py ├── pipeline ├── __init__.py └── test_pipeline.py ├── samplers └── test_samplers.py ├── strategies ├── __init__.py ├── _agnostic_strategy_test_cases.py ├── _classification_strategy_test_cases.py ├── _regression_strategy_test_cases.py └── test_strategies.py └── test_utils.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | select = C,E,F,W,B,B950,C950,E950,F950,W950,D,D950,I,I950,N,N950,O,O950,R,R950,S,S950,T,T950,U,U950,V,V950,X,X950,Z,Z950 4 | extend-ignore = D100 D104 E203 E701 W503 F401 5 | 
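For reference, a minimal sketch of running the same checks locally that the linter workflow runs in CI (assuming the dev requirements from `requirements/dev_requirements.txt` are installed); flake8 picks up this `.flake8` configuration automatically from the repository root:

```bash
# Lint with the repository's .flake8 settings (120-character lines, extended ignore list)
flake8 --exclude env
# Verify formatting without rewriting files, as in the CI linter workflow
black . --exclude env --check
```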
-------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "[BUG]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behaviour: 15 | 1. Operating system: 16 | 2. Python version: 17 | 3. Output of `pip freeze`: 18 | 4. Run config and sample file: 19 | 20 | **Expected behaviour** 21 | A clear and concise description of what you expected to happen. 22 | 23 | 24 | **Additional context** 25 | Add any other context about the problem here. 26 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "[Feature]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | **For the title of this PR:** please follow the grammatical rules of a usual publication title, without capitalisation (except for the first letter). 2 | 3 | The title should NOT CONTAIN CODE: no dots, no parentheses, no backticks, no brackets, etc. It needs to be distinctive (not detailed) and succinct (not lengthy). 4 | Details of this PR will go in the description. **For the description of this PR:** please replace every line in curly brackets ( { like this } ) with an appropriate description following the guidance. 5 | 6 | Finally, **please remove this paragraph**. 7 | 8 | ## What is the goal of this PR? 9 | 10 | { In the form of a paragraph (only use bullet points if strictly necessary), please describe the goal of this PR, why it is valuable to achieve, and reference the related GitHub issues. This section will be automatically compiled into the release notes, so please: 11 | - describe the impact of the change in this PR on the _user_ of this repository (e.g. end user, contributor, developer). 12 | - describe the new product behaviour in _present tense_, and the old behaviour and how it's been changed in _past tense_. 13 | - use the _Royal We_: _"We"_ made changes, not _"I"_ made changes. } 14 | 15 | ## What are the changes implemented in this PR? 16 | 17 | { Please explain what you implemented and why your changes are the best way to achieve the goal(s) above. Please describe every method, class and package, by explaining: 18 | - its responsibility, 19 | - how it's expected to behave, and 20 | - how it relates to the adjacent methods/classes/packages it interacts with.
21 | 22 | This would allow the reviewer to understand your intentions in the code much better. If you're adding new classes, make sure these explanations are also included in the class header comments. Last but not least, please reference the GitHub issues to be automatically closed, such as 'closes #number'. } 23 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # Basic dependabot.yml file with 2 | # minimum configuration for one package manager (pip) 3 | 4 | version: 2 5 | updates: 6 | # Enable version updates for python 7 | - package-ecosystem: "pip" 8 | # Look for requirements files in the `requirements` directory 9 | directory: "requirements/" 10 | # Check for updates once a day 11 | schedule: 12 | interval: "daily" 13 | # Labels on pull requests for version updates only 14 | labels: 15 | - "ci" 16 | pull-request-branch-name: 17 | # Separate sections of the branch name with a hyphen 18 | # for example, `dependabot-npm_and_yarn-next_js-acorn-6.4.1` 19 | separator: "-" 20 | # Allow up to 5 open pull requests for pip dependencies 21 | open-pull-requests-limit: 5 22 | -------------------------------------------------------------------------------- /.github/workflows/code_coverage.yaml: -------------------------------------------------------------------------------- 1 | name: Pytest Coverage 2 | on: 3 | push: 4 | branches: 5 | - main 6 | 7 | jobs: 8 | live-test: 9 | name: Test 10 | runs-on: ubuntu-latest 11 | permissions: write-all 12 | steps: 13 | - name: Checkout code 14 | uses: actions/checkout@v2 15 | - uses: actions/setup-python@v2 16 | with: 17 | python-version: "3.10" 18 | - name: Install basic dependencies 19 | run: | 20 | pip install --upgrade pip==21.3.0 21 | pip install pytest-cov 22 | pip install -r requirements/dev_requirements.txt 23 | - name: Build coverage file 24 | run: | 25 | python -m pytest --junitxml=pytest.xml --cov=pyrelational tests/ | tee pytest-coverage.txt 26 | 27 | - name: Pytest coverage comment 28 | id: coverageComment 29 | uses: MishaKav/pytest-coverage-comment@main 30 | with: 31 | pytest-coverage-path: pytest-coverage.txt 32 | junitxml-path: pytest.xml 33 | 34 | - name: Check the output coverage 35 | run: | 36 | echo "Coverage Percentage - ${{ steps.coverageComment.outputs.coverage }}" 37 | echo "Coverage Color - ${{ steps.coverageComment.outputs.color }}" 38 | echo "Coverage Html - ${{ steps.coverageComment.outputs.coverageHtml }}" 39 | echo "Coverage Warnings - ${{ steps.coverageComment.outputs.warnings }}" 40 | echo "Coverage Errors - ${{ steps.coverageComment.outputs.errors }}" 41 | echo "Coverage Failures - ${{ steps.coverageComment.outputs.failures }}" 42 | echo "Coverage Skipped - ${{ steps.coverageComment.outputs.skipped }}" 43 | echo "Coverage Tests - ${{ steps.coverageComment.outputs.tests }}" 44 | echo "Coverage Time - ${{ steps.coverageComment.outputs.time }}" 45 | echo "Not Success Test Info - ${{ steps.coverageComment.outputs.notSuccessTestInfo }}" 46 | 47 | - name: Create the Badge 48 | uses: schneegans/dynamic-badges-action@v1.0.0 49 | with: 50 | auth: ${{ secrets.GIST_TOKEN }} 51 | gistID: 99eba16a0a4fad7eadf98ef938afe38c 52 | filename: pytest-coverage-comment.json 53 | label: Test Coverage 54 | message: ${{ steps.coverageComment.outputs.coverage }} 55 | color: ${{ steps.coverageComment.outputs.color }} 56 | namedLogo: python 57 |
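A minimal sketch of reproducing the coverage run locally (assuming `pytest-cov` and the dev requirements are installed), mirroring the "Build coverage file" step above:

```bash
# Same invocation as the workflow: JUnit XML for the comment action, coverage measured over pyrelational/
python -m pytest --junitxml=pytest.xml --cov=pyrelational tests/ | tee pytest-coverage.txt
```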
-------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: Documentation 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | 9 | jobs: 10 | 11 | make_html: 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v2 16 | - name: Set up Python 3.10 17 | uses: actions/setup-python@v2 18 | with: 19 | python-version: "3.10" 20 | - name: Install main package 21 | run: | 22 | pip install -e . 23 | - name: Install internal dependencies 24 | run: | 25 | pip install sphinx 26 | pip install sphinx_rtd_theme 27 | - name: Build documentation 28 | run: | 29 | cd docs && make clean && make html 30 | -------------------------------------------------------------------------------- /.github/workflows/linter.yml: -------------------------------------------------------------------------------- 1 | name: lint-task 2 | # Run this workflow every time a new commit is pushed to your repository 3 | on: 4 | pull_request: 5 | 6 | jobs: 7 | 8 | # Set the job key. The key is displayed as the job name 9 | # when a job name is not provided 10 | run-lint-test: 11 | runs-on: ubuntu-latest 12 | name: lint 13 | # Name the Job 14 | steps: 15 | - name: Checkout code 16 | uses: actions/checkout@v2 17 | - uses: actions/setup-python@v2 18 | with: 19 | python-version: "3.10" 20 | - name: Install flake8 and plugins 21 | run: | 22 | pip install --upgrade pip==22.2 23 | python3 -m venv env 24 | source env/bin/activate 25 | pip install -r requirements/dev_requirements.txt 26 | - name: Run linter 27 | run: | 28 | source env/bin/activate 29 | flake8 --exclude env 30 | black . --exclude env --check 2>&1 >/dev/null 31 | - name: Run mypy 32 | run: | 33 | source env/bin/activate 34 | mypy pyrelational/ --allow-redefinition --disable-error-code import --disable-error-code no-untyped-call --disable-error-code no-redef --implicit-reexport --strict --install-types --non-interactive --ignore-missing-imports --follow-imports=silent 35 | - name: clean venv 36 | run: | 37 | rm -r env 38 | -------------------------------------------------------------------------------- /.github/workflows/pypi_upload.yml: -------------------------------------------------------------------------------- 1 | name: Upload to PyPI 2 | 3 | # Controls when the action will run.
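# Note: publishing requires the PYPI_TOKEN repository secret, which is passed to twine as TWINE_PASSWORD below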
4 | on: 5 | # Triggers the workflow when a release is created 6 | release: 7 | types: [created] 8 | 9 | # Allows you to run this workflow manually from the Actions tab 10 | workflow_dispatch: 11 | 12 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 13 | jobs: 14 | # This workflow contains a single job called "upload" 15 | upload: 16 | # The type of runner that the job will run on 17 | runs-on: ubuntu-latest 18 | 19 | # Steps represent a sequence of tasks that will be executed as part of the job 20 | steps: 21 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 22 | - uses: actions/checkout@v2 23 | 24 | # Sets up python 25 | - uses: actions/setup-python@v2 26 | with: 27 | python-version: 3.8 28 | 29 | # Install dependencies 30 | - name: "Installs dependencies" 31 | run: | 32 | python3 -m pip install --upgrade pip 33 | python3 -m pip install setuptools wheel twine 34 | 35 | # Build and upload to PyPI 36 | - name: "Builds and uploads to PyPI" 37 | run: | 38 | python3 setup.py sdist bdist_wheel 39 | python3 -m twine upload dist/* 40 | env: 41 | TWINE_USERNAME: __token__ 42 | TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} 43 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | # Run this workflow every time a new commit is pushed to your repository 4 | on: 5 | pull_request: 6 | 7 | jobs: 8 | 9 | # Set the job key. The key is displayed as the job name 10 | # when a job name is not provided 11 | run-tests: 12 | runs-on: ubuntu-latest 13 | strategy: 14 | matrix: 15 | python-version: ["3.9", "3.10", "3.11", "3.12"] 16 | 17 | name: Tests 18 | # Name the Job 19 | steps: 20 | - name: Checkout code 21 | uses: actions/checkout@v2 22 | - uses: actions/setup-python@v2 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install basic dependencies 26 | run: | 27 | pip install --upgrade pip 28 | pip install -r requirements/dev_requirements.txt 29 | pip install pytest-cov 30 | - name: Test with pytest 31 | run: | 32 | python -m pytest --cache-clear --cov=pyrelational tests > pytest-coverage.txt 33 | - name: Print error 34 | if: failure() 35 | run: | 36 | cat pytest-coverage.txt 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | .idea 3 | .vscode 4 | .DS_Store 5 | 6 | run_experiments.sh 7 | 8 | # Dev files 9 | deprecated/ 10 | examples/demo/experiment_logs/ 11 | experiment_logs/ 12 | test_data/ 13 | 14 | **/ray_benchmark_results/ 15 | 16 | # Checkpoints 17 | checkpoints/ 18 | 19 | # Ignoring MLflow generated files 20 | mlruns 21 | 22 | # Byte-compiled / optimized / DLL files 23 | __pycache__/ 24 | *.py[cod] 25 | *$py.class 26 | 27 | # C extensions 28 | *.so 29 | 30 | # Distribution / packaging 31 | .Python 32 | build/ 33 | develop-eggs/ 34 | dist/ 35 | downloads/ 36 | eggs/ 37 | .eggs/ 38 | lib/ 39 | lib64/ 40 | parts/ 41 | sdist/ 42 | var/ 43 | wheels/ 44 | pip-wheel-metadata/ 45 | share/python-wheels/ 46 | *.egg-info/ 47 | .installed.cfg 48 | *.egg 49 | MANIFEST 50 | 51 | # PyInstaller 52 | # Usually these files are written by a python script from a template 53 | # before PyInstaller builds the exe, so as to inject date/other info into it.
54 | *.manifest 55 | *.spec 56 | 57 | # Installer logs 58 | pip-log.txt 59 | pip-delete-this-directory.txt 60 | 61 | # Unit test / coverage reports 62 | htmlcov/ 63 | .tox/ 64 | .nox/ 65 | .coverage 66 | .coverage.* 67 | .cache 68 | nosetests.xml 69 | coverage.xml 70 | *.cover 71 | *.py,cover 72 | .hypothesis/ 73 | .pytest_cache/ 74 | 75 | # Translations 76 | *.mo 77 | *.pot 78 | 79 | # Django stuff: 80 | *.log 81 | local_settings.py 82 | db.sqlite3 83 | db.sqlite3-journal 84 | 85 | # Flask stuff: 86 | instance/ 87 | .webassets-cache 88 | 89 | # Scrapy stuff: 90 | .scrapy 91 | 92 | # Sphinx documentation 93 | docs/_build/ 94 | docs/build/ 95 | 96 | # PyBuilder 97 | target/ 98 | 99 | # Jupyter Notebook 100 | .ipynb_checkpoints 101 | 102 | # IPython 103 | profile_default/ 104 | ipython_config.py 105 | 106 | # pyenv 107 | .python-version 108 | 109 | # pipenv 110 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 111 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 112 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 113 | # install all needed dependencies. 114 | #Pipfile.lock 115 | 116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # Lightning logs 154 | **/lightning_logs 155 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.1.0 # Keep the version for general utility hooks 4 | hooks: 5 | - id: check-yaml 6 | - id: end-of-file-fixer 7 | exclude: examples/notebooks/ 8 | - id: trailing-whitespace 9 | - repo: local 10 | hooks: 11 | - id: black 12 | name: black 13 | entry: black 14 | language: system 15 | types: [python] 16 | - repo: local 17 | hooks: 18 | - id: flake8 19 | name: flake8 20 | entry: flake8 21 | language: system 22 | types: [python] 23 | files: ^pyrelational 24 | - repo: local 25 | hooks: 26 | - id: isort 27 | name: isort (python) 28 | entry: isort --profile=black 29 | language: system 30 | types: [python] 31 | - repo: local 32 | hooks: 33 | - id: mypy 34 | name: mypy 35 | entry: mypy 36 | language: system 37 | types: [python] 38 | exclude: | 39 | (?x)( 40 | ^tests/ | 41 | ^examples/ | 42 | ^docs/ 43 | ) 44 | args: [ 45 | --strict, 46 | --follow-imports=silent, 47 | --ignore-missing-imports, 48 | --allow-redefinition, 49 | --install-types, 50 | --non-interactive, 51 | --implicit-reexport, 52 | --allow-untyped-calls, 53 | --disable-error-code=no-redef, 54 | ] 55 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See 
https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-20.04 11 | tools: 12 | python: "3.10" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/source/conf.py 17 | 18 | # Install requirements 19 | python: 20 | install: 21 | - requirements: requirements/doc_requirements.txt 22 | -------------------------------------------------------------------------------- /benchmarking/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking overview 2 | 3 | The benchmark results for each dataset are located in `results.csv` within that dataset's subdirectory. These results are generated by a Python script in the same folder, currently named `run.py`. Each run script follows a similar pattern, containing a `trial` function which specifies how the PyRelationAL pipeline is constructed. This is done because we may want different model_managers, oracles, seeds, etc. to be run. The run script also defines the experiment names and the compute resources to be used for the trials. 4 | 5 | For each experiment, the DataManager is defined in a separate script, specifying the train, val, test splits along with the initial labelled and unlabelled indices in the queryable (train) pool. 6 | 7 | In current benchmarks, we leverage [Ray Tune's job scheduling](https://docs.ray.io/en/latest/tune/index.html) to efficiently distribute and manage the running of the jobs across available hardware. Each job creates an individual "trial", with its results and other logs sent to `results.csv`. After this, we can collate and analyze the trials at the trial or whole-experiment level by reading these results files in the `visualisation.ipynb` notebook. 8 | 9 | # TL;DR 10 | 11 | To re-run a benchmark experiment with the same selection of strategies as in the paper, run the following command from the project repository root: 12 | ```bash 13 | python -m benchmarking.<dataset>.run 14 | ``` 15 | 16 | - If adding a strategy to an existing benchmark: adjust `run.py`, then run it with `python -m benchmarking.<dataset>.run`. 17 | - If adding a new dataset: add a `<dataset>` folder under `benchmarking/`, then add the necessary scripts underneath following the examples in the repository. Most important in this case is `data_manager.py`, which will specify the train, validation, test, initial labelled, and initial unlabelled indices for the dataset. 18 | 19 | 20 | # Utilities 21 | We provide some generic benchmarking utils in `benchmarking_utils.py`, along with classification- and regression-specific utilities and model definitions in `classification_experiment_utils.py` and `regression_experiment_utils.py` respectively. 22 | 23 | - `benchmarking_utils.py`: Contains general utilities for processing the outputs of the Ray benchmarks. 24 | - `classification_experiment_utils.py`: Provides utilities and model definitions specific to classification tasks, including data preprocessing, model training, and evaluation functions. It also contains utilities for quickly calling classification specific AL strategies and parameter spaces for the experiments. It may be useful to add to these when trying new strategies. 25 | - `regression_experiment_utils.py`: Offers utilities and model definitions tailored for regression tasks, covering data handling, model training, and performance evaluation.
It also contains utilities for quickly calling regression specific AL strategies and parameter spaces for the experiments. It may be useful to add to these when trying new strategies. 26 | -------------------------------------------------------------------------------- /benchmarking/airfoil/data_manager.py: -------------------------------------------------------------------------------- 1 | """Benchmarking DataManager for the airfoil dataset 2 | """ 3 | 4 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 5 | 6 | import numpy as np 7 | import torch 8 | from numpy.typing import NDArray 9 | 10 | from pyrelational.data_managers import DataManager 11 | from pyrelational.datasets.regression.uci import UCIAirfoil 12 | 13 | 14 | def get_airfoil_data_manager() -> DataManager: 15 | ds = UCIAirfoil() 16 | train_ds, valid_ds, test_ds = torch.utils.data.random_split(ds, [1000, 100, 402]) 17 | train_indices = list(train_ds.indices) 18 | valid_indices = list(valid_ds.indices) 19 | test_indices = list(test_ds.indices) 20 | 21 | return DataManager( 22 | ds, 23 | train_indices=train_indices, 24 | validation_indices=valid_indices, 25 | test_indices=test_indices, 26 | labelled_indices=np.random.choice(train_indices, 1, replace=False).tolist(), 27 | loader_batch_size="full", 28 | loader_collate_fn=numpy_collate, 29 | ) 30 | 31 | 32 | def numpy_collate( 33 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 34 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 35 | """Collate function for a Pytorch to Numpy DataLoader""" 36 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 37 | -------------------------------------------------------------------------------- /benchmarking/airfoil/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.linear_model import ElasticNet 10 | from sklearn.metrics import auc 11 | 12 | from pyrelational.oracles import BenchmarkOracle 13 | from pyrelational.pipeline import Pipeline 14 | 15 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 16 | from ..regression_experiment_utils import ( 17 | GPR, 18 | EnsembleScikit, 19 | experiment_param_space, 20 | get_strategy_from_string, 21 | numpy_collate, 22 | ) 23 | from .data_manager import get_airfoil_data_manager 24 | 25 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 26 | 27 | 28 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[Any, np.float32, np.float64]]]]: 29 | seed = config["seed"] 30 | set_all_seeds(seed) 31 | strategy = get_strategy_from_string(config["strategy"]) 32 | data_manager = get_airfoil_data_manager() 33 | model_config: Dict[str, Any] = {} 34 | trainer_config: Dict[str, Any] = {} 35 | model_manager: GPR = GPR(model_config, trainer_config) 36 | oracle = BenchmarkOracle() 37 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 38 | 39 | # Annotating data step by step until the trainset is fully annotated 40 | pipeline.run(num_annotate=1) 41 | print(pipeline) 42 | 43 | iteration_metrics = [] 44 | for i in range(len(pipeline.performances)): 45 | if "test_metric" in pipeline.performances[i]: 46 | 
iteration_metrics.append(pipeline.performances[i]["test_metric"]) 47 | 48 | iteration_metrics = np.array(iteration_metrics) 49 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 50 | 51 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 52 | 53 | 54 | if __name__ == "__main__": 55 | EXPERIMENT_NAME = "results" 56 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 57 | 58 | trial = tune.with_resources(trial, {"cpu": 2}) 59 | tuner = tune.Tuner( 60 | trial, 61 | tune_config=tune.TuneConfig(num_samples=1), 62 | param_space=experiment_param_space, 63 | run_config=RunConfig( 64 | name=EXPERIMENT_NAME, 65 | storage_path=STORAGE_PATH, 66 | ), 67 | ) 68 | results_grid = tuner.fit() 69 | results_df = process_results_grid(results_grid=results_grid) 70 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 71 | -------------------------------------------------------------------------------- /benchmarking/breastcancer/data_manager.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | 3 | """Benchmarking DataManager for the Breast Cancer dataset 4 | """ 5 | 6 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 7 | 8 | import numpy as np 9 | import torch 10 | from numpy.typing import NDArray 11 | 12 | from pyrelational.data_managers import DataManager 13 | from pyrelational.datasets.classification.scikit_learn import BreastCancerDataset 14 | 15 | from ..classification_experiment_utils import ( 16 | make_class_stratified_train_val_test_split, 17 | pick_one_sample_per_class, 18 | ) 19 | 20 | 21 | def get_breastcancer_data_manager() -> DataManager: 22 | ds = BreastCancerDataset() 23 | train_indices, valid_indices, test_indices = make_class_stratified_train_val_test_split(ds, k=5) 24 | 25 | return DataManager( 26 | ds, 27 | train_indices=train_indices, 28 | validation_indices=valid_indices, 29 | test_indices=test_indices, 30 | labelled_indices=pick_one_sample_per_class(ds, train_indices), 31 | loader_batch_size="full", 32 | loader_collate_fn=numpy_collate, 33 | ) 34 | 35 | 36 | def numpy_collate( 37 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 38 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 39 | """Collate function for a Pytorch to Numpy DataLoader""" 40 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 41 | -------------------------------------------------------------------------------- /benchmarking/breastcancer/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.ensemble import RandomForestClassifier 10 | from sklearn.metrics import auc 11 | 12 | from pyrelational.oracles import BenchmarkOracle 13 | from pyrelational.pipeline import Pipeline 14 | 15 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 16 | from ..classification_experiment_utils import ( 17 | SKRFC, 18 | experiment_param_space, 19 | get_strategy_from_string, 20 | ) 21 | from .data_manager import get_breastcancer_data_manager 22 | 23 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 24 | 25 | 26 | def trial(config: Dict[str, Any]) -> Dict[str, 
Union[float, NDArray[Union[np.float32, np.float64]]]]: 27 | seed = config["seed"] 28 | set_all_seeds(seed) 29 | strategy = get_strategy_from_string(config["strategy"]) 30 | data_manager = get_breastcancer_data_manager() 31 | model_config = {"n_estimators": 10, "bootstrap": True} 32 | trainer_config: Dict[str, Any] = {} 33 | model_manager = SKRFC(RandomForestClassifier, model_config, trainer_config) 34 | oracle = BenchmarkOracle() 35 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 36 | 37 | # Annotating data step by step until the trainset is fully annotated 38 | pipeline.run(num_annotate=1) 39 | print(pipeline) 40 | 41 | iteration_metrics = [] 42 | for i in range(len(pipeline.performances)): 43 | if "test_metric" in pipeline.performances[i]: 44 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 45 | 46 | iteration_metrics = np.array(iteration_metrics) 47 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 48 | 49 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 50 | 51 | 52 | if __name__ == "__main__": 53 | EXPERIMENT_NAME = "results" 54 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 55 | 56 | trial = tune.with_resources(trial, {"cpu": 2}) 57 | tuner = tune.Tuner( 58 | trial, 59 | tune_config=tune.TuneConfig(num_samples=1), 60 | param_space=experiment_param_space, 61 | run_config=RunConfig( 62 | name=EXPERIMENT_NAME, 63 | storage_path=STORAGE_PATH, 64 | ), 65 | ) 66 | results_grid = tuner.fit() 67 | results_df = process_results_grid(results_grid=results_grid) 68 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 69 | -------------------------------------------------------------------------------- /benchmarking/checkerboard2x2/data_manager.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | 3 | """Benchmarking DataManager for the Checkerboard2x2 dataset 4 | """ 5 | import random 6 | import time 7 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 8 | 9 | import numpy as np 10 | import torch 11 | from numpy.typing import NDArray 12 | 13 | from pyrelational.data_managers import DataManager 14 | from pyrelational.datasets.classification.ksenia_et_al import Checkerboard2x2Dataset 15 | 16 | from ..classification_experiment_utils import ( 17 | make_class_stratified_train_val_test_split, 18 | pick_one_sample_per_class, 19 | ) 20 | 21 | 22 | def get_checkerboard2x2_data_manager() -> DataManager: 23 | # Add a random wait between 1 and 20 seconds to avoid race conditions 24 | # when creating the DataManager 25 | time.sleep(random.randint(1, 20)) 26 | ds = Checkerboard2x2Dataset() 27 | train_indices, valid_indices, test_indices = make_class_stratified_train_val_test_split(ds, k=5) 28 | 29 | return DataManager( 30 | ds, 31 | train_indices=train_indices, 32 | validation_indices=valid_indices, 33 | test_indices=test_indices, 34 | labelled_indices=pick_one_sample_per_class(ds, train_indices), 35 | loader_batch_size="full", 36 | loader_collate_fn=numpy_collate, 37 | ) 38 | 39 | 40 | def numpy_collate( 41 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 42 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 43 | """Collate function for a Pytorch to Numpy DataLoader""" 44 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)]
45 | -------------------------------------------------------------------------------- /benchmarking/checkerboard2x2/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.ensemble import RandomForestClassifier 10 | from sklearn.linear_model import LogisticRegression 11 | from sklearn.metrics import auc 12 | from sklearn.neighbors import KNeighborsClassifier 13 | from sklearn.neural_network import MLPClassifier 14 | from sklearn.svm import SVC 15 | 16 | from pyrelational.oracles import BenchmarkOracle 17 | from pyrelational.pipeline import Pipeline 18 | 19 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 20 | from ..classification_experiment_utils import ( 21 | SKRFC, 22 | LogisticRegressor, 23 | experiment_param_space, 24 | get_strategy_from_string, 25 | ) 26 | from .data_manager import get_checkerboard2x2_data_manager 27 | 28 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 29 | 30 | 31 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[np.float32, np.float64]]]]: 32 | seed = config["seed"] 33 | set_all_seeds(seed) 34 | strategy = get_strategy_from_string(config["strategy"]) 35 | data_manager = get_checkerboard2x2_data_manager() 36 | # Use the MLPClassifier defaults (no hyperparameters overridden) 37 | model_config: Dict[str, Any] = {} 38 | trainer_config: Dict[str, Any] = {} 39 | model_manager = LogisticRegressor(MLPClassifier, model_config, trainer_config) 40 | oracle = BenchmarkOracle() 41 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 42 | 43 | # Annotating data step by step until the trainset is fully annotated 44 | pipeline.run(num_annotate=1, num_iterations=150) 45 | print(pipeline) 46 | 47 | iteration_metrics = [] 48 | for i in range(len(pipeline.performances)): 49 | if "test_metric" in pipeline.performances[i]: 50 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 51 | 52 | iteration_metrics = np.array(iteration_metrics) 53 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 54 | 55 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 56 | 57 | 58 | if __name__ == "__main__": 59 | EXPERIMENT_NAME = "results" 60 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 61 | 62 | trial = tune.with_resources(trial, {"cpu": 4}) 63 | tuner = tune.Tuner( 64 | trial, 65 | tune_config=tune.TuneConfig(num_samples=1), 66 | param_space=experiment_param_space, 67 | run_config=RunConfig( 68 | name=EXPERIMENT_NAME, 69 | storage_path=STORAGE_PATH, 70 | ), 71 | ) 72 | results_grid = tuner.fit() 73 | results_df = process_results_grid(results_grid=results_grid) 74 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 75 | -------------------------------------------------------------------------------- /benchmarking/checkerboard4x4/data_manager.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | 3 | """Benchmarking DataManager for the Checkerboard4x4 dataset 4 | """ 5 | 6 | import random 7 | import time 8 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 9 | 10 | import numpy as np 11 | import torch 12 | from numpy.typing import NDArray
13 | 14 | from pyrelational.data_managers import DataManager 15 | from pyrelational.datasets.classification.ksenia_et_al import Checkerboard4x4Dataset 16 | 17 | from ..classification_experiment_utils import ( 18 | make_class_stratified_train_val_test_split, 19 | pick_one_sample_per_class, 20 | ) 21 | 22 | 23 | def get_checkerboard4x4_data_manager() -> DataManager: 24 | # Add a random wait between 1 and 20 seconds to avoid race conditions 25 | # when creating the DataManager 26 | time.sleep(random.randint(1, 20)) 27 | 28 | ds = Checkerboard4x4Dataset() 29 | train_indices, valid_indices, test_indices = make_class_stratified_train_val_test_split(ds, k=5) 30 | 31 | return DataManager( 32 | ds, 33 | train_indices=train_indices, 34 | validation_indices=valid_indices, 35 | test_indices=test_indices, 36 | # FIXME 37 | labelled_indices=pick_one_sample_per_class(ds, train_indices), 38 | loader_batch_size="full", 39 | loader_collate_fn=numpy_collate, 40 | ) 41 | 42 | 43 | def numpy_collate( 44 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 45 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 46 | """Collate function for a Pytorch to Numpy DataLoader""" 47 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 48 | -------------------------------------------------------------------------------- /benchmarking/concrete/data_manager.py: -------------------------------------------------------------------------------- 1 | """Benchmarking DataManager for the Concrete dataset 2 | """ 3 | 4 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 5 | 6 | import numpy as np 7 | import torch 8 | from numpy.typing import NDArray 9 | 10 | from pyrelational.data_managers import DataManager 11 | from pyrelational.datasets.regression.uci import UCIConcrete 12 | 13 | 14 | def get_concrete_data_manager() -> DataManager: 15 | ds = UCIConcrete() 16 | # UCIConcrete contains 1030 samples, split 800/30/200 below 17 | train_ds, valid_ds, test_ds = torch.utils.data.random_split(ds, [800, 30, 200]) 18 | train_indices = list(train_ds.indices) 19 | valid_indices = list(valid_ds.indices) 20 | test_indices = list(test_ds.indices) 21 | 22 | return DataManager( 23 | ds, 24 | train_indices=train_indices, 25 | validation_indices=valid_indices, 26 | test_indices=test_indices, 27 | labelled_indices=np.random.choice(train_indices, 1, replace=False).tolist(), 28 | loader_batch_size="full", 29 | loader_collate_fn=numpy_collate, 30 | ) 31 | 32 | 33 | def numpy_collate( 34 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 35 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 36 | """Collate function for a Pytorch to Numpy DataLoader""" 37 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 38 | -------------------------------------------------------------------------------- /benchmarking/concrete/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.linear_model import ElasticNet 10 | from sklearn.metrics import auc 11 | 12 | from pyrelational.oracles import BenchmarkOracle 13 | from pyrelational.pipeline import Pipeline 14 | 15 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds
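# Shared helpers: set_all_seeds fixes the RNG seeds for a trial, while process_results_grid and save_results_df collate the Ray Tune outputs into this dataset's results.csv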
16 | from ..regression_experiment_utils import ( 17 | EnsembleScikit, 18 | experiment_param_space, 19 | get_strategy_from_string, 20 | numpy_collate, 21 | ) 22 | from .data_manager import get_concrete_data_manager 23 | 24 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 25 | 26 | 27 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[Any, np.float32, np.float64]]]]: 28 | seed = config["seed"] 29 | set_all_seeds(seed) 30 | strategy = get_strategy_from_string(config["strategy"]) 31 | data_manager = get_concrete_data_manager() 32 | model_config: Dict[str, Any] = {"random_state": seed} 33 | trainer_config: Dict[str, Any] = {} 34 | model_manager: EnsembleScikit = EnsembleScikit(ElasticNet, 5, model_config, trainer_config) 35 | oracle = BenchmarkOracle() 36 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 37 | 38 | # Annotating data step by step until the trainset is fully annotated 39 | pipeline.run(num_annotate=1) 40 | print(pipeline) 41 | 42 | iteration_metrics = [] 43 | for i in range(len(pipeline.performances)): 44 | if "test_metric" in pipeline.performances[i]: 45 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 46 | 47 | iteration_metrics = np.array(iteration_metrics) 48 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 49 | 50 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 51 | 52 | 53 | if __name__ == "__main__": 54 | EXPERIMENT_NAME = "results" 55 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 56 | 57 | trial = tune.with_resources(trial, {"cpu": 2}) 58 | tuner = tune.Tuner( 59 | trial, 60 | tune_config=tune.TuneConfig(num_samples=1), 61 | param_space=experiment_param_space, 62 | run_config=RunConfig( 63 | name=EXPERIMENT_NAME, 64 | storage_path=STORAGE_PATH, 65 | ), 66 | ) 67 | results_grid = tuner.fit() 68 | results_df = process_results_grid(results_grid=results_grid) 69 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 70 | -------------------------------------------------------------------------------- /benchmarking/creditcardfraud/data_manager.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | 3 | """Benchmarking DataManager for the creditcardfraud dataset 4 | """ 5 | 6 | import random 7 | import time 8 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 9 | 10 | import numpy as np 11 | import torch 12 | from numpy.typing import NDArray 13 | 14 | from pyrelational.data_managers import DataManager 15 | from pyrelational.datasets.classification.andrea_et_al import CreditCardDataset 16 | 17 | from ..classification_experiment_utils import ( 18 | make_class_stratified_train_val_test_split, 19 | pick_one_sample_per_class, 20 | ) 21 | 22 | 23 | def get_creditcard_data_manager() -> DataManager: 24 | # Add a random wait between 1 and 50 seconds to avoid race conditions 25 | # when creating the DataManager 26 | time.sleep(random.randint(1, 50)) 27 | 28 | ds = CreditCardDataset() 29 | train_indices, valid_indices, test_indices = make_class_stratified_train_val_test_split(ds, k=5) 30 | 31 | return DataManager( 32 | ds, 33 | train_indices=train_indices, 34 | validation_indices=valid_indices, 35 | test_indices=test_indices, 36 | labelled_indices=pick_one_sample_per_class(ds, train_indices), 37 | loader_batch_size="full", 38 | loader_collate_fn=numpy_collate, 39 | ) 40 |
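# loader_batch_size="full" makes each DataLoader yield the whole split as one batch, which suits the scikit-learn style models used in these benchmarks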
41 | 42 | def numpy_collate( 43 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 44 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 45 | """Collate function for a Pytorch to Numpy DataLoader""" 46 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 47 | -------------------------------------------------------------------------------- /benchmarking/creditcardfraud/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.ensemble import RandomForestClassifier 10 | from sklearn.metrics import auc 11 | 12 | from pyrelational.oracles import BenchmarkOracle 13 | from pyrelational.pipeline import Pipeline 14 | 15 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 16 | from ..classification_experiment_utils import ( 17 | SKRFC, 18 | experiment_param_space, 19 | get_strategy_from_string, 20 | ) 21 | from .data_manager import get_creditcard_data_manager 22 | 23 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 24 | 25 | 26 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[np.float32, np.float64]]]]: 27 | seed = config["seed"] 28 | set_all_seeds(seed) 29 | strategy = get_strategy_from_string(config["strategy"]) 30 | data_manager = get_creditcard_data_manager() 31 | model_config = {"n_estimators": 10, "bootstrap": True} 32 | trainer_config: Dict[str, Any] = {} 33 | model_manager = SKRFC(RandomForestClassifier, model_config, trainer_config) 34 | oracle = BenchmarkOracle() 35 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 36 | 37 | # Annotating data step by step until the trainset is fully annotated 38 | pipeline.run(num_annotate=1, num_iterations=250) 39 | print(pipeline) 40 | 41 | iteration_metrics = [] 42 | for i in range(len(pipeline.performances)): 43 | if "test_metric" in pipeline.performances[i]: 44 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 45 | 46 | iteration_metrics = np.array(iteration_metrics) 47 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 48 | 49 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 50 | 51 | 52 | if __name__ == "__main__": 53 | EXPERIMENT_NAME = "results" 54 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 55 | 56 | trial = tune.with_resources(trial, {"cpu": 2}) 57 | tuner = tune.Tuner( 58 | trial, 59 | tune_config=tune.TuneConfig(num_samples=1), 60 | param_space=experiment_param_space, 61 | run_config=RunConfig( 62 | name=EXPERIMENT_NAME, 63 | storage_path=STORAGE_PATH, 64 | ), 65 | ) 66 | results_grid = tuner.fit() 67 | results_df = process_results_grid(results_grid=results_grid) 68 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 69 | -------------------------------------------------------------------------------- /benchmarking/diabetes/data_manager.py: -------------------------------------------------------------------------------- 1 | """Benchmarking DataManager for the diabetes dataset 2 | """ 3 | 4 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 5 | 6 | import numpy as np 7 | import torch 8 | from 
numpy.typing import NDArray 9 | 10 | from pyrelational.data_managers import DataManager 11 | from pyrelational.datasets.regression.scikit_learn import DiabetesDataset 12 | 13 | 14 | def get_diabetes_data_manager() -> DataManager: 15 | ds = DiabetesDataset() 16 | train_ds, valid_ds, test_ds = torch.utils.data.random_split(ds, [300, 42, 100]) 17 | train_indices = list(train_ds.indices) 18 | valid_indices = list(valid_ds.indices) 19 | test_indices = list(test_ds.indices) 20 | 21 | return DataManager( 22 | ds, 23 | train_indices=train_indices, 24 | validation_indices=valid_indices, 25 | test_indices=test_indices, 26 | labelled_indices=np.random.choice(train_indices, 30, replace=False).tolist(), 27 | loader_batch_size="full", 28 | loader_collate_fn=numpy_collate, 29 | ) 30 | 31 | 32 | def numpy_collate( 33 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 34 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 35 | """Collate function for a Pytorch to Numpy DataLoader""" 36 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 37 | -------------------------------------------------------------------------------- /benchmarking/diabetes/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.linear_model import ElasticNet 10 | from sklearn.metrics import auc 11 | from sklearn.neural_network import MLPRegressor 12 | 13 | from pyrelational.oracles import BenchmarkOracle 14 | from pyrelational.pipeline import Pipeline 15 | 16 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 17 | from ..regression_experiment_utils import ( 18 | EnsembleScikit, 19 | experiment_param_space, 20 | get_strategy_from_string, 21 | numpy_collate, 22 | ) 23 | from .data_manager import get_diabetes_data_manager 24 | 25 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 26 | 27 | 28 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[Any, np.float32, np.float64]]]]: 29 | seed = config["seed"] 30 | set_all_seeds(seed) 31 | strategy = get_strategy_from_string(config["strategy"]) 32 | data_manager = get_diabetes_data_manager() 33 | model_config: Dict[str, Any] = {"random_state": seed} 34 | trainer_config: Dict[str, Any] = {} 35 | model_manager: EnsembleScikit = EnsembleScikit(ElasticNet, 5, model_config, trainer_config) 36 | oracle = BenchmarkOracle() 37 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 38 | 39 | # Annotating data step by step until the trainset is fully annotated 40 | pipeline.run(num_annotate=1) 41 | print(pipeline) 42 | 43 | iteration_metrics = [] 44 | for i in range(len(pipeline.performances)): 45 | if "test_metric" in pipeline.performances[i]: 46 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 47 | 48 | iteration_metrics = np.array(iteration_metrics) 49 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 50 | 51 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 52 | 53 | 54 | if __name__ == "__main__": 55 | EXPERIMENT_NAME = "results" 56 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 57 | 58 | trial = tune.with_resources(trial, {"cpu": 
2}) 59 | tuner = tune.Tuner( 60 | trial, 61 | tune_config=tune.TuneConfig(num_samples=1), 62 | param_space=experiment_param_space, 63 | run_config=RunConfig( 64 | name=EXPERIMENT_NAME, 65 | storage_path=STORAGE_PATH, 66 | ), 67 | ) 68 | results_grid = tuner.fit() 69 | results_df = process_results_grid(results_grid=results_grid) 70 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 71 | -------------------------------------------------------------------------------- /benchmarking/digitdataset/data_manager.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | 3 | """Benchmarking DataManager for the Digit dataset in Sklearn 4 | """ 5 | import random 6 | import time 7 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 8 | 9 | import numpy as np 10 | import torch 11 | from numpy.typing import NDArray 12 | 13 | from pyrelational.data_managers import DataManager 14 | from pyrelational.datasets.classification.scikit_learn import DigitDataset 15 | 16 | from ..classification_experiment_utils import ( 17 | make_class_stratified_train_val_test_split, 18 | pick_one_sample_per_class, 19 | ) 20 | 21 | 22 | def get_digitdataset_data_manager() -> DataManager: 23 | # Add a random wait between 1 and 10 seconds to avoid race conditions 24 | # when creating the DataManager 25 | time.sleep(random.randint(1, 10)) 26 | ds = DigitDataset() 27 | train_indices, valid_indices, test_indices = make_class_stratified_train_val_test_split(ds, k=5) 28 | return DataManager( 29 | ds, 30 | train_indices=train_indices, 31 | validation_indices=valid_indices, 32 | test_indices=test_indices, 33 | labelled_indices=pick_one_sample_per_class(ds, train_indices), 34 | loader_batch_size="full", 35 | loader_collate_fn=numpy_collate, 36 | ) 37 | 38 | 39 | def numpy_collate( 40 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 41 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 42 | """Collate function for a Pytorch to Numpy DataLoader""" 43 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 44 | -------------------------------------------------------------------------------- /benchmarking/digitdataset/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.ensemble import RandomForestClassifier 10 | from sklearn.metrics import auc 11 | 12 | from pyrelational.oracles import BenchmarkOracle 13 | from pyrelational.pipeline import Pipeline 14 | 15 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 16 | from ..classification_experiment_utils import ( 17 | SKRFC, 18 | experiment_param_space, 19 | get_strategy_from_string, 20 | ) 21 | from .data_manager import get_digitdataset_data_manager 22 | 23 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 24 | 25 | 26 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[np.float32, np.float64]]]]: 27 | seed = config["seed"] 28 | set_all_seeds(seed) 29 | strategy = get_strategy_from_string(config["strategy"]) 30 | data_manager = get_digitdataset_data_manager() 31 | model_config = {"n_estimators": 10, "bootstrap": True} 32 | trainer_config: Dict[str, Any] = {} 33 | 
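# SKRFC wraps the scikit-learn estimator class and its config so the pipeline can re-instantiate and re-fit the model at each active learning iteration (see classification_experiment_utils.py)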
model_manager = SKRFC(RandomForestClassifier, model_config, trainer_config) 34 | oracle = BenchmarkOracle() 35 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 36 | 37 | # Annotating data step by step until the trainset is fully annotated 38 | pipeline.run(num_annotate=1) 39 | print(pipeline) 40 | 41 | iteration_metrics = [] 42 | for i in range(len(pipeline.performances)): 43 | if "test_metric" in pipeline.performances[i]: 44 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 45 | 46 | iteration_metrics = np.array(iteration_metrics) 47 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 48 | 49 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 50 | 51 | 52 | if __name__ == "__main__": 53 | EXPERIMENT_NAME = "results" 54 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 55 | 56 | trial = tune.with_resources(trial, {"cpu": 4}) 57 | tuner = tune.Tuner( 58 | trial, 59 | tune_config=tune.TuneConfig(num_samples=1), 60 | param_space=experiment_param_space, 61 | run_config=RunConfig( 62 | name=EXPERIMENT_NAME, 63 | storage_path=STORAGE_PATH, 64 | ), 65 | ) 66 | results_grid = tuner.fit() 67 | results_df = process_results_grid(results_grid=results_grid) 68 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 69 | -------------------------------------------------------------------------------- /benchmarking/energy/data_manager.py: -------------------------------------------------------------------------------- 1 | """Benchmarking DataManager for the energy dataset 2 | """ 3 | 4 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 5 | 6 | import numpy as np 7 | import torch 8 | from numpy.typing import NDArray 9 | 10 | from pyrelational.data_managers import DataManager 11 | from pyrelational.datasets.regression.uci import UCIEnergy 12 | 13 | 14 | def get_energy_data_manager() -> DataManager: 15 | ds = UCIEnergy() 16 | train_ds, valid_ds, test_ds = torch.utils.data.random_split(ds, [400, 100, 268]) 17 | train_indices = list(train_ds.indices) 18 | valid_indices = list(valid_ds.indices) 19 | test_indices = list(test_ds.indices) 20 | 21 | return DataManager( 22 | ds, 23 | train_indices=train_indices, 24 | validation_indices=valid_indices, 25 | test_indices=test_indices, 26 | labelled_indices=np.random.choice(train_indices, 1, replace=False).tolist(), 27 | loader_batch_size="full", 28 | loader_collate_fn=numpy_collate, 29 | ) 30 | 31 | 32 | def numpy_collate( 33 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 34 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 35 | """Collate function for a Pytorch to Numpy DataLoader""" 36 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 37 | -------------------------------------------------------------------------------- /benchmarking/energy/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.metrics import auc 10 | 11 | from pyrelational.oracles import BenchmarkOracle 12 | from pyrelational.pipeline import Pipeline 13 | 14 | from ..benchmarking_utils import process_results_grid, save_results_df, 
set_all_seeds 15 | from ..regression_experiment_utils import ( 16 | GPR, 17 | experiment_param_space, 18 | get_strategy_from_string, 19 | numpy_collate, 20 | ) 21 | from .data_manager import get_energy_data_manager 22 | 23 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 24 | 25 | 26 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[Any, np.float32, np.float64]]]]: 27 | seed = config["seed"] 28 | set_all_seeds(seed) 29 | strategy = get_strategy_from_string(config["strategy"]) 30 | data_manager = get_energy_data_manager() 31 | model_config: Dict[str, Any] = {} 32 | trainer_config: Dict[str, Any] = {} 33 | model_manager: GPR = GPR(model_config, trainer_config) 34 | oracle = BenchmarkOracle() 35 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 36 | 37 | # Annotating data step by step until the trainset is fully annotated 38 | pipeline.run(num_annotate=1) 39 | print(pipeline) 40 | 41 | iteration_metrics = [] 42 | for i in range(len(pipeline.performances)): 43 | if "test_metric" in pipeline.performances[i]: 44 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 45 | 46 | iteration_metrics = np.array(iteration_metrics) 47 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 48 | 49 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 50 | 51 | 52 | if __name__ == "__main__": 53 | EXPERIMENT_NAME = "results" 54 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 55 | 56 | trial = tune.with_resources(trial, {"cpu": 2}) 57 | tuner = tune.Tuner( 58 | trial, 59 | tune_config=tune.TuneConfig(num_samples=1), 60 | param_space=experiment_param_space, 61 | run_config=RunConfig( 62 | name=EXPERIMENT_NAME, 63 | storage_path=STORAGE_PATH, 64 | ), 65 | ) 66 | results_grid = tuner.fit() 67 | results_df = process_results_grid(results_grid=results_grid) 68 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 69 | -------------------------------------------------------------------------------- /benchmarking/fashionmnist/README.md: -------------------------------------------------------------------------------- 1 | The benchmark script may need to be run twice: on the first run, Ray's parallel trials may race to download the dataset at the same time, which can raise errors; once the dataset is cached locally, subsequent runs complete without issue.
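One way to avoid the race altogether is to warm the dataset cache in the driver process before launching the Ray trials. A minimal sketch (it only assumes, as the data manager in this folder does, that instantiating the dataset class triggers the download):

```python
# Download and cache FashionMNIST once in the driver process; Ray trials
# started afterwards will then find the files already on disk.
from pyrelational.datasets.classification.fashion_mnist import FashionMNIST

FashionMNIST()  # first instantiation downloads, later ones reuse the cache
```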
2 | -------------------------------------------------------------------------------- /benchmarking/fashionmnist/data_manager.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | 3 | """Benchmarking DataManager for the Zalando FashionMNIST dataset 4 | """ 5 | import random 6 | import time 7 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 8 | 9 | import numpy as np 10 | import torch 11 | from numpy.typing import NDArray 12 | 13 | from pyrelational.data_managers import DataManager 14 | from pyrelational.datasets.classification.fashion_mnist import FashionMNIST 15 | 16 | from ..classification_experiment_utils import ( 17 | make_class_stratified_train_val_test_split, 18 | pick_one_sample_per_class, 19 | ) 20 | 21 | 22 | def get_fashionMnist_data_manager() -> DataManager: 23 | # Add a random wait between 1 and 10 seconds to avoid race conditions 24 | # when creating the DataManager 25 | time.sleep(random.randint(1, 10)) 26 | ds = FashionMNIST() 27 | train_indices, valid_indices, test_indices = make_class_stratified_train_val_test_split(ds, k=5) 28 | return DataManager( 29 | ds, 30 | train_indices=train_indices, 31 | validation_indices=valid_indices, 32 | test_indices=test_indices, 33 | labelled_indices=pick_one_sample_per_class(ds, train_indices), 34 | loader_batch_size="full", 35 | loader_collate_fn=numpy_collate, 36 | ) 37 | 38 | 39 | def numpy_collate( 40 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 41 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 42 | """Collate function for a Pytorch to Numpy DataLoader""" 43 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 44 | -------------------------------------------------------------------------------- /benchmarking/fashionmnist/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.ensemble import RandomForestClassifier 10 | from sklearn.metrics import auc 11 | 12 | from pyrelational.oracles import BenchmarkOracle 13 | from pyrelational.pipeline import Pipeline 14 | 15 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 16 | from ..classification_experiment_utils import ( 17 | SKRFC, 18 | experiment_param_space, 19 | get_strategy_from_string, 20 | ) 21 | from .data_manager import get_fashionMnist_data_manager 22 | 23 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 24 | 25 | 26 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[np.float32, np.float64]]]]: 27 | seed = config["seed"] 28 | set_all_seeds(seed) 29 | strategy = get_strategy_from_string(config["strategy"]) 30 | data_manager = get_fashionMnist_data_manager() 31 | model_config = {"n_estimators": 10, "bootstrap": True} 32 | trainer_config: Dict[str, Any] = {} 33 | model_manager = SKRFC(RandomForestClassifier, model_config, trainer_config) 34 | oracle = BenchmarkOracle() 35 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 36 | 37 | # Annotating data step by step until the trainset is fully annotated 38 | pipeline.run(num_annotate=10, num_iterations=250) 39 | print(pipeline) 40 | 41 | iteration_metrics = [] 42 | for i in 
range(len(pipeline.performances)): 43 | if "test_metric" in pipeline.performances[i]: 44 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 45 | 46 | iteration_metrics = np.array(iteration_metrics) 47 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 48 | 49 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 50 | 51 | 52 | if __name__ == "__main__": 53 | EXPERIMENT_NAME = "results" 54 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 55 | 56 | trial = tune.with_resources(trial, {"cpu": 4}) 57 | tuner = tune.Tuner( 58 | trial, 59 | tune_config=tune.TuneConfig(num_samples=1), 60 | param_space=experiment_param_space, 61 | run_config=RunConfig( 62 | name=EXPERIMENT_NAME, 63 | storage_path=STORAGE_PATH, 64 | ), 65 | ) 66 | results_grid = tuner.fit() 67 | results_df = process_results_grid(results_grid=results_grid) 68 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 69 | -------------------------------------------------------------------------------- /benchmarking/gaussianclouds/data_manager.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | 3 | """Benchmarking DataManager for the GaussianClouds dataset 4 | """ 5 | import random 6 | import time 7 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 8 | 9 | import numpy as np 10 | import torch 11 | from numpy.typing import NDArray 12 | 13 | from pyrelational.data_managers import DataManager 14 | from pyrelational.datasets.classification.ksenia_et_al import GaussianCloudsDataset 15 | 16 | from ..classification_experiment_utils import ( 17 | make_class_stratified_train_val_test_split, 18 | pick_one_sample_per_class, 19 | ) 20 | 21 | 22 | def get_gaussianclouds_data_manager() -> DataManager: 23 | # Add a random wait between 1 and 10 seconds to avoid race conditions 24 | # when creating the DataManager 25 | time.sleep(random.randint(1, 10)) 26 | ds = GaussianCloudsDataset() 27 | train_indices, valid_indices, test_indices = make_class_stratified_train_val_test_split(ds, k=5) 28 | 29 | return DataManager( 30 | ds, 31 | train_indices=train_indices, 32 | validation_indices=valid_indices, 33 | test_indices=test_indices, 34 | # FIXME 35 | labelled_indices=pick_one_sample_per_class(ds, train_indices), 36 | loader_batch_size="full", 37 | loader_collate_fn=numpy_collate, 38 | ) 39 | 40 | 41 | def numpy_collate( 42 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 43 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 44 | """Collate function for a Pytorch to Numpy DataLoader""" 45 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 46 | -------------------------------------------------------------------------------- /benchmarking/gaussianclouds/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.ensemble import RandomForestClassifier 10 | from sklearn.metrics import auc 11 | 12 | from pyrelational.oracles import BenchmarkOracle 13 | from pyrelational.pipeline import Pipeline 14 | 15 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 16 | from 
..classification_experiment_utils import ( 17 | SKRFC, 18 | experiment_param_space, 19 | get_strategy_from_string, 20 | ) 21 | from .data_manager import get_gaussianclouds_data_manager 22 | 23 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 24 | 25 | 26 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[np.float32, np.float64]]]]: 27 | seed = config["seed"] 28 | set_all_seeds(seed) 29 | strategy = get_strategy_from_string(config["strategy"]) 30 | data_manager = get_gaussianclouds_data_manager() 31 | model_config = {"n_estimators": 10, "bootstrap": True} 32 | trainer_config: Dict[str, Any] = {} 33 | model_manager = SKRFC(RandomForestClassifier, model_config, trainer_config) 34 | oracle = BenchmarkOracle() 35 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 36 | 37 | # Annotating data step by step until the trainset is fully annotated 38 | pipeline.run(num_annotate=1, num_iterations=250) 39 | print(pipeline) 40 | 41 | iteration_metrics = [] 42 | for i in range(len(pipeline.performances)): 43 | if "test_metric" in pipeline.performances[i]: 44 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 45 | 46 | iteration_metrics = np.array(iteration_metrics) 47 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 48 | 49 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 50 | 51 | 52 | if __name__ == "__main__": 53 | EXPERIMENT_NAME = "results" 54 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 55 | 56 | trial = tune.with_resources(trial, {"cpu": 4}) 57 | tuner = tune.Tuner( 58 | trial, 59 | tune_config=tune.TuneConfig(num_samples=1), 60 | param_space=experiment_param_space, 61 | run_config=RunConfig( 62 | name=EXPERIMENT_NAME, 63 | storage_path=STORAGE_PATH, 64 | ), 65 | ) 66 | results_grid = tuner.fit() 67 | results_df = process_results_grid(results_grid=results_grid) 68 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 69 | -------------------------------------------------------------------------------- /benchmarking/glass/data_manager.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | 3 | """Benchmarking DataManager for the glass dataset 4 | """ 5 | import random 6 | import time 7 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 8 | 9 | import numpy as np 10 | import torch 11 | from numpy.typing import NDArray 12 | 13 | from pyrelational.data_managers import DataManager 14 | from pyrelational.datasets.classification.uci import UCIGlass 15 | 16 | from ..classification_experiment_utils import ( 17 | make_class_stratified_train_val_test_split, 18 | pick_one_sample_per_class, 19 | ) 20 | 21 | 22 | def get_glass_data_manager() -> DataManager: 23 | # Add a random wait between 1 and 10 seconds to avoid race conditions 24 | # when creating the DataManager 25 | time.sleep(random.randint(1, 10)) 26 | ds = UCIGlass() 27 | train_indices, valid_indices, test_indices = make_class_stratified_train_val_test_split(ds, k=5) 28 | return DataManager( 29 | ds, 30 | train_indices=train_indices, 31 | validation_indices=valid_indices, 32 | test_indices=test_indices, 33 | labelled_indices=pick_one_sample_per_class(ds, train_indices), 34 | loader_batch_size="full", 35 | loader_collate_fn=numpy_collate, 36 | ) 37 | 38 | 39 | def numpy_collate( 40 | batch: List[Union[torch.Tensor, NDArray[Union[Any, 
np.float32, np.float64]]]] 41 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 42 | """Collate function for a Pytorch to Numpy DataLoader""" 43 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 44 | -------------------------------------------------------------------------------- /benchmarking/glass/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.ensemble import RandomForestClassifier 10 | from sklearn.metrics import auc 11 | 12 | from pyrelational.oracles import BenchmarkOracle 13 | from pyrelational.pipeline import Pipeline 14 | 15 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 16 | from ..classification_experiment_utils import ( 17 | SKRFC, 18 | experiment_param_space, 19 | get_strategy_from_string, 20 | ) 21 | from .data_manager import get_glass_data_manager 22 | 23 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 24 | 25 | 26 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[np.float32, np.float64]]]]: 27 | seed = config["seed"] 28 | set_all_seeds(seed) 29 | strategy = get_strategy_from_string(config["strategy"]) 30 | data_manager = get_glass_data_manager() 31 | model_config = {"n_estimators": 10, "bootstrap": True} 32 | trainer_config: Dict[str, Any] = {} 33 | model_manager = SKRFC(RandomForestClassifier, model_config, trainer_config) 34 | oracle = BenchmarkOracle() 35 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 36 | 37 | # Annotating data step by step until the trainset is fully annotated 38 | pipeline.run(num_annotate=1) 39 | print(pipeline) 40 | 41 | iteration_metrics = [] 42 | for i in range(len(pipeline.performances)): 43 | if "test_metric" in pipeline.performances[i]: 44 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 45 | 46 | iteration_metrics = np.array(iteration_metrics) 47 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 48 | 49 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 50 | 51 | 52 | if __name__ == "__main__": 53 | EXPERIMENT_NAME = "results" 54 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 55 | 56 | trial = tune.with_resources(trial, {"cpu": 4}) 57 | tuner = tune.Tuner( 58 | trial, 59 | tune_config=tune.TuneConfig(num_samples=1), 60 | param_space=experiment_param_space, 61 | run_config=RunConfig( 62 | name=EXPERIMENT_NAME, 63 | storage_path=STORAGE_PATH, 64 | ), 65 | ) 66 | results_grid = tuner.fit() 67 | results_df = process_results_grid(results_grid=results_grid) 68 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 69 | -------------------------------------------------------------------------------- /benchmarking/mnist/data_manager.py: -------------------------------------------------------------------------------- 1 | """Data manager for MNIST dataset. 2 | 3 | We follow the setup in the BatchBald paper: https://arxiv.org/abs/1906.08158. 
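Example (an illustrative sketch of how the helper defined below is typically called; ``percentage_val`` and ``labelled_size`` are the parameters documented in its signature, nothing here is invented beyond the call itself):

    >>> dm = get_mnist_datamanager(percentage_val=0.1, labelled_size=20)  # doctest: +SKIP

With 20 class-stratified initial labels, the rest of the training split forms the unlabelled pool, and a stratified 10% of the training indices is held out for validation.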
4 | """ 5 | 6 | from sklearn.model_selection import train_test_split 7 | 8 | from pyrelational.data_managers import DataManager 9 | from pyrelational.datasets.classification import MNIST 10 | 11 | 12 | def get_mnist_datamanager( 13 | percentage_val: float = 0.1, 14 | labelled_size: int = 20, 15 | random_state: int = 42, 16 | data_dir: str = "/tmp/", 17 | ) -> DataManager: 18 | """Instantiate data manager for MNIST dataset. 19 | 20 | :param percentage_val: size in percentage of the validation split, defaults to 0.1 21 | :param labelled_size: number of initial labelled sample, defaults to 20 22 | :param random_state: random seed, defaults to 42 23 | :param data_dir: directory where to download the data, defaults to "/tmp/" 24 | :return: MNIST pyrelational data manager. 25 | """ 26 | dataset = MNIST(data_dir=data_dir) 27 | train_ixs, test_ixs = dataset.data_splits[0] 28 | 29 | unlabelled_ixs, val_ixs = train_test_split( 30 | train_ixs, 31 | test_size=percentage_val, 32 | random_state=random_state, 33 | stratify=dataset.y[train_ixs], 34 | ) 35 | 36 | labelled_ixs, unlabelled_ixs = train_test_split( 37 | unlabelled_ixs, 38 | train_size=labelled_size, 39 | random_state=random_state, 40 | stratify=dataset.y[unlabelled_ixs], 41 | ) 42 | 43 | data_manager = DataManager( 44 | dataset=dataset, 45 | labelled_indices=labelled_ixs.tolist(), 46 | unlabelled_indices=unlabelled_ixs.tolist(), 47 | validation_indices=val_ixs.tolist(), 48 | test_indices=test_ixs.tolist(), 49 | ) 50 | return data_manager 51 | -------------------------------------------------------------------------------- /benchmarking/parkinsons/data_manager.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | 3 | """Benchmarking DataManager for the Parkinsons dataset 4 | """ 5 | import random 6 | import time 7 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 8 | 9 | import numpy as np 10 | import torch 11 | from numpy.typing import NDArray 12 | 13 | from pyrelational.data_managers import DataManager 14 | from pyrelational.datasets.classification.uci import UCIParkinsons 15 | 16 | from ..classification_experiment_utils import ( 17 | make_class_stratified_train_val_test_split, 18 | pick_one_sample_per_class, 19 | ) 20 | 21 | 22 | def get_parkinsons_data_manager() -> DataManager: 23 | # Add a random wait between 1 and 10 seconds to avoid race conditions 24 | # when creating the DataManager 25 | time.sleep(random.randint(1, 10)) 26 | ds = UCIParkinsons() 27 | train_indices, valid_indices, test_indices = make_class_stratified_train_val_test_split(ds, k=5) 28 | return DataManager( 29 | ds, 30 | train_indices=train_indices, 31 | validation_indices=valid_indices, 32 | test_indices=test_indices, 33 | labelled_indices=pick_one_sample_per_class(ds, train_indices), 34 | loader_batch_size="full", 35 | loader_collate_fn=numpy_collate, 36 | ) 37 | 38 | 39 | def numpy_collate( 40 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 41 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 42 | """Collate function for a Pytorch to Numpy DataLoader""" 43 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 44 | -------------------------------------------------------------------------------- /benchmarking/parkinsons/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy 
as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.ensemble import RandomForestClassifier 10 | from sklearn.metrics import auc 11 | 12 | from pyrelational.oracles import BenchmarkOracle 13 | from pyrelational.pipeline import Pipeline 14 | 15 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 16 | from ..classification_experiment_utils import ( 17 | SKRFC, 18 | experiment_param_space, 19 | get_strategy_from_string, 20 | ) 21 | from .data_manager import get_parkinsons_data_manager 22 | 23 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 24 | 25 | 26 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[np.float32, np.float64]]]]: 27 | seed = config["seed"] 28 | set_all_seeds(seed) 29 | strategy = get_strategy_from_string(config["strategy"]) 30 | data_manager = get_parkinsons_data_manager() 31 | model_config = {"n_estimators": 10, "bootstrap": True} 32 | trainer_config: Dict[str, Any] = {} 33 | model_manager = SKRFC(RandomForestClassifier, model_config, trainer_config) 34 | oracle = BenchmarkOracle() 35 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 36 | 37 | # Annotating data step by step until the trainset is fully annotated 38 | pipeline.run(num_annotate=1) 39 | print(pipeline) 40 | 41 | iteration_metrics = [] 42 | for i in range(len(pipeline.performances)): 43 | if "test_metric" in pipeline.performances[i]: 44 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 45 | 46 | iteration_metrics = np.array(iteration_metrics) 47 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 48 | 49 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 50 | 51 | 52 | if __name__ == "__main__": 53 | EXPERIMENT_NAME = "results" 54 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 55 | 56 | trial = tune.with_resources(trial, {"cpu": 4}) 57 | tuner = tune.Tuner( 58 | trial, 59 | tune_config=tune.TuneConfig(num_samples=1), 60 | param_space=experiment_param_space, 61 | run_config=RunConfig( 62 | name=EXPERIMENT_NAME, 63 | storage_path=STORAGE_PATH, 64 | ), 65 | ) 66 | results_grid = tuner.fit() 67 | results_df = process_results_grid(results_grid=results_grid) 68 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 69 | -------------------------------------------------------------------------------- /benchmarking/power/data_manager.py: -------------------------------------------------------------------------------- 1 | """Benchmarking DataManager for the Power dataset 2 | """ 3 | 4 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 5 | 6 | import numpy as np 7 | import torch 8 | from numpy.typing import NDArray 9 | 10 | from pyrelational.data_managers import DataManager 11 | from pyrelational.datasets.regression.uci import UCIPower 12 | 13 | 14 | def get_power_data_manager() -> DataManager: 15 | ds = UCIPower() 16 | print(len(ds)) 17 | train_ds, valid_ds, test_ds = torch.utils.data.random_split(ds, [7900, 100, 1568]) 18 | train_indices = list(train_ds.indices) 19 | valid_indices = list(valid_ds.indices) 20 | test_indices = list(test_ds.indices) 21 | 22 | return DataManager( 23 | ds, 24 | train_indices=train_indices, 25 | validation_indices=valid_indices, 26 | test_indices=test_indices, 27 | labelled_indices=np.random.choice(train_indices, 1, 
replace=False).tolist(), 28 | loader_batch_size="full", 29 | loader_collate_fn=numpy_collate, 30 | ) 31 | 32 | 33 | def numpy_collate( 34 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 35 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 36 | """Collate function for a Pytorch to Numpy DataLoader""" 37 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 38 | -------------------------------------------------------------------------------- /benchmarking/power/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | import random 4 | import time 5 | from typing import Any, Dict, Union 6 | 7 | import numpy as np 8 | from numpy.typing import NDArray 9 | from ray import tune 10 | from ray.train import RunConfig 11 | from sklearn.linear_model import ElasticNet 12 | from sklearn.metrics import auc 13 | 14 | from pyrelational.oracles import BenchmarkOracle 15 | from pyrelational.pipeline import Pipeline 16 | 17 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 18 | from ..regression_experiment_utils import ( 19 | EnsembleScikit, 20 | experiment_param_space, 21 | get_strategy_from_string, 22 | numpy_collate, 23 | ) 24 | from .data_manager import get_power_data_manager 25 | 26 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 27 | 28 | 29 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[Any, np.float32, np.float64]]]]: 30 | time.sleep(random.uniform(40, 120)) 31 | seed = config["seed"] 32 | set_all_seeds(seed) 33 | strategy = get_strategy_from_string(config["strategy"]) 34 | data_manager = get_power_data_manager() 35 | model_config: Dict[str, Any] = {"random_state": seed} 36 | trainer_config: Dict[str, Any] = {} 37 | model_manager: EnsembleScikit = EnsembleScikit(ElasticNet, 5, model_config, trainer_config) 38 | oracle = BenchmarkOracle() 39 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 40 | 41 | # Annotating data step by step until the trainset is fully annotated 42 | pipeline.run(num_annotate=1, num_iterations=500) 43 | print(pipeline) 44 | 45 | iteration_metrics = [] 46 | for i in range(len(pipeline.performances)): 47 | if "test_metric" in pipeline.performances[i]: 48 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 49 | 50 | iteration_metrics = np.array(iteration_metrics) 51 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 52 | 53 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 54 | 55 | 56 | if __name__ == "__main__": 57 | EXPERIMENT_NAME = "results" 58 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 59 | 60 | trial = tune.with_resources(trial, {"cpu": 2}) 61 | tuner = tune.Tuner( 62 | trial, 63 | tune_config=tune.TuneConfig(num_samples=1), 64 | param_space=experiment_param_space, 65 | run_config=RunConfig( 66 | name=EXPERIMENT_NAME, 67 | storage_path=STORAGE_PATH, 68 | ), 69 | ) 70 | results_grid = tuner.fit() 71 | results_df = process_results_grid(results_grid=results_grid) 72 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 73 | -------------------------------------------------------------------------------- /benchmarking/seeds/data_manager.py: -------------------------------------------------------------------------------- 1 | # 
type: ignore 2 | 3 | """Benchmarking DataManager for the seeds dataset 4 | """ 5 | 6 | import random 7 | import time 8 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 9 | 10 | import numpy as np 11 | import torch 12 | from numpy.typing import NDArray 13 | 14 | from pyrelational.data_managers import DataManager 15 | from pyrelational.datasets.classification.uci import UCISeeds 16 | 17 | from ..classification_experiment_utils import ( 18 | make_class_stratified_train_val_test_split, 19 | pick_one_sample_per_class, 20 | ) 21 | 22 | 23 | def get_seeds_data_manager() -> DataManager: 24 | # Add a random wait between 1 and 10 seconds to avoid race conditions 25 | # when creating the DataManager 26 | time.sleep(random.randint(1, 10)) 27 | ds = UCISeeds() 28 | train_indices, valid_indices, test_indices = make_class_stratified_train_val_test_split(ds, k=5) 29 | return DataManager( 30 | ds, 31 | train_indices=train_indices, 32 | validation_indices=valid_indices, 33 | test_indices=test_indices, 34 | labelled_indices=pick_one_sample_per_class(ds, train_indices), 35 | loader_batch_size="full", 36 | loader_collate_fn=numpy_collate, 37 | ) 38 | 39 | 40 | def numpy_collate( 41 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 42 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 43 | """Collate function for a Pytorch to Numpy DataLoader""" 44 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 45 | -------------------------------------------------------------------------------- /benchmarking/seeds/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.ensemble import RandomForestClassifier 10 | from sklearn.metrics import auc 11 | 12 | from pyrelational.oracles import BenchmarkOracle 13 | from pyrelational.pipeline import Pipeline 14 | 15 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 16 | from ..classification_experiment_utils import ( 17 | SKRFC, 18 | experiment_param_space, 19 | get_strategy_from_string, 20 | ) 21 | from .data_manager import get_seeds_data_manager 22 | 23 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 24 | 25 | 26 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[np.float32, np.float64]]]]: 27 | seed = config["seed"] 28 | set_all_seeds(seed) 29 | strategy = get_strategy_from_string(config["strategy"]) 30 | data_manager = get_seeds_data_manager() 31 | model_config = {"n_estimators": 10, "bootstrap": True} 32 | trainer_config: Dict[str, Any] = {} 33 | model_manager = SKRFC(RandomForestClassifier, model_config, trainer_config) 34 | oracle = BenchmarkOracle() 35 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 36 | 37 | # Annotating data step by step until the trainset is fully annotated 38 | pipeline.run(num_annotate=1) 39 | print(pipeline) 40 | 41 | iteration_metrics = [] 42 | for i in range(len(pipeline.performances)): 43 | if "test_metric" in pipeline.performances[i]: 44 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 45 | 46 | iteration_metrics = np.array(iteration_metrics) 47 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 48 | 
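    # The scalar "score" returned below is the area under the learning curve of
    # test metrics across active-learning iterations, computed by sklearn's
    # trapezoidal-rule auc(). For an accuracy-style metric, a larger area means
    # the strategy reached strong performance in fewer annotation steps; e.g.
    # auc([0, 1, 2], [0.5, 0.7, 0.8]) == (0.5 + 0.7) / 2 + (0.7 + 0.8) / 2 == 1.35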
49 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 50 | 51 | 52 | if __name__ == "__main__": 53 | EXPERIMENT_NAME = "results" 54 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 55 | 56 | trial = tune.with_resources(trial, {"cpu": 2}) 57 | tuner = tune.Tuner( 58 | trial, 59 | tune_config=tune.TuneConfig(num_samples=1), 60 | param_space=experiment_param_space, 61 | run_config=RunConfig( 62 | name=EXPERIMENT_NAME, 63 | storage_path=STORAGE_PATH, 64 | ), 65 | ) 66 | results_grid = tuner.fit() 67 | results_df = process_results_grid(results_grid=results_grid) 68 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 69 | -------------------------------------------------------------------------------- /benchmarking/striatum/data_manager.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | 3 | """Benchmarking DataManager for the Striatum dataset 4 | """ 5 | 6 | import random 7 | import time 8 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 9 | 10 | import numpy as np 11 | import torch 12 | from numpy.typing import NDArray 13 | 14 | from pyrelational.data_managers import DataManager 15 | from pyrelational.datasets.classification.ksenia_et_al import StriatumDataset 16 | 17 | from ..classification_experiment_utils import pick_one_sample_per_class 18 | 19 | 20 | def get_stratium_data_manager() -> DataManager: 21 | # Add a random wait between 15 and 50 seconds to avoid race conditions 22 | # when creating the DataManager 23 | time.sleep(random.randint(15, 50)) 24 | 25 | ds = StriatumDataset() 26 | 27 | train_ds, valid_ds, test_ds = torch.utils.data.random_split(ds, [9900, 100, 10000]) 28 | train_indices = list(train_ds.indices) 29 | valid_indices = list(valid_ds.indices) 30 | test_indices = list(test_ds.indices) 31 | 32 | return DataManager( 33 | ds, 34 | train_indices=train_indices, 35 | validation_indices=valid_indices, 36 | test_indices=test_indices, 37 | # FIXME 38 | labelled_indices=pick_one_sample_per_class(ds, train_indices), 39 | loader_batch_size="full", 40 | loader_collate_fn=numpy_collate, 41 | ) 42 | 43 | 44 | def numpy_collate( 45 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 46 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 47 | """Collate function for a Pytorch to Numpy DataLoader""" 48 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 49 | -------------------------------------------------------------------------------- /benchmarking/striatum/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.ensemble import RandomForestClassifier 10 | from sklearn.metrics import auc 11 | 12 | from pyrelational.oracles import BenchmarkOracle 13 | from pyrelational.pipeline import Pipeline 14 | 15 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 16 | from ..classification_experiment_utils import ( 17 | SKRFC, 18 | experiment_param_space, 19 | get_strategy_from_string, 20 | ) 21 | from .data_manager import get_stratium_data_manager 22 | 23 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 24 | 25 | 26 | def trial(config: Dict[str, 
Any]) -> Dict[str, Union[float, NDArray[Union[np.float32, np.float64]]]]: 27 | seed = config["seed"] 28 | set_all_seeds(seed) 29 | strategy = get_strategy_from_string(config["strategy"]) 30 | data_manager = get_stratium_data_manager() 31 | model_config = {"n_estimators": 10, "bootstrap": True} 32 | trainer_config: Dict[str, Any] = {} 33 | model_manager = SKRFC(RandomForestClassifier, model_config, trainer_config) 34 | oracle = BenchmarkOracle() 35 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 36 | 37 | # Annotating data step by step until the trainset is fully annotated 38 | pipeline.run(num_annotate=10, num_iterations=250) 39 | print(pipeline) 40 | 41 | iteration_metrics = [] 42 | for i in range(len(pipeline.performances)): 43 | if "test_metric" in pipeline.performances[i]: 44 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 45 | 46 | iteration_metrics = np.array(iteration_metrics) 47 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 48 | 49 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 50 | 51 | 52 | if __name__ == "__main__": 53 | EXPERIMENT_NAME = "results" 54 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 55 | 56 | trial = tune.with_resources(trial, {"cpu": 4}) 57 | tuner = tune.Tuner( 58 | trial, 59 | tune_config=tune.TuneConfig(num_samples=1), 60 | param_space=experiment_param_space, 61 | run_config=RunConfig( 62 | name=EXPERIMENT_NAME, 63 | storage_path=STORAGE_PATH, 64 | ), 65 | ) 66 | results_grid = tuner.fit() 67 | results_df = process_results_grid(results_grid=results_grid) 68 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 69 | -------------------------------------------------------------------------------- /benchmarking/synthclass1/data_manager.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | 3 | """Benchmarking DataManager for the SynthClass1 dataset 4 | """ 5 | import random 6 | import time 7 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 8 | 9 | import numpy as np 10 | import torch 11 | from numpy.typing import NDArray 12 | 13 | from pyrelational.data_managers import DataManager 14 | from pyrelational.datasets.classification.synthetic import SynthClass1 15 | 16 | from ..classification_experiment_utils import ( 17 | make_class_stratified_train_val_test_split, 18 | pick_one_sample_per_class, 19 | ) 20 | 21 | 22 | def get_synthclass1_data_manager(seed: int) -> DataManager: 23 | # Add a random wait between 1 and 10 seconds to avoid race conditions 24 | # when creating the DataManager 25 | time.sleep(random.randint(1, 10)) 26 | ds = SynthClass1(random_seed=seed) 27 | train_indices, valid_indices, test_indices = make_class_stratified_train_val_test_split(ds, k=5) 28 | 29 | return DataManager( 30 | ds, 31 | train_indices=train_indices, 32 | validation_indices=valid_indices, 33 | test_indices=test_indices, 34 | # FIXME 35 | labelled_indices=pick_one_sample_per_class(ds, train_indices), 36 | loader_batch_size="full", 37 | loader_collate_fn=numpy_collate, 38 | ) 39 | 40 | 41 | def numpy_collate( 42 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 43 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 44 | """Collate function for a Pytorch to Numpy DataLoader""" 45 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in 
samples]) for samples in zip(*batch)] 46 | -------------------------------------------------------------------------------- /benchmarking/synthclass1/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.ensemble import RandomForestClassifier 10 | from sklearn.linear_model import LogisticRegression 11 | from sklearn.metrics import auc 12 | from sklearn.neural_network import MLPClassifier 13 | 14 | from pyrelational.oracles import BenchmarkOracle 15 | from pyrelational.pipeline import Pipeline 16 | 17 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 18 | from ..classification_experiment_utils import ( 19 | SKRFC, 20 | LogisticRegressor, 21 | experiment_param_space, 22 | get_strategy_from_string, 23 | ) 24 | from .data_manager import get_synthclass1_data_manager 25 | 26 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 27 | 28 | 29 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[np.float32, np.float64]]]]: 30 | seed = config["seed"] 31 | set_all_seeds(seed) 32 | strategy = get_strategy_from_string(config["strategy"]) 33 | data_manager = get_synthclass1_data_manager(seed=seed) 34 | # model_config = {"n_estimators": 3, "bootstrap": True, "max_depth": 3}  # unused random-forest config, superseded by the MLPClassifier config below 35 | model_config = {"random_state": seed, "hidden_layer_sizes": (128, 64), "early_stopping": True} 36 | trainer_config: Dict[str, Any] = {} 37 | model_manager = LogisticRegressor(MLPClassifier, model_config, trainer_config) 38 | oracle = BenchmarkOracle() 39 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 40 | 41 | # Annotating data step by step until the trainset is fully annotated 42 | pipeline.run(num_annotate=1) 43 | print(pipeline) 44 | 45 | iteration_metrics = [] 46 | for i in range(len(pipeline.performances)): 47 | if "test_metric" in pipeline.performances[i]: 48 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 49 | 50 | iteration_metrics = np.array(iteration_metrics) 51 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 52 | 53 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 54 | 55 | 56 | if __name__ == "__main__": 57 | EXPERIMENT_NAME = "results" 58 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 59 | 60 | trial = tune.with_resources(trial, {"cpu": 4}) 61 | tuner = tune.Tuner( 62 | trial, 63 | tune_config=tune.TuneConfig(num_samples=1), 64 | param_space=experiment_param_space, 65 | run_config=RunConfig( 66 | name=EXPERIMENT_NAME, 67 | storage_path=STORAGE_PATH, 68 | ), 69 | ) 70 | results_grid = tuner.fit() 71 | results_df = process_results_grid(results_grid=results_grid) 72 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 73 | -------------------------------------------------------------------------------- /benchmarking/synthclass2/data_manager.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | 3 | """Benchmarking DataManager for the SynthClass2 dataset 4 | """ 5 | import random 6 | import time 7 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 8 | 9 | import numpy as np 10 | import torch 11 | from numpy.typing import 
NDArray 12 | 13 | from pyrelational.data_managers import DataManager 14 | from pyrelational.datasets.classification.synthetic import SynthClass2 15 | 16 | from ..classification_experiment_utils import ( 17 | make_class_stratified_train_val_test_split, 18 | pick_one_sample_per_class, 19 | ) 20 | 21 | 22 | def get_synthclass2_data_manager(seed: int) -> DataManager: 23 | # Add a random wait between 1 and 10 seconds to avoid race conditions 24 | # when creating the DataManager 25 | time.sleep(random.randint(1, 10)) 26 | ds = SynthClass2(random_seed=seed) 27 | train_indices, valid_indices, test_indices = make_class_stratified_train_val_test_split(ds, k=5) 28 | 29 | return DataManager( 30 | ds, 31 | train_indices=train_indices, 32 | validation_indices=valid_indices, 33 | test_indices=test_indices, 34 | # FIXME 35 | labelled_indices=pick_one_sample_per_class(ds, train_indices), 36 | loader_batch_size="full", 37 | loader_collate_fn=numpy_collate, 38 | ) 39 | 40 | 41 | def numpy_collate( 42 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 43 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 44 | """Collate function for a Pytorch to Numpy DataLoader""" 45 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 46 | -------------------------------------------------------------------------------- /benchmarking/synthclass2/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.ensemble import RandomForestClassifier 10 | from sklearn.linear_model import LogisticRegression 11 | from sklearn.metrics import auc 12 | from sklearn.neural_network import MLPClassifier 13 | 14 | from pyrelational.oracles import BenchmarkOracle 15 | from pyrelational.pipeline import Pipeline 16 | 17 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 18 | from ..classification_experiment_utils import ( 19 | SKRFC, 20 | LogisticRegressor, 21 | experiment_param_space, 22 | get_strategy_from_string, 23 | ) 24 | from .data_manager import get_synthclass2_data_manager 25 | 26 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 27 | 28 | 29 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[np.float32, np.float64]]]]: 30 | seed = config["seed"] 31 | set_all_seeds(seed) 32 | strategy = get_strategy_from_string(config["strategy"]) 33 | data_manager = get_synthclass2_data_manager(seed=seed) 34 | # model_config = {"n_estimators": 3, "bootstrap": True, "max_depth": 3}  # unused random-forest config, superseded by the MLPClassifier config below 35 | model_config = {"random_state": seed, "hidden_layer_sizes": (128, 64), "early_stopping": True} 36 | trainer_config: Dict[str, Any] = {} 37 | model_manager = LogisticRegressor(MLPClassifier, model_config, trainer_config) 38 | oracle = BenchmarkOracle() 39 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 40 | 41 | # Annotating data step by step until the trainset is fully annotated 42 | pipeline.run(num_annotate=1) 43 | print(pipeline) 44 | 45 | iteration_metrics = [] 46 | for i in range(len(pipeline.performances)): 47 | if "test_metric" in pipeline.performances[i]: 48 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 49 | 50 | iteration_metrics = np.array(iteration_metrics) 51 | 
score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 52 | 53 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 54 | 55 | 56 | if __name__ == "__main__": 57 | EXPERIMENT_NAME = "results" 58 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 59 | 60 | trial = tune.with_resources(trial, {"cpu": 4}) 61 | tuner = tune.Tuner( 62 | trial, 63 | tune_config=tune.TuneConfig(num_samples=1), 64 | param_space=experiment_param_space, 65 | run_config=RunConfig( 66 | name=EXPERIMENT_NAME, 67 | storage_path=STORAGE_PATH, 68 | ), 69 | ) 70 | results_grid = tuner.fit() 71 | results_df = process_results_grid(results_grid=results_grid) 72 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 73 | -------------------------------------------------------------------------------- /benchmarking/synthclass3/data_manager.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | 3 | """Benchmarking DataManager for the SynthClass3 dataset 4 | """ 5 | import random 6 | import time 7 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 8 | 9 | import numpy as np 10 | import torch 11 | from numpy.typing import NDArray 12 | 13 | from pyrelational.data_managers import DataManager 14 | from pyrelational.datasets.classification.synthetic import SynthClass3 15 | 16 | from ..classification_experiment_utils import ( 17 | make_class_stratified_train_val_test_split, 18 | pick_one_sample_per_class, 19 | ) 20 | 21 | 22 | def get_synthclass3_data_manager(seed: int) -> DataManager: 23 | ds = SynthClass3(random_seed=seed) 24 | train_indices, valid_indices, test_indices = make_class_stratified_train_val_test_split(ds, k=5) 25 | 26 | return DataManager( 27 | ds, 28 | train_indices=train_indices, 29 | validation_indices=valid_indices, 30 | test_indices=test_indices, 31 | # FIXME 32 | labelled_indices=pick_one_sample_per_class(ds, train_indices), 33 | loader_batch_size="full", 34 | loader_collate_fn=numpy_collate, 35 | ) 36 | 37 | 38 | def numpy_collate( 39 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 40 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 41 | """Collate function for a Pytorch to Numpy DataLoader""" 42 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 43 | -------------------------------------------------------------------------------- /benchmarking/synthreg1/data_manager.py: -------------------------------------------------------------------------------- 1 | """Benchmarking DataManager for the SynthReg1 dataset 2 | """ 3 | 4 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 5 | 6 | import numpy as np 7 | import torch 8 | from numpy.typing import NDArray 9 | 10 | from pyrelational.data_managers import DataManager 11 | from pyrelational.datasets.regression.synthetic import SynthReg1 12 | 13 | 14 | def get_synthreg1_data_manager() -> DataManager: 15 | ds = SynthReg1() 16 | train_ds, valid_ds, test_ds = torch.utils.data.random_split(ds, [400, 50, 550]) 17 | train_indices = list(train_ds.indices) 18 | valid_indices = list(valid_ds.indices) 19 | test_indices = list(test_ds.indices) 20 | 21 | return DataManager( 22 | ds, 23 | train_indices=train_indices, 24 | validation_indices=valid_indices, 25 | test_indices=test_indices, 26 | labelled_indices=np.random.choice(train_indices, 1, replace=False).tolist(), 27 | 
loader_batch_size="full", 28 | loader_collate_fn=numpy_collate, 29 | ) 30 | 31 | 32 | def numpy_collate( 33 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 34 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 35 | """Collate function for a Pytorch to Numpy DataLoader""" 36 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 37 | -------------------------------------------------------------------------------- /benchmarking/synthreg1/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.linear_model import ElasticNet 10 | from sklearn.metrics import auc 11 | from sklearn.neural_network import MLPRegressor 12 | 13 | from pyrelational.oracles import BenchmarkOracle 14 | from pyrelational.pipeline import Pipeline 15 | 16 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 17 | from ..regression_experiment_utils import ( 18 | EnsembleScikit, 19 | experiment_param_space, 20 | get_strategy_from_string, 21 | numpy_collate, 22 | ) 23 | from .data_manager import get_synthreg1_data_manager 24 | 25 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 26 | 27 | 28 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[Any, np.float32, np.float64]]]]: 29 | seed = config["seed"] 30 | set_all_seeds(seed) 31 | strategy = get_strategy_from_string(config["strategy"]) 32 | data_manager = get_synthreg1_data_manager() 33 | model_config: Dict[str, Any] = { 34 | "random_state": seed, 35 | "max_iter": 1000, 36 | "hidden_layer_sizes": (32, 8, 4), 37 | "learning_rate_init": 3e-4, 38 | } 39 | trainer_config: Dict[str, Any] = {} 40 | model_manager: EnsembleScikit = EnsembleScikit(MLPRegressor, 10, model_config, trainer_config) 41 | oracle = BenchmarkOracle() 42 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 43 | 44 | # Annotating data step by step until the trainset is fully annotated 45 | pipeline.run(num_annotate=1, num_iterations=200) 46 | print(pipeline) 47 | 48 | iteration_metrics = [] 49 | for i in range(len(pipeline.performances)): 50 | if "test_metric" in pipeline.performances[i]: 51 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 52 | 53 | iteration_metrics = np.array(iteration_metrics) 54 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 55 | 56 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 57 | 58 | 59 | if __name__ == "__main__": 60 | EXPERIMENT_NAME = "results" 61 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 62 | 63 | trial = tune.with_resources(trial, {"cpu": 2}) 64 | tuner = tune.Tuner( 65 | trial, 66 | tune_config=tune.TuneConfig(num_samples=1), 67 | param_space=experiment_param_space, 68 | run_config=RunConfig( 69 | name=EXPERIMENT_NAME, 70 | storage_path=STORAGE_PATH, 71 | ), 72 | ) 73 | results_grid = tuner.fit() 74 | results_df = process_results_grid(results_grid=results_grid) 75 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 76 | -------------------------------------------------------------------------------- /benchmarking/synthreg2/data_manager.py: 
-------------------------------------------------------------------------------- 1 | """Benchmarking DataManager for the SynthReg2 dataset 2 | """ 3 | 4 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 5 | 6 | import numpy as np 7 | import torch 8 | from numpy.typing import NDArray 9 | 10 | from pyrelational.data_managers import DataManager 11 | from pyrelational.datasets.regression.synthetic import SynthReg2 12 | 13 | 14 | def get_synthreg2_data_manager() -> DataManager: 15 | ds = SynthReg2() 16 | train_ds, valid_ds, test_ds = torch.utils.data.random_split(ds, [400, 50, 550]) 17 | train_indices = list(train_ds.indices) 18 | valid_indices = list(valid_ds.indices) 19 | test_indices = list(test_ds.indices) 20 | 21 | return DataManager( 22 | ds, 23 | train_indices=train_indices, 24 | validation_indices=valid_indices, 25 | test_indices=test_indices, 26 | labelled_indices=np.random.choice(train_indices, 1, replace=False).tolist(), 27 | loader_batch_size="full", 28 | loader_collate_fn=numpy_collate, 29 | ) 30 | 31 | 32 | def numpy_collate( 33 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 34 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 35 | """Collate function for a Pytorch to Numpy DataLoader""" 36 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 37 | -------------------------------------------------------------------------------- /benchmarking/synthreg2/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.linear_model import ElasticNet 10 | from sklearn.metrics import auc 11 | from sklearn.neural_network import MLPRegressor 12 | 13 | from pyrelational.oracles import BenchmarkOracle 14 | from pyrelational.pipeline import Pipeline 15 | 16 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 17 | from ..regression_experiment_utils import ( 18 | EnsembleScikit, 19 | experiment_param_space, 20 | get_strategy_from_string, 21 | numpy_collate, 22 | ) 23 | from .data_manager import get_synthreg2_data_manager 24 | 25 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 26 | 27 | 28 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[Any, np.float32, np.float64]]]]: 29 | seed = config["seed"] 30 | set_all_seeds(seed) 31 | strategy = get_strategy_from_string(config["strategy"]) 32 | data_manager = get_synthreg2_data_manager() 33 | model_config: Dict[str, Any] = { 34 | "random_state": seed, 35 | "max_iter": 1000, 36 | "hidden_layer_sizes": (32, 8, 4), 37 | "learning_rate_init": 3e-4, 38 | } 39 | trainer_config: Dict[str, Any] = {} 40 | model_manager: EnsembleScikit = EnsembleScikit(MLPRegressor, 10, model_config, trainer_config) 41 | oracle = BenchmarkOracle() 42 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 43 | 44 | # Annotating data step by step until the trainset is fully annotated 45 | pipeline.run(num_annotate=1, num_iterations=200) 46 | print(pipeline) 47 | 48 | iteration_metrics = [] 49 | for i in range(len(pipeline.performances)): 50 | if "test_metric" in pipeline.performances[i]: 51 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 52 | 53 | iteration_metrics = 
np.array(iteration_metrics)
54 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics)
55 |
56 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics}
57 |
58 |
59 | if __name__ == "__main__":
60 | EXPERIMENT_NAME = "results"
61 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results")
62 |
63 | trial = tune.with_resources(trial, {"cpu": 2})
64 | tuner = tune.Tuner(
65 | trial,
66 | tune_config=tune.TuneConfig(num_samples=1),
67 | param_space=experiment_param_space,
68 | run_config=RunConfig(
69 | name=EXPERIMENT_NAME,
70 | storage_path=STORAGE_PATH,
71 | ),
72 | )
73 | results_grid = tuner.fit()
74 | results_df = process_results_grid(results_grid=results_grid)
75 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME)
76 | -------------------------------------------------------------------------------- /benchmarking/wine/data_manager.py: -------------------------------------------------------------------------------- 1 | """Benchmarking DataManager for the wine dataset
2 | """
3 |
4 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union
5 |
6 | import numpy as np
7 | import torch
8 | from numpy.typing import NDArray
9 |
10 | from pyrelational.data_managers import DataManager
11 | from pyrelational.datasets.regression.uci import UCIWine
12 |
13 |
14 | def get_wine_data_manager() -> DataManager:
15 | ds = UCIWine()
16 |
17 | train_ds, valid_ds, test_ds = torch.utils.data.random_split(ds, [1000, 100, 498])
18 | train_indices = list(train_ds.indices)
19 | valid_indices = list(valid_ds.indices)
20 | test_indices = list(test_ds.indices)
21 |
22 | return DataManager(
23 | ds,
24 | train_indices=train_indices,
25 | validation_indices=valid_indices,
26 | test_indices=test_indices,
27 | labelled_indices=np.random.choice(train_indices, 1, replace=False).tolist(),
28 | loader_batch_size="full",
29 | loader_collate_fn=numpy_collate,
30 | )
31 |
32 |
33 | def numpy_collate(
34 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]]
35 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]:
36 | """Collate function for a Pytorch to Numpy DataLoader"""
37 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)]
38 | -------------------------------------------------------------------------------- /benchmarking/wine/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore
2 | import os
3 | from typing import Any, Dict, Union
4 |
5 | import numpy as np
6 | from numpy.typing import NDArray
7 | from ray import tune
8 | from ray.train import RunConfig
9 | from sklearn.linear_model import ElasticNet
10 | from sklearn.metrics import auc
11 |
12 | from pyrelational.oracles import BenchmarkOracle
13 | from pyrelational.pipeline import Pipeline
14 |
15 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds
16 | from ..regression_experiment_utils import (
17 | EnsembleScikit,
18 | experiment_param_space,
19 | get_strategy_from_string,
20 | numpy_collate,
21 | )
22 | from .data_manager import get_wine_data_manager
23 |
24 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
25 |
26 |
27 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[Any, np.float32, np.float64]]]]:
28 | seed = config["seed"]
29 | set_all_seeds(seed)
30 | strategy = get_strategy_from_string(config["strategy"])
31 |
data_manager = get_wine_data_manager() 32 | model_config: Dict[str, Any] = {"random_state": seed} 33 | trainer_config: Dict[str, Any] = {} 34 | model_manager: EnsembleScikit = EnsembleScikit(ElasticNet, 5, model_config, trainer_config) 35 | oracle = BenchmarkOracle() 36 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 37 | 38 | # Annotating data step by step until the trainset is fully annotated 39 | pipeline.run(num_annotate=1, num_iterations=200) 40 | print(pipeline) 41 | 42 | iteration_metrics = [] 43 | for i in range(len(pipeline.performances)): 44 | if "test_metric" in pipeline.performances[i]: 45 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 46 | 47 | iteration_metrics = np.array(iteration_metrics) 48 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 49 | 50 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 51 | 52 | 53 | if __name__ == "__main__": 54 | EXPERIMENT_NAME = "results" 55 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 56 | 57 | trial = tune.with_resources(trial, {"cpu": 2}) 58 | tuner = tune.Tuner( 59 | trial, 60 | tune_config=tune.TuneConfig(num_samples=1), 61 | param_space=experiment_param_space, 62 | run_config=RunConfig( 63 | name=EXPERIMENT_NAME, 64 | storage_path=STORAGE_PATH, 65 | ), 66 | ) 67 | results_grid = tuner.fit() 68 | results_df = process_results_grid(results_grid=results_grid) 69 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 70 | -------------------------------------------------------------------------------- /benchmarking/yacht/data_manager.py: -------------------------------------------------------------------------------- 1 | """Benchmarking DataManager for the Yacht dataset 2 | """ 3 | 4 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 5 | 6 | import numpy as np 7 | import torch 8 | from numpy.typing import NDArray 9 | 10 | from pyrelational.data_managers import DataManager 11 | from pyrelational.datasets.regression.uci import UCIYacht 12 | 13 | 14 | def get_yacht_data_manager() -> DataManager: 15 | ds = UCIYacht() 16 | train_ds, valid_ds, test_ds = torch.utils.data.random_split(ds, [180, 20, 106]) 17 | train_indices = list(train_ds.indices) 18 | valid_indices = list(valid_ds.indices) 19 | test_indices = list(test_ds.indices) 20 | 21 | return DataManager( 22 | ds, 23 | train_indices=train_indices, 24 | validation_indices=valid_indices, 25 | test_indices=test_indices, 26 | labelled_indices=np.random.choice(train_indices, 20, replace=False).tolist(), 27 | loader_batch_size="full", 28 | loader_collate_fn=numpy_collate, 29 | ) 30 | 31 | 32 | def numpy_collate( 33 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 34 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 35 | """Collate function for a Pytorch to Numpy DataLoader""" 36 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 37 | -------------------------------------------------------------------------------- /benchmarking/yacht/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.linear_model import ElasticNet 10 
| from sklearn.metrics import auc 11 | from sklearn.neural_network import MLPRegressor 12 | 13 | from pyrelational.oracles import BenchmarkOracle 14 | from pyrelational.pipeline import Pipeline 15 | 16 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 17 | from ..regression_experiment_utils import ( 18 | EnsembleScikit, 19 | experiment_param_space, 20 | get_strategy_from_string, 21 | numpy_collate, 22 | ) 23 | from .data_manager import get_yacht_data_manager 24 | 25 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 26 | 27 | 28 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[Any, np.float32, np.float64]]]]: 29 | seed = config["seed"] 30 | set_all_seeds(seed) 31 | strategy = get_strategy_from_string(config["strategy"]) 32 | data_manager = get_yacht_data_manager() 33 | model_config: Dict[str, Any] = { 34 | "random_state": seed, 35 | "max_iter": 1000, 36 | "hidden_layer_sizes": (32, 8, 4), 37 | "early_stopping": True, 38 | "learning_rate_init": 3e-4, 39 | } 40 | trainer_config: Dict[str, Any] = {} 41 | model_manager: EnsembleScikit = EnsembleScikit(MLPRegressor, 10, model_config, trainer_config) 42 | oracle = BenchmarkOracle() 43 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 44 | 45 | # Annotating data step by step until the trainset is fully annotated 46 | pipeline.run(num_annotate=1) 47 | print(pipeline) 48 | 49 | iteration_metrics = [] 50 | for i in range(len(pipeline.performances)): 51 | if "test_metric" in pipeline.performances[i]: 52 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 53 | 54 | iteration_metrics = np.array(iteration_metrics) 55 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 56 | 57 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 58 | 59 | 60 | if __name__ == "__main__": 61 | EXPERIMENT_NAME = "results" 62 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 63 | 64 | trial = tune.with_resources(trial, {"cpu": 2}) 65 | tuner = tune.Tuner( 66 | trial, 67 | tune_config=tune.TuneConfig(num_samples=1), 68 | param_space=experiment_param_space, 69 | run_config=RunConfig( 70 | name=EXPERIMENT_NAME, 71 | storage_path=STORAGE_PATH, 72 | ), 73 | ) 74 | results_grid = tuner.fit() 75 | results_df = process_results_grid(results_grid=results_grid) 76 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 77 | -------------------------------------------------------------------------------- /default_configs/pyl_trainer_base_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "epochs": 100, 3 | "period_eval": 1, 4 | "checkpoints_dir": "experiment_logs/", 5 | "checkpoints_name": "run", 6 | "monitor_metric_name": "loss", 7 | "monitor_metric_mode": "min", 8 | "use_early_stopping": false, 9 | "patience": 100, 10 | "save_top_k": 1 11 | } 12 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 
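# For example, "make html" renders the documentation into build/html, and
# extra sphinx-build flags can be passed through SPHINXOPTS, e.g.
# make html SPHINXOPTS="-W" to turn warnings into errors.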
6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SPHINXPROJ = PyRelationAL 9 | SOURCEDIR = source 10 | BUILDDIR = build 11 | 12 | # Put it first so that "make" without argument is like "make help". 13 | help: 14 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 15 | 16 | .PHONY: help Makefile 17 | 18 | # Catch-all target: route all unknown targets to Sphinx using the new 19 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 20 | %: Makefile 21 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 22 | -------------------------------------------------------------------------------- /docs/images/pyrelational_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RelationRx/pyrelational/fb83cff64241344396617d3b7712f69c852133b4/docs/images/pyrelational_overview.png -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/_static/data_indices_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RelationRx/pyrelational/fb83cff64241344396617d3b7712f69c852133b4/docs/source/_static/data_indices_diagram.png -------------------------------------------------------------------------------- /docs/source/_static/theme.css: -------------------------------------------------------------------------------- 1 | .wy-nav-content { 2 | min-width: 100% !important; 3 | } 4 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | import datetime 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
14 | #
15 | import os
16 | import sys
17 |
18 | import sphinx_rtd_theme
19 |
20 | sys.path.insert(0, os.path.abspath("../.."))
21 | import pyrelational # noqa: E402
22 |
23 | # -- Project information -----------------------------------------------------
24 |
25 | project = "PyRelationAL"
26 | author = "Relation Therapeutics"
27 | copyright = f"{datetime.datetime.now().year}, {author}"
28 |
29 | # The full version, including alpha/beta/rc tags
30 | release = pyrelational.__version__
31 |
32 |
33 | # -- General configuration ---------------------------------------------------
34 |
35 | # Add any Sphinx extension module names here, as strings. They can be
36 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
37 | # ones.
38 | extensions = [
39 | "sphinx.ext.autodoc",
40 | "sphinx.ext.napoleon",
41 | "sphinx.ext.mathjax",
42 | "sphinx.ext.viewcode",
43 | ]
44 |
45 | source_suffix = ".rst"
46 | master_doc = "index"
47 | autoclass_content = "both"
48 | add_module_names = False
49 |
50 | # Add any paths that contain templates here, relative to this directory.
51 | templates_path = ["_templates"]
52 |
53 | # List of patterns, relative to source directory, that match files and
54 | # directories to ignore when looking for source files.
55 | # This pattern also affects html_static_path and html_extra_path.
56 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
57 |
58 |
59 | # -- Options for HTML output -------------------------------------------------
60 |
61 | # The theme to use for HTML and HTML Help pages. See the documentation for
62 | # a list of builtin themes.
63 | #
64 | html_theme = "sphinx_rtd_theme"
65 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
66 | html_css_files = ["theme.css"]
67 |
68 |
69 | # Add any paths that contain custom static files (such as style sheets) here,
70 | # relative to this directory. They are copied after the builtin static files,
71 | # so a file named "default.css" will overwrite the builtin "default.css".
72 | html_static_path = ["_static"]
73 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. PyRelationAL documentation master file, created by
2 | sphinx-quickstart on Thu Jun 17 15:33:16 2021.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | :github_url: https://github.com/RelationRx/pyrelational
7 |
8 | Welcome to PyRelationAL's documentation!
9 | ========================================
10 |
11 | **PyRelationAL** is a Python active learning library developed by `Relation Therapeutics `_ for
12 | rapidly implementing active learning pipelines, from data management and model development (including Bayesian approximation) to creating novel active learning strategies.
13 |
14 | .. toctree::
15 | :maxdepth: 1
16 | :caption: Notes
17 |
18 | notes/activelearning
19 | notes/installation
20 | notes/quick_start
21 | notes/using_your_own_data
22 | notes/using_the_model_api
23 | notes/using_your_own_strategy
24 | notes/benchmark_datasets
25 |
26 | ..
toctree::
27 | :glob:
28 | :maxdepth: 2
29 | :caption: Package modules
30 |
31 | reference/data.rst
32 | reference/datasets.rst
33 | reference/models.rst
34 | reference/informativeness.rst
35 | reference/strategies.rst
36 | reference/oracles.rst
37 | reference/pipeline.rst
38 |
39 | Indices and tables
40 | ==================
41 |
42 | * :ref:`genindex`
43 | * :ref:`modindex`
44 | * :ref:`search`
45 |
46 |
47 | If the library is useful for your work, please consider citing **PyRelationAL**.
48 |
49 | .. code-block:: latex
50 |
51 | @misc{pyrelational,
52 | title={PyRelationAL},
53 | author={Relation Therapeutics},
54 | year={2021},
55 | publisher = {GitHub},
56 | journal = {GitHub repository},
57 | howpublished = {\url{https://github.com/RelationRx/pyrelational}}
58 | }
59 | -------------------------------------------------------------------------------- /docs/source/notes/al_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RelationRx/pyrelational/fb83cff64241344396617d3b7712f69c852133b4/docs/source/notes/al_pipeline.png -------------------------------------------------------------------------------- /docs/source/notes/eval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RelationRx/pyrelational/fb83cff64241344396617d3b7712f69c852133b4/docs/source/notes/eval.png -------------------------------------------------------------------------------- /docs/source/notes/installation.rst: -------------------------------------------------------------------------------- 1 | Installation
2 | ============
3 |
4 | We recommend installing our library inside a dedicated
5 | environment rather than as a root user, using for example
6 | `Anaconda `__.
7 | To use the library, **you will need Python 3.8 or newer**.
8 |
9 | Installation via Pip Wheels
10 | ---------------------------
11 |
12 | You can install the PyRelationAL library directly using pip:
13 |
14 | ::
15 |
16 | pip install pyrelational
17 |
18 | Installation from Source
19 | ------------------------
20 |
21 | Alternatively, you can install PyRelationAL directly from source:
22 |
23 | 1. Install the required packages (quote the specifiers so your shell does not interpret ``>=`` as a redirect):
24 |
25 | ::
26 |
27 | pip install "numpy>=1.20"
28 | pip install "pandas>=1.3"
29 | pip install "pytorch-lightning>=1.5"
30 | pip install "torch>=1.9.0"
31 | pip install "scikit-learn>=1.0.2"
32 |
33 | 2.
Install the additional packages needed to run our examples:
34 |
35 | ::
36 |
37 | pip install "torchvision>=0.10.0"
38 | pip install "gpytorch>=1.4"
39 | -------------------------------------------------------------------------------- /docs/source/notes/performance_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RelationRx/pyrelational/fb83cff64241344396617d3b7712f69c852133b4/docs/source/notes/performance_comparison.png -------------------------------------------------------------------------------- /docs/source/notes/training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RelationRx/pyrelational/fb83cff64241344396617d3b7712f69c852133b4/docs/source/notes/training.png -------------------------------------------------------------------------------- /docs/source/reference/data.rst: -------------------------------------------------------------------------------- 1 | pyrelational.data_managers
2 | ===========================
3 |
4 | Data Manager
5 | --------------------------------------
6 |
7 | .. automodule:: pyrelational.data_managers.data_manager
8 | :members:
9 | :undoc-members:
10 | :show-inheritance:
11 | -------------------------------------------------------------------------------- /docs/source/reference/datasets.rst: -------------------------------------------------------------------------------- 1 | pyrelational.datasets
2 | =====================
3 |
4 |
5 | Classification datasets
6 | -----------------------
7 |
8 | The following classes provide a variety of classic classification datasets that have been used in various active learning papers. Each behaves like a standard PyTorch Dataset.
9 |
10 | .. automodule:: pyrelational.datasets.classification
11 | :members:
12 | :undoc-members:
13 | :show-inheritance:
14 |
15 | Regression datasets
16 | -----------------------
17 |
18 | The following classes provide a variety of classic regression datasets that have been used in various active learning papers. Each behaves like a standard PyTorch Dataset.
19 |
20 | .. automodule:: pyrelational.datasets.regression
21 | :members:
22 | :undoc-members:
23 | :show-inheritance:
24 |
25 |
26 | Benchmark DataManager
27 | ---------------------
28 |
29 | The following functions accept the datasets defined in this package to produce DataManagers containing labelling initialisations that correspond to cold- and warm-start active learning tasks. These can be used for quickly benchmarking strategies.
30 |
31 | .. automodule:: pyrelational.datasets.benchmark_datamanager
32 | :members:
33 | :undoc-members:
34 | :show-inheritance:
35 | -------------------------------------------------------------------------------- /docs/source/reference/informativeness.rst: -------------------------------------------------------------------------------- 1 | pyrelational.informativeness
2 | ============================
3 |
4 | Informativeness functions for regression tasks
5 | ----------------------------------------------
6 |
7 | .. automodule:: pyrelational.informativeness.regression_scorers
8 | :members:
9 | :undoc-members:
10 | :show-inheritance:
11 |
12 | Informativeness functions for classification tasks
13 | --------------------------------------------------
14 |
15 | ..
automodule:: pyrelational.informativeness.classification_scorers
16 | :members:
17 | :undoc-members:
18 | :show-inheritance:
19 |
20 | Task agnostic informativeness functions
21 | ----------------------------------------
22 |
23 | .. automodule:: pyrelational.informativeness.task_agnostic_scorers
24 | :members:
25 | :undoc-members:
26 | :show-inheritance:
27 | -------------------------------------------------------------------------------- /docs/source/reference/models.rst: -------------------------------------------------------------------------------- 1 | pyrelational.model_managers
2 | ===========================
3 |
4 | Abstract Model Manager
5 | -----------------------------------------
6 |
7 | .. automodule:: pyrelational.model_managers.abstract_model_manager
8 | :members:
9 | :undoc-members:
10 | :special-members: __call__
11 | :show-inheritance:
12 |
13 | Pytorch Lightning Model
14 | -------------------------------------------
15 |
16 | .. automodule:: pyrelational.model_managers.lightning_model_manager
17 | :members:
18 | :undoc-members:
19 | :special-members: __call__
20 | :show-inheritance:
21 |
22 | Ensemble Models
23 | ------------------------------------------
24 |
25 | .. automodule:: pyrelational.model_managers.ensemble_model_manager
26 | :members:
27 | :undoc-members:
28 | :special-members: __call__
29 | :show-inheritance:
30 |
31 | MCDropout Models
32 | -------------------------------------------
33 |
34 | .. automodule:: pyrelational.model_managers.mcdropout_model_manager
35 | :members:
36 | :undoc-members:
37 | :special-members: __call__
38 | :show-inheritance:
39 | -------------------------------------------------------------------------------- /docs/source/reference/oracles.rst: -------------------------------------------------------------------------------- 1 | pyrelational.oracles
2 | ====================
3 |
4 | Abstract Oracle
5 | --------------------------------------------
6 |
7 | .. automodule:: pyrelational.oracles.abstract_oracle
8 | :members:
9 | :undoc-members:
10 | :show-inheritance:
11 |
12 | Benchmark Oracle
13 | ---------------------------------------------
14 |
15 | .. automodule:: pyrelational.oracles.benchmark_oracle
16 | :members:
17 | :undoc-members:
18 | :show-inheritance:
19 | -------------------------------------------------------------------------------- /docs/source/reference/pipeline.rst: -------------------------------------------------------------------------------- 1 | pyrelational.pipeline
2 | =====================
3 |
4 | Pipeline
5 | -------------------------------------
6 |
7 | .. automodule:: pyrelational.pipeline.pipeline
8 | :members:
9 | :undoc-members:
10 | :show-inheritance:
11 | -------------------------------------------------------------------------------- /examples/demo/ensemble_uncertainty_classification.py: -------------------------------------------------------------------------------- 1 | """
2 | This is a toy self-contained example of active learning on a classification
3 | task with the active learning library
4 |
5 | It illustrates the ensemble method.
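The ensemble members are trained independently and their averaged class
probabilities feed the acquisition score. As a minimal illustrative sketch of
the least-confidence score used below (not part of this script; ``member_probs``
is a hypothetical list of per-member class-probability tensors):

    probs = torch.stack(member_probs).mean(dim=0)  # average over ensemble members
    least_confidence = 1.0 - probs.max(dim=-1).values  # higher = more uncertain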
6 | """ 7 | 8 | # User imports 9 | import logging 10 | 11 | import torch 12 | from torchvision import datasets, transforms 13 | 14 | # Dataset and machine learning model 15 | from utils.ml_models import MnistClassification 16 | 17 | # Active Learning package 18 | from pyrelational.data_managers import DataManager 19 | from pyrelational.model_managers import LightningEnsembleModelManager 20 | from pyrelational.oracles import BenchmarkOracle 21 | from pyrelational.pipeline import Pipeline 22 | from pyrelational.strategies.classification import LeastConfidenceStrategy 23 | 24 | # dataset 25 | dataset = datasets.FashionMNIST(root="data", train=True, download=True, transform=transforms.ToTensor()) 26 | 27 | train_ds, val_ds, test_ds = torch.utils.data.random_split(dataset, [9000, 26000, 25000]) 28 | train_indices = train_ds.indices 29 | val_indices = val_ds.indices 30 | test_indices = test_ds.indices 31 | 32 | # model manager 33 | model_manager = LightningEnsembleModelManager( 34 | model_class=MnistClassification, model_config={}, trainer_config={"epochs": 5}, n_estimators=5 35 | ) 36 | 37 | # data_manager and defining strategy 38 | data_manager = DataManager( 39 | dataset=dataset, 40 | train_indices=train_indices, 41 | validation_indices=val_indices, 42 | test_indices=test_indices, 43 | loader_batch_size=1000, 44 | label_attr="targets", 45 | ) 46 | 47 | # Set up active learning pipeline 48 | strategy = LeastConfidenceStrategy() 49 | oracle = BenchmarkOracle() 50 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 51 | 52 | # Remove lightning prints 53 | logging.getLogger("lightning.pytorch").setLevel(logging.ERROR) 54 | 55 | # See performance with the full trainset labelled 56 | pipeline.compute_theoretical_performance() 57 | 58 | # New data to be annotated, followed by an update of the data_manager and model 59 | to_annotate = pipeline.step(num_annotate=1000) 60 | pipeline.query(indices=to_annotate) 61 | 62 | # Annotating data step by step until the trainset is fully annotated 63 | pipeline.run(num_annotate=1000) 64 | 65 | # Pretty printed summary of the components in the pipeline along with annotation/performance history 66 | print(pipeline) 67 | -------------------------------------------------------------------------------- /examples/demo/lightning_diversity_classification.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a toy self-contained example of active learning on a classification 3 | task with the active learning library 4 | 5 | Here we give an example of defining your own custom AL strategy 6 | """ 7 | 8 | import logging 9 | 10 | import torch 11 | 12 | # Dataset and machine learning model 13 | from utils.datasets import BreastCancerDataset 14 | from utils.ml_models import BreastCancerClassification 15 | 16 | # Active Learning package 17 | from pyrelational.data_managers import DataManager 18 | from pyrelational.model_managers import LightningModelManager 19 | from pyrelational.oracles import BenchmarkOracle 20 | from pyrelational.pipeline import Pipeline 21 | from pyrelational.strategies.task_agnostic.relative_distance_strategy import ( 22 | RelativeDistanceStrategy, 23 | ) 24 | 25 | # Obtain dataset and set up labelled and unlabelled subsets 26 | dataset = BreastCancerDataset() 27 | train_ds, val_ds, test_ds = torch.utils.data.random_split(dataset, [500, 30, 39]) 28 | train_indices = train_ds.indices 29 | val_indices = val_ds.indices 30 | test_indices = 
test_ds.indices 31 | 32 | # Instantiate model_manager 33 | model_manager = LightningModelManager( 34 | model_class=BreastCancerClassification, model_config={}, trainer_config={"epochs": 4} 35 | ) 36 | 37 | # data_manager and defining strategy 38 | data_manager = DataManager( 39 | dataset=dataset, 40 | train_indices=train_indices, 41 | validation_indices=val_indices, 42 | test_indices=test_indices, 43 | hit_ratio_at=5, 44 | ) 45 | 46 | # Setup 47 | strategy = RelativeDistanceStrategy() 48 | oracle = BenchmarkOracle() 49 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 50 | 51 | # Remove lightning prints 52 | logging.getLogger("lightning.pytorch").setLevel(logging.ERROR) 53 | 54 | # performance with the full trainset labelled 55 | pipeline.compute_theoretical_performance() 56 | 57 | # New data to be annotated, followed by an update of the data_manager and model 58 | to_annotate = pipeline.step(num_annotate=100) 59 | pipeline.query(indices=to_annotate) 60 | 61 | # Annotating data step by step until the trainset is fully annotated 62 | pipeline.run(num_annotate=100) 63 | 64 | # Pretty printed summary of the components in the pipeline along with annotation/performance history 65 | print(pipeline) 66 | -------------------------------------------------------------------------------- /examples/demo/lightning_diversity_regression.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a toy self-contained example of active learning on a regression 3 | task with the active learning library 4 | 5 | Here we give an example of defining your own custom AL strategy 6 | """ 7 | 8 | import logging 9 | 10 | import torch 11 | 12 | # Dataset and machine learning model 13 | from utils.datasets import DiabetesDataset 14 | from utils.ml_models import DiabetesRegression 15 | 16 | # Active Learning package 17 | from pyrelational.data_managers import DataManager 18 | from pyrelational.model_managers import LightningModelManager 19 | from pyrelational.oracles import BenchmarkOracle 20 | from pyrelational.pipeline import Pipeline 21 | from pyrelational.strategies.task_agnostic.relative_distance_strategy import ( 22 | RelativeDistanceStrategy, 23 | ) 24 | 25 | # dataset 26 | dataset = DiabetesDataset() 27 | train_ds, val_ds, test_ds = torch.utils.data.random_split(dataset, [400, 22, 20]) 28 | train_indices = train_ds.indices 29 | val_indices = val_ds.indices 30 | test_indices = test_ds.indices 31 | 32 | # model_manager 33 | model_manager = LightningModelManager(model_class=DiabetesRegression, model_config={}, trainer_config={"epochs": 4}) 34 | 35 | # data_manager and defining strategy 36 | data_manager = DataManager( 37 | dataset=dataset, 38 | train_indices=train_indices, 39 | validation_indices=val_indices, 40 | test_indices=test_indices, 41 | hit_ratio_at=5, 42 | ) 43 | 44 | # Setup pipeline 45 | strategy = RelativeDistanceStrategy() 46 | oracle = BenchmarkOracle() 47 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 48 | 49 | 50 | # Remove lightning prints 51 | logging.getLogger("lightning.pytorch").setLevel(logging.ERROR) 52 | 53 | # performance with the full trainset labelled 54 | pipeline.compute_theoretical_performance() 55 | 56 | # New data to be annotated, followed by an update of the data_manager and model 57 | to_annotate = pipeline.step(num_annotate=100) 58 | pipeline.query(indices=to_annotate) 59 | 60 | # Annotating data step by step 
until the trainset is fully annotated
61 | pipeline.run(num_annotate=100)
62 | print(pipeline)
63 | -------------------------------------------------------------------------------- /examples/demo/lightning_mixed_regression.py: -------------------------------------------------------------------------------- 1 | """
2 | TO DO: illustrate with dataset
3 | """
4 |
5 | import torch
6 |
7 | from pyrelational.batch_mode_samplers import TopKSampler
8 | from pyrelational.informativeness import StandardDeviation
9 | from pyrelational.strategies.regression.regression_strategy import RegressionStrategy
10 | from pyrelational.strategies.task_agnostic.representative_sampling_strategy import (
11 | representative_sampling,
12 | )
13 |
14 |
15 | class MixedStrategy(RegressionStrategy):
16 | """Implements a strategy that combines a standard-deviation uncertainty scorer with representative sampling.
17 | To this end, ``oversample_factor`` (default 10) times more samples than requested are selected with the uncertainty scorer;
18 | this shortlist is then reduced to the requested size via representative sampling.
19 | """
20 |
21 | def __init__(self, clustering_method: str, oversample_factor: int = 10):
22 | super().__init__(StandardDeviation(), TopKSampler())
23 | self.clustering_method = clustering_method
24 | self.oversample_factor = oversample_factor
25 |
26 | def __call__(self, num_annotate, data_manager, model_manager):
27 | ixs = super().__call__(num_annotate * self.oversample_factor, data_manager, model_manager)
28 | subquery = torch.stack(data_manager.get_sample_feature_vectors(ixs))
29 | new_ixs = representative_sampling(subquery, num_annotate, self.clustering_method)
30 | return [ixs[i] for i in new_ixs]
31 | -------------------------------------------------------------------------------- /examples/demo/lightning_representative_classification.py: -------------------------------------------------------------------------------- 1 | """
2 | This is a toy self-contained example of active learning on a classification
3 | task with the active learning library
4 |
5 | This example illustrates the Representative Sampling strategy.
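Instead of querying points the model is uncertain about, representative
sampling clusters the unlabelled pool and queries exemplars that summarise it.
A minimal sketch of the idea (hypothetical, not the library's internals;
``unlabelled_features`` is an assumed (n_samples, n_features) array):

    from sklearn.cluster import AffinityPropagation
    clustering = AffinityPropagation().fit(unlabelled_features)
    queries = clustering.cluster_centers_indices_  # one exemplar per cluster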
6 | """ 7 | 8 | import logging 9 | 10 | import torch 11 | 12 | # Dataset and machine learning model 13 | from utils.datasets import BreastCancerDataset 14 | from utils.ml_models import BreastCancerClassification 15 | 16 | # Active Learning package 17 | from pyrelational.data_managers import DataManager 18 | from pyrelational.model_managers import LightningModelManager 19 | from pyrelational.oracles import BenchmarkOracle 20 | from pyrelational.pipeline import Pipeline 21 | from pyrelational.strategies.task_agnostic.representative_sampling_strategy import ( 22 | RepresentativeSamplingStrategy, 23 | ) 24 | 25 | # dataset 26 | dataset = BreastCancerDataset() 27 | train_ds, val_ds, test_ds = torch.utils.data.random_split(dataset, [500, 30, 39]) 28 | train_indices = train_ds.indices 29 | val_indices = val_ds.indices 30 | test_indices = test_ds.indices 31 | 32 | # model_manager 33 | model_manager = LightningModelManager( 34 | model_class=BreastCancerClassification, model_config={}, trainer_config={"epochs": 4} 35 | ) 36 | 37 | # data_manager and defining strategy 38 | data_manager = DataManager( 39 | dataset=dataset, 40 | train_indices=train_indices, 41 | validation_indices=val_indices, 42 | test_indices=test_indices, 43 | loader_batch_size=100, 44 | ) 45 | 46 | # Setup 47 | strategy = RepresentativeSamplingStrategy(clustering_method="AffinityPropagation") 48 | oracle = BenchmarkOracle() 49 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 50 | 51 | # Remove lightning prints 52 | logging.getLogger("lightning.pytorch").setLevel(logging.ERROR) 53 | 54 | # performance with the full trainset labelled 55 | pipeline.compute_theoretical_performance() 56 | 57 | # New data to be annotated, followed by an update of the data_manager and model 58 | to_annotate = pipeline.step(num_annotate=100) 59 | pipeline.query(indices=to_annotate) 60 | 61 | # Annotating data step by step until the trainset is fully annotated 62 | pipeline.run(num_annotate=100) 63 | print(pipeline) 64 | -------------------------------------------------------------------------------- /examples/demo/mcdropout_uncertainty_classification.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a toy self-contained example of active learning on a classification 3 | task with the active learning library 4 | 5 | This example will use uncertainty arising from the standard deviation of the 6 | predictive distribution obtained via MCDropout 7 | """ 8 | 9 | import logging 10 | 11 | import torch 12 | 13 | # Pytorch 14 | from torchvision import datasets, transforms 15 | 16 | # Dataset and machine learning model 17 | from utils.ml_models import MnistClassification 18 | 19 | # Active Learning package 20 | from pyrelational.data_managers import DataManager 21 | from pyrelational.model_managers import LightningMCDropoutModelManager 22 | from pyrelational.oracles import BenchmarkOracle 23 | from pyrelational.pipeline import Pipeline 24 | from pyrelational.strategies.classification import LeastConfidenceStrategy 25 | 26 | # dataset 27 | dataset = datasets.FashionMNIST(root="data", train=True, download=True, transform=transforms.ToTensor()) 28 | train_ds, val_ds, test_ds = torch.utils.data.random_split(dataset, [9000, 25000, 26000]) 29 | train_indices = train_ds.indices 30 | val_indices = val_ds.indices 31 | test_indices = test_ds.indices 32 | 33 | # model 34 | model_manager = LightningMCDropoutModelManager( 35 | model_class=MnistClassification, 
model_config={"dropout": 0.2}, trainer_config={"epochs": 4} 36 | ) 37 | 38 | # data_manager and defining strategy 39 | data_manager = DataManager( 40 | dataset=dataset, 41 | train_indices=train_indices, 42 | validation_indices=val_indices, 43 | test_indices=test_indices, 44 | loader_batch_size=1000, 45 | label_attr="targets", 46 | ) 47 | 48 | strategy = LeastConfidenceStrategy() 49 | oracle = BenchmarkOracle() 50 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 51 | 52 | # Remove lightning prints 53 | logging.getLogger("lightning.pytorch").setLevel(logging.ERROR) 54 | 55 | # performance with the full trainset labelled 56 | pipeline.compute_theoretical_performance() 57 | 58 | # New data to be annotated, followed by an update of the data_manager and model 59 | to_annotate = pipeline.step(num_annotate=1000) 60 | pipeline.query(indices=to_annotate) 61 | 62 | # Annotating data step by step until the trainset is fully annotated 63 | pipeline.run(num_annotate=1000) 64 | print(pipeline) 65 | -------------------------------------------------------------------------------- /examples/demo/mcdropout_uncertainty_regression.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a toy self-contained example of active learning on a regression 3 | task with the active learning library 4 | 5 | This example will use uncertainty arising from the standard deviation of the 6 | predictive distribution obtained via MCDropout 7 | """ 8 | 9 | import logging 10 | 11 | import torch 12 | 13 | # Dataset and machine learning model 14 | from utils.datasets import DiabetesDataset 15 | from utils.ml_models import DiabetesRegression 16 | 17 | # Active Learning package 18 | from pyrelational.data_managers import DataManager 19 | from pyrelational.model_managers import LightningMCDropoutModelManager 20 | from pyrelational.oracles import BenchmarkOracle 21 | from pyrelational.pipeline import Pipeline 22 | from pyrelational.strategies.regression import VarianceReductionStrategy 23 | 24 | # dataset 25 | dataset = DiabetesDataset() 26 | train_ds, val_ds, test_ds = torch.utils.data.random_split(dataset, [400, 22, 20]) 27 | train_indices = train_ds.indices 28 | val_indices = val_ds.indices 29 | test_indices = test_ds.indices 30 | 31 | # model_manager 32 | model_manager = LightningMCDropoutModelManager( 33 | model_class=DiabetesRegression, model_config={}, trainer_config={"epochs": 4} 34 | ) 35 | 36 | # data_manager and defining strategy 37 | data_manager = DataManager( 38 | dataset=dataset, train_indices=train_indices, validation_indices=val_indices, test_indices=test_indices 39 | ) 40 | 41 | 42 | strategy = VarianceReductionStrategy() 43 | oracle = BenchmarkOracle() 44 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 45 | 46 | # Remove lightning prints 47 | logging.getLogger("lightning.pytorch").setLevel(logging.ERROR) 48 | 49 | # performance with the full trainset labelled 50 | pipeline.compute_theoretical_performance() 51 | 52 | # New data to be annotated, followed by an update of the data_manager and model 53 | to_annotate = pipeline.step(num_annotate=100) 54 | pipeline.query(indices=to_annotate) 55 | 56 | # Annotating data step by step until the trainset is fully annotated 57 | pipeline.run(num_annotate=100) 58 | print(pipeline) 59 | -------------------------------------------------------------------------------- /examples/demo/utils/datasets.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Simple datasets in PyTorch to use in examples 3 | """ 4 | 5 | import torch 6 | from sklearn.datasets import load_breast_cancer, load_diabetes 7 | from torch.utils.data import Dataset 8 | 9 | 10 | class DiabetesDataset(Dataset): 11 | """A small regression dataset for examples""" 12 | 13 | def __init__(self): 14 | # Load the diabetes dataset 15 | diabetes_X, diabetes_y = load_diabetes(return_X_y=True) 16 | self.x = torch.FloatTensor(diabetes_X) 17 | self.y = torch.FloatTensor(diabetes_y) 18 | 19 | def __len__(self): 20 | return self.x.shape[0] 21 | 22 | def __getitem__(self, idx): 23 | return self.x[idx], self.y[idx] 24 | 25 | 26 | class BreastCancerDataset(Dataset): 27 | """A small classification dataset for examples""" 28 | 29 | def __init__(self): 30 | super(BreastCancerDataset, self).__init__() 31 | sk_x, sk_y = load_breast_cancer(return_X_y=True) 32 | self.x = torch.FloatTensor(sk_x) 33 | self.y = torch.LongTensor(sk_y) 34 | 35 | def __len__(self): 36 | return self.x.shape[0] 37 | 38 | def __getitem__(self, idx): 39 | return self.x[idx], self.y[idx] 40 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 120 3 | target-version = ['py38', 'py39'] 4 | include = '\.pyi?$' 5 | exclude = ''' 6 | /( 7 | )/ 8 | ''' 9 | -------------------------------------------------------------------------------- /pyrelational/__init__.py: -------------------------------------------------------------------------------- 1 | import pyrelational.data_managers 2 | import pyrelational.informativeness 3 | import pyrelational.model_managers 4 | import pyrelational.pipeline 5 | import pyrelational.strategies 6 | from pyrelational.version import __version__ 7 | -------------------------------------------------------------------------------- /pyrelational/batch_mode_samplers/__init__.py: -------------------------------------------------------------------------------- 1 | """Query samplers module.""" 2 | 3 | from pyrelational.batch_mode_samplers._batch_mode_samplers import ( 4 | BatchModeSampler, 5 | ProbabilisticSampler, 6 | TopKSampler, 7 | ) 8 | -------------------------------------------------------------------------------- /pyrelational/batch_mode_samplers/_batch_mode_samplers.py: -------------------------------------------------------------------------------- 1 | """Collection of samplers for active learning strategies.""" 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import List 5 | 6 | import torch 7 | from torch import Tensor 8 | 9 | 10 | class BatchModeSampler(ABC): 11 | """Abstract sampler class.""" 12 | 13 | @abstractmethod 14 | def __call__(self, scores: Tensor, indices: List[int], num_samples: int) -> List[int]: 15 | """Sample a subset of indices based on the scores. 16 | 17 | This method should be implemented by the subclasses. 18 | :return: List of sampled indices. 19 | """ 20 | pass 21 | 22 | 23 | class TopKSampler(BatchModeSampler): 24 | """Deterministic sampler based on the top-k scores.""" 25 | 26 | def __call__(self, scores: Tensor, indices: List[int], num_samples: int) -> List[int]: 27 | """Sample the top-k indices based on the scores. 28 | 29 | :return: List of sampled indices. 
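Illustrative example:

    >>> import torch
    >>> TopKSampler()(torch.tensor([0.1, 0.9, 0.5]), indices=[7, 8, 9], num_samples=2)
    [8, 9]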
30 | """ 31 | ixs = torch.argsort(scores, descending=True).tolist() 32 | return [indices[i] for i in ixs[:num_samples]] 33 | 34 | 35 | class ProbabilisticSampler(BatchModeSampler): 36 | """Probabilistic sampler based on the scores.""" 37 | 38 | def __call__(self, scores: Tensor, indices: List[int], num_samples: int) -> List[int]: 39 | """Sample a subset of indices deriving a distribution from the scores. 40 | 41 | :return: List of sampled indices. 42 | """ 43 | num_samples = min(num_samples, len(indices)) 44 | return [indices[i] for i in torch.multinomial(scores, num_samples, replacement=False).tolist()] 45 | -------------------------------------------------------------------------------- /pyrelational/data_managers/__init__.py: -------------------------------------------------------------------------------- 1 | from pyrelational.data_managers.data_manager import DataManager 2 | -------------------------------------------------------------------------------- /pyrelational/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from pyrelational.datasets.classification import ( 2 | MNIST, 3 | BreastCancerDataset, 4 | Checkerboard2x2Dataset, 5 | Checkerboard4x4Dataset, 6 | CreditCardDataset, 7 | DigitDataset, 8 | FashionMNIST, 9 | GaussianCloudsDataset, 10 | StriatumDataset, 11 | SynthClass1, 12 | SynthClass2, 13 | SynthClass3, 14 | UCIGlass, 15 | UCIParkinsons, 16 | UCISeeds, 17 | ) 18 | from pyrelational.datasets.drugcomb import DrugCombDataset 19 | from pyrelational.datasets.regression import ( 20 | DiabetesDataset, 21 | SynthReg1, 22 | SynthReg2, 23 | UCIAirfoil, 24 | UCIConcrete, 25 | UCIEnergy, 26 | UCIPower, 27 | UCIWine, 28 | UCIYacht, 29 | ) 30 | from pyrelational.datasets.uci_datasets import UCIDatasets 31 | -------------------------------------------------------------------------------- /pyrelational/datasets/base.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | from torch import Tensor 4 | from torch.utils.data import Dataset 5 | 6 | 7 | class BaseDataset(Dataset[Tuple[Tensor, Tensor]]): 8 | """A base class for all datasets to inherit from. 9 | 10 | :param n_splits: Number of splits for cross-validation. 11 | :param random_seed: Seed for random number generator for reproducibility. 12 | """ 13 | 14 | x: Tensor 15 | y: Tensor 16 | 17 | def __init__(self, n_splits: int = 3, random_seed: int = 42): 18 | """Initialize the BaseDataset with the number of splits and a seed for reproducibility. 19 | 20 | :param n_splits: Number of splits for stratified k-fold. 21 | :param random_seed: Random seed for reproducibility. 22 | """ 23 | super(BaseDataset, self).__init__() 24 | self.n_splits = n_splits 25 | self.random_seed = random_seed 26 | 27 | def __len__(self) -> int: 28 | """Return the total number of samples in the dataset. 29 | 30 | :return: Total number of samples. 31 | """ 32 | return len(self.x) 33 | 34 | def __getitem__(self, idx: int) -> Tuple[Tensor, Tensor]: 35 | """Fetch the sample and its corresponding label at the given index. 36 | 37 | :param idx: Index of the sample to retrieve. 38 | :return: Tuple containing the sample and its label. 
39 | """ 40 | return self.x[idx], self.y[idx] 41 | -------------------------------------------------------------------------------- /pyrelational/datasets/classification/__init__.py: -------------------------------------------------------------------------------- 1 | from pyrelational.datasets.classification.andrea_et_al import CreditCardDataset 2 | from pyrelational.datasets.classification.fashion_mnist import FashionMNIST 3 | from pyrelational.datasets.classification.ksenia_et_al import ( 4 | Checkerboard2x2Dataset, 5 | Checkerboard4x4Dataset, 6 | GaussianCloudsDataset, 7 | StriatumDataset, 8 | ) 9 | from pyrelational.datasets.classification.mnist import MNIST 10 | from pyrelational.datasets.classification.scikit_learn import ( 11 | BreastCancerDataset, 12 | DigitDataset, 13 | ) 14 | from pyrelational.datasets.classification.synthetic import ( 15 | SynthClass1, 16 | SynthClass2, 17 | SynthClass3, 18 | ) 19 | from pyrelational.datasets.classification.uci import UCIGlass, UCIParkinsons, UCISeeds 20 | -------------------------------------------------------------------------------- /pyrelational/datasets/classification/andrea_et_al.py: -------------------------------------------------------------------------------- 1 | import pyreadr 2 | import torch 3 | 4 | from pyrelational.datasets.base import BaseDataset 5 | from pyrelational.datasets.download_utils import download_file 6 | 7 | from .utils import create_splits, remap_to_int 8 | 9 | 10 | class CreditCardDataset(BaseDataset): 11 | """Credit card fraud dataset, highly unbalanced and challenging. 12 | 13 | From Andrea Dal Pozzolo, Olivier Caelen, Reid A. Johnson, and Gianluca Bontempi. 14 | Calibrating probability with undersampling for unbalanced classification. In 2015 15 | IEEE Symposium Series on Computational Intelligence, pages 159–166, 2015. 
16 |
17 | We use the original data from http://www.ulb.ac.be/di/map/adalpozz/data/creditcard.Rdata
18 | processed using pyreadr.
19 |
20 | :param data_dir: path where to save the raw data, defaults to /tmp/
21 | :param n_splits: an int describing the number of class stratified
22 | splits to compute
23 | :param random_seed: random seed for reproducibility on splits
24 | """
25 |
26 | raw_url = "http://www.ulb.ac.be/di/map/adalpozz/data/creditcard.Rdata"
27 |
28 | def __init__(self, data_dir: str = "/tmp/", n_splits: int = 5, random_seed: int = 0):
29 | super().__init__(n_splits=n_splits, random_seed=random_seed)
30 | self.data_dir = data_dir
31 | self.n_splits = n_splits
32 | self._load_dataset()
33 |
34 | def _load_dataset(self) -> None:
35 | download_file(self.raw_url, self.data_dir)
36 | file_name = self.raw_url.split("/")[-1]
37 | data = pyreadr.read_r(self.data_dir + file_name)
38 |
39 | data = data["creditcard"]
40 | data.reset_index(inplace=True)
41 | xcols = data.columns[1:-1]
42 | self.x = torch.from_numpy(data[xcols].to_numpy()).float()
43 | self.y = remap_to_int(torch.from_numpy(data["Class"].to_numpy().astype(int)))
44 | self.data_splits = create_splits(self.x, self.y, self.n_splits, self.random_seed)
45 | -------------------------------------------------------------------------------- /pyrelational/datasets/classification/fashion_mnist.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple
2 |
3 | import torch
4 | from torch import Tensor
5 | from torch.utils.data import ConcatDataset
6 | from torchvision import datasets, transforms
7 |
8 | from pyrelational.datasets.base import BaseDataset
9 | from pyrelational.datasets.classification.utils import create_splits
10 |
11 |
12 | class FashionMNIST(BaseDataset):
13 | """Fashion MNIST dataset class that handles downloading, transforming, and loading Fashion MNIST data.
14 |
15 | This dataset includes images from 10 categories of clothing, each represented as a 28x28 grayscale image.
16 | :param data_dir: Directory to store or read the Fashion MNIST data.
17 | :param n_splits: Number of stratified splits for the dataset.
18 | :param random_seed: Seed for random number generator for reproducibility.
19 | """
20 |
21 | def __init__(self, data_dir: str = "/tmp/", n_splits: int = 5, random_seed: int = 1234):
22 | """Instantiate the FashionMNIST dataset class.
23 |
24 | :param data_dir: directory where to download the data, defaults to "/tmp/"
25 | :param n_splits: number of splits to generate, defaults to 5
26 | :param random_seed: random seed, defaults to 1234
27 | """
28 | super().__init__(n_splits=n_splits, random_seed=random_seed)
29 | self.data_dir = data_dir
30 | self._load_data()
31 |
32 | def _load_data(self) -> None:
33 | """Load the Fashion MNIST dataset from torchvision datasets.
34 |
35 | We apply a transformation to convert images to tensors, and concatenate the train and test datasets into
36 | a single dataset for unified handling.
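Each 28x28 image is flattened into a 784-dimensional vector so the data
can be consumed by non-convolutional models.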
37 | """ 38 | train_dataset = datasets.FashionMNIST( 39 | root=self.data_dir, train=True, download=True, transform=transforms.ToTensor() 40 | ) 41 | test_dataset = datasets.FashionMNIST( 42 | root=self.data_dir, train=False, download=True, transform=transforms.ToTensor() 43 | ) 44 | 45 | # Concatenate the train and test datasets 46 | self.full_dataset: ConcatDataset[Tuple[Tensor, Tensor]] = ConcatDataset([train_dataset, test_dataset]) 47 | self.x = torch.stack([(self.full_dataset[i][0]).flatten() for i in range(len(self.full_dataset))]) 48 | self.y = torch.stack([torch.tensor(self.full_dataset[i][1]) for i in range(len(self.full_dataset))]) 49 | 50 | # Create splits for cross-validation 51 | self.data_splits = create_splits(self.x, self.y, self.n_splits, self.random_seed) 52 | -------------------------------------------------------------------------------- /pyrelational/datasets/classification/mnist.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import numpy as np 4 | import torch 5 | from torch import Tensor 6 | from torch.utils.data import ConcatDataset 7 | from torchvision import datasets, transforms 8 | 9 | from pyrelational.datasets.base import BaseDataset 10 | 11 | 12 | class MNIST(BaseDataset): 13 | """ 14 | MNIST dataset class that handles downloading, transforming, and loading MNIST data. 15 | 16 | :param data_dir: Directory to store or read the MNIST data. 17 | :param n_splits: Number of stratified splits for the dataset. 18 | :param random_seed: Seed for random number generator for reproducibility. 19 | """ 20 | 21 | def __init__(self, data_dir: str = "/tmp/", random_seed: int = 1234): 22 | """Instantiate the MNIST dataset class. 23 | 24 | :param data_dir: directory where to download the data, defaults to "/tmp/" 25 | :param random_seed: random seed, defaults to 1234 26 | """ 27 | super().__init__(random_seed=random_seed) 28 | self.data_dir = data_dir 29 | self._load_data() 30 | 31 | def _load_data(self) -> None: 32 | """Load the MNIST dataset from torchvision datasets. 33 | 34 | We apply the standard transformation with tensor conversation and normalisation. 35 | We concatenate the train and test datasets into a single dataset for unified handling, but 36 | we keep the same fixed test set. 
37 | """ 38 | transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]) 39 | train_dataset = datasets.MNIST(root=self.data_dir, train=True, download=True, transform=transform) 40 | test_dataset = datasets.MNIST(root=self.data_dir, train=False, download=True, transform=transform) 41 | 42 | # Concatenate the train and test datasets 43 | self.full_dataset: ConcatDataset[Tuple[Tensor, Tensor]] = ConcatDataset([train_dataset, test_dataset]) 44 | self.x = torch.stack([(self.full_dataset[i][0]) for i in range(len(self.full_dataset))]) 45 | self.y = torch.stack([torch.tensor(self.full_dataset[i][1]) for i in range(len(self.full_dataset))]) 46 | 47 | # Create splits for cross-validation 48 | train_ix, test_ix = np.arange(len(train_dataset)), np.arange(len(test_dataset)) + len(train_dataset) 49 | self.data_splits = [(train_ix, test_ix)] 50 | -------------------------------------------------------------------------------- /pyrelational/datasets/classification/scikit_learn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from sklearn.datasets import load_breast_cancer, load_digits 3 | 4 | from pyrelational.datasets.base import BaseDataset 5 | 6 | from .utils import create_splits 7 | 8 | 9 | class BreastCancerDataset(BaseDataset): 10 | """ 11 | UCI ML Breast Cancer Wisconsin (Diagnostic) dataset handler. 12 | 13 | This dataset features measurements from digitized images of breast mass and uses these features to classify 14 | the observations as benign or malignant. 15 | 16 | :param n_splits: Number of stratified splits for cross-validation. 17 | :param random_seed: Seed for random number generator for reproducibility. 18 | """ 19 | 20 | def __init__(self, n_splits: int = 5, random_seed: int = 0): 21 | super().__init__(n_splits=n_splits, random_seed=random_seed) 22 | self._load_data() 23 | 24 | def _load_data(self) -> None: 25 | """ 26 | Load and preprocess the Breast Cancer dataset. This method handles the conversion of the dataset into tensors 27 | suitable for model input and sets up splits. 28 | """ 29 | data, labels = load_breast_cancer(return_X_y=True) 30 | self.x = torch.tensor(data, dtype=torch.float) 31 | self.y = torch.tensor(labels, dtype=torch.long) 32 | self.data_splits = create_splits(self.x, self.y, self.n_splits, self.random_seed) 33 | 34 | 35 | class DigitDataset(BaseDataset): 36 | """UCI ML hand-written digits datasets 37 | 38 | From C. Kaynak (1995) Methods of Combining Multiple Classifiers and 39 | Their Applications to Handwritten Digit Recognition, MSc Thesis, 40 | Institute of Graduate Studies in Science and Engineering, Bogazici 41 | University. 42 | 43 | :param n_splits: an int describing the number of class stratified 44 | splits to compute 45 | :param random_seed: int setting the random seed for reproducibility 46 | """ 47 | 48 | def __init__(self, n_splits: int = 5, random_seed: int = 0): 49 | super().__init__(n_splits=n_splits, random_seed=random_seed) 50 | self._load_data() 51 | 52 | def _load_data(self) -> None: 53 | """ 54 | Load and preprocess the Digit dataset. This method handles the conversion of the dataset into tensors 55 | suitable for model input and sets up splits. 
56 | """ 57 | sk_x, sk_y = load_digits(return_X_y=True) 58 | self.x = torch.FloatTensor(sk_x) 59 | self.y = torch.LongTensor(sk_y) 60 | self.data_splits = create_splits(self.x, self.y, self.n_splits, self.random_seed) 61 | -------------------------------------------------------------------------------- /pyrelational/datasets/classification/uci.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from pyrelational.datasets.base import BaseDataset 4 | from pyrelational.datasets.uci_datasets import UCIDatasets 5 | 6 | from .utils import remap_to_int 7 | 8 | 9 | class UCIClassification(BaseDataset): 10 | """ 11 | A generic class for handling UCI datasets, providing mechanisms to download, preprocess, and split the dataset. 12 | 13 | :param name: Identifier for the UCI dataset. 14 | :param data_dir: Directory where datasets are stored or will be downloaded. 15 | :param n_splits: Number of stratified splits for cross-validation. 16 | :param random_seed: Random seed for reproducibility of splits. 17 | """ 18 | 19 | def __init__(self, name: str, data_dir: str = "/tmp/", n_splits: int = 5, random_seed: int = 0): 20 | super().__init__(n_splits=n_splits, random_seed=random_seed) 21 | self.data_dir = data_dir 22 | self.dataset = UCIDatasets(name=name, data_dir=data_dir, n_splits=n_splits, random_seed=random_seed) 23 | self._load_data() 24 | 25 | def _load_data(self) -> None: 26 | """ 27 | Load and preprocess the dataset. This involves loading the data using UCIDatasets, 28 | possibly transforming it, and preparing it for model training. 29 | """ 30 | data, labels = self.dataset.get_data() 31 | self.x = torch.tensor(data, dtype=torch.float) 32 | self.y = torch.tensor(labels, dtype=torch.long) 33 | self.y = remap_to_int(self.y) 34 | self.name = self.dataset.name 35 | self.data_splits = self.dataset.data_splits 36 | 37 | 38 | class UCIGlass(UCIClassification): 39 | """ 40 | UCI Glass dataset for classification tasks. 41 | 42 | Inherits from UCIClassification and uses its mechanisms to load and preprocess the Glass dataset specifically. 43 | """ 44 | 45 | def __init__(self, data_dir: str = "/tmp/", n_splits: int = 5, random_seed: int = 0): 46 | super().__init__(name="glass", data_dir=data_dir, n_splits=n_splits, random_seed=random_seed) 47 | 48 | 49 | class UCIParkinsons(UCIClassification): 50 | """ 51 | UCI Parkinsons dataset for classification tasks. 52 | 53 | Inherits from UCIClassification and uses its mechanisms to load and 54 | preprocess the Parkinsons dataset specifically. 55 | """ 56 | 57 | def __init__(self, data_dir: str = "/tmp/", n_splits: int = 5, random_seed: int = 0): 58 | super().__init__(name="parkinsons", data_dir=data_dir, n_splits=n_splits, random_seed=random_seed) 59 | 60 | 61 | class UCISeeds(UCIClassification): 62 | """ 63 | UCI Seeds dataset for classification tasks. 64 | 65 | Inherits from UCIClassification and uses its mechanisms to load and 66 | preprocess the Seeds dataset specifically. 
67 | """ 68 | 69 | def __init__(self, data_dir: str = "/tmp/", n_splits: int = 5, random_seed: int = 0): 70 | super().__init__(name="seeds", data_dir=data_dir, n_splits=n_splits, random_seed=random_seed) 71 | -------------------------------------------------------------------------------- /pyrelational/datasets/classification/utils.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | import numpy as np 4 | from numpy.typing import NDArray 5 | from sklearn.model_selection import StratifiedKFold 6 | from torch import Tensor 7 | 8 | 9 | def remap_to_int(torch_class_array: Tensor) -> Tensor: 10 | """Remap the elements in a torch tensor to contiguous integers starting from 0. 11 | 12 | This is useful for classification tasks where class labels should start from zero and be contiguous. 13 | :param torch_class_array: A torch.Tensor containing class labels, possibly non-integer or non-contiguous. 14 | :return: A torch.Tensor with class labels remapped to integers starting from 0. 15 | 16 | Example: 17 | >>> torch_class_array = torch.tensor([10, 10, 20, 20, 30]) 18 | >>> remap_to_int(torch_class_array) 19 | tensor([0, 0, 1, 1, 2]) 20 | """ 21 | remapped_labels: Tensor = torch_class_array.unique(return_inverse=True)[1] 22 | return remapped_labels 23 | 24 | 25 | def create_splits( 26 | x: Tensor, y: Tensor, n_splits: int, random_seed: int 27 | ) -> List[Tuple[NDArray[np.int_], NDArray[np.int_]]]: 28 | """Create stratified k-fold splits for the dataset using the dataset's features and labels.""" 29 | skf = StratifiedKFold(n_splits=n_splits, random_state=random_seed, shuffle=True) 30 | return list(skf.split(x.numpy(), y.numpy())) 31 | -------------------------------------------------------------------------------- /pyrelational/datasets/regression/__init__.py: -------------------------------------------------------------------------------- 1 | from pyrelational.datasets.regression.scikit_learn import DiabetesDataset 2 | from pyrelational.datasets.regression.synthetic import SynthReg1, SynthReg2 3 | from pyrelational.datasets.regression.uci import ( 4 | UCIAirfoil, 5 | UCIConcrete, 6 | UCIEnergy, 7 | UCIPower, 8 | UCIWine, 9 | UCIYacht, 10 | ) 11 | -------------------------------------------------------------------------------- /pyrelational/datasets/regression/scikit_learn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from sklearn.datasets import load_diabetes 3 | 4 | from pyrelational.datasets.base import BaseDataset 5 | 6 | from .utils import create_splits 7 | 8 | 9 | class DiabetesDataset(BaseDataset): 10 | """A small regression dataset for examples 11 | 12 | From Bradley Efron, Trevor Hastie, Iain Johnstone and 13 | Robert Tibshirani (2004) “Least Angle Regression,” 14 | Annals of Statistics (with discussion), 407-499. 
15 | 16 | :param n_splits: an int describing the number of k-fold splits to compute 17 | :param random_seed: int setting the random seed for reproducibility 18 | """ 19 | 20 | def __init__(self, n_splits: int = 5, random_seed: int = 0): 21 | super().__init__(n_splits=n_splits, random_seed=random_seed) 22 | # Load the diabetes dataset 23 | diabetes_X, diabetes_y = load_diabetes(return_X_y=True) 24 | self.x = torch.FloatTensor(diabetes_X) 25 | self.y = torch.FloatTensor(diabetes_y) 26 | 27 | self.data_splits = create_splits(self.x, self.y, self.n_splits, self.random_seed) 28 | -------------------------------------------------------------------------------- /pyrelational/datasets/regression/synthetic.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from sklearn.datasets import make_regression 4 | 5 | from pyrelational.datasets.base import BaseDataset 6 | 7 | from .utils import create_splits 8 | 9 | 10 | class SynthReg1(BaseDataset): 11 | """Synthetic dataset for active learning on a regression based task 12 | 13 | Simple regression problem with a single degree of freedom that suits 14 | both warm-start and cold-start active learning setups 15 | 16 | :param n_splits: an int describing the number of k-fold 17 | splits to compute 18 | :param size: an int describing the number of observations the dataset 19 | is to have 20 | :param random_seed: random seed for reproducibility on splits 21 | """ 22 | 23 | def __init__(self, n_splits: int = 5, size: int = 1000, random_seed: int = 1234): 24 | super().__init__(n_splits=n_splits, random_seed=random_seed) 25 | self._create_data(size, random_seed) 26 | 27 | def _create_data(self, size: int, random_seed: int) -> None: 28 | x, y = make_regression( 29 | n_samples=size, 30 | n_features=1, 31 | n_targets=1, 32 | random_state=random_seed, 33 | ) 34 | 35 | self.x = torch.FloatTensor(x) 36 | self.y = torch.FloatTensor(y) 37 | self.data_splits = create_splits(self.x, self.y, self.n_splits, self.random_seed) 38 | 39 | 40 | class SynthReg2(BaseDataset): 41 | """Synthetic dataset for active learning on a regression based task 42 | 43 | A more challenging dataset than SynthReg1 wherein we see a periodic 44 | pattern with 2 degrees of freedom.
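Example (illustrative sketch; the features stack the z and x coordinates, the targets are the y coordinate):
    >>> ds = SynthReg2(size=1000)
    >>> ds.x.shape, ds.y.shape
    (torch.Size([1000, 2]), torch.Size([1000]))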
45 | 46 | :param n_splits: an int describing the number of k-fold 47 | splits to compute 48 | :param size: an int describing the number of observations the dataset 49 | is to have 50 | :param random_seed: random seed for reproducibility of the splits (the noise itself is drawn from the global numpy RNG) 51 | """ 52 | 53 | def __init__(self, n_splits: int = 5, size: int = 1000, random_seed: int = 1234): 54 | super().__init__(n_splits=n_splits, random_seed=random_seed) 55 | self._create_data(size) 56 | 57 | def _create_data(self, size: int) -> None: 58 | zdata = 15 * np.random.random(size) 59 | xdata = np.sin(zdata) + 0.1 * np.random.randn(size) 60 | ydata = np.cos(zdata) + 0.1 * np.random.randn(size) 61 | 62 | zdata = torch.FloatTensor(zdata) 63 | xdata = torch.FloatTensor(xdata) 64 | ydata = torch.FloatTensor(ydata) 65 | 66 | self.x = torch.vstack([zdata, xdata]).T 67 | self.y = ydata 68 | self.data_splits = create_splits(self.x, self.y, self.n_splits, self.random_seed) 69 | -------------------------------------------------------------------------------- /pyrelational/datasets/regression/utils.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | import numpy as np 4 | from numpy.typing import NDArray 5 | from sklearn.model_selection import KFold 6 | from torch import Tensor 7 | 8 | 9 | def create_splits(x: Tensor, y: Tensor, n_splits: int, random_seed: int) -> List[Tuple[NDArray[np.int_], NDArray[np.int_]]]: 10 | """ 11 | Create k-fold splits for the dataset using the dataset's features and labels. 12 | """ 13 | kf = KFold(n_splits=n_splits, random_state=random_seed, shuffle=True) 14 | return list(kf.split(x.numpy(), y.numpy())) 15 | -------------------------------------------------------------------------------- /pyrelational/informativeness/__init__.py: -------------------------------------------------------------------------------- 1 | """Information acquisition strategies for active learning.""" 2 | 3 | from pyrelational.informativeness.classification_scorers import ( 4 | ClassificationBald, 5 | Entropy, 6 | LeastConfidence, 7 | MarginConfidence, 8 | RatioConfidence, 9 | ) 10 | from pyrelational.informativeness.regression_scorers import ( 11 | AverageScorer, 12 | ExpectedImprovement, 13 | RegressionBald, 14 | StandardDeviation, 15 | ThompsonSampling, 16 | UpperConfidenceBound, 17 | ) 18 | from pyrelational.informativeness.task_agnostic_scorers import RelativeDistanceScorer 19 | 20 | __all__ = [ 21 | "AverageScorer", 22 | "StandardDeviation", 23 | "ThompsonSampling", 24 | "RegressionBald", 25 | "ExpectedImprovement", 26 | "UpperConfidenceBound", 27 | "Entropy", 28 | "LeastConfidence", 29 | "MarginConfidence", 30 | "RatioConfidence", 31 | "ClassificationBald", 32 | "RelativeDistanceScorer", 33 | ] 34 | -------------------------------------------------------------------------------- /pyrelational/informativeness/decorators.py: -------------------------------------------------------------------------------- 1 | """Decorators for checking input shapes and types for scorers.""" 2 | 3 | from functools import wraps 4 | from typing import TYPE_CHECKING, Any, Callable, Optional, Union 5 | 6 | import torch 7 | from torch import Tensor 8 | from torch.distributions import Distribution 9 | 10 | if TYPE_CHECKING: 11 | from pyrelational.informativeness.abstract_scorers import ( 12 | AbstractClassificationScorer, 13 | AbstractRegressionScorer, 14 | ) 15 | 16 | 17 | def require_probabilities(func: Callable[..., Tensor]) -> Callable[..., Tensor]: 18 | """Ensure that the input tensor is a
probability distribution.""" 19 | 20 | @wraps(func) 21 | def wrapper(self: "AbstractClassificationScorer", prob_dist: Tensor) -> Tensor: 22 | """Check that the input tensor sums to 1 along the specified axis.""" 23 | assert torch.allclose( 24 | prob_dist.sum(self.axis), torch.tensor(1.0) 25 | ), "input should be probability distributions along specified axis" 26 | return func(self, prob_dist) 27 | 28 | return wrapper 29 | 30 | 31 | def check_regression_input(func: Callable[..., Tensor]) -> Callable[..., Tensor]: 32 | """Check inputs for regression scoring functions.""" 33 | 34 | @wraps(func) 35 | def wrapper( 36 | self: "AbstractRegressionScorer", x: Optional[Union[Tensor, Distribution]] = None, **kwargs: Any 37 | ) -> Tensor: 38 | """Check shapes of input tensors.""" 39 | mean = kwargs.get("mean", None) 40 | std = kwargs.get("std", None) 41 | if x is None and mean is None and std is None: 42 | raise ValueError("At least one of x, mean, or std must be provided.") 43 | 44 | if isinstance(x, Tensor): 45 | assert 2 <= x.ndim <= 3, "x input should be a 2D or 3D tensor" 46 | 47 | if isinstance(mean, Tensor): 48 | assert 1 <= mean.ndim <= 2, "mean input should be a 1D or 2D tensor" 49 | 50 | if isinstance(std, Tensor): 51 | assert 1 <= std.ndim <= 2, "std input should be a 1D or 2D tensor" 52 | 53 | return func(self, x, **kwargs) 54 | 55 | return wrapper 56 | -------------------------------------------------------------------------------- /pyrelational/model_managers/__init__.py: -------------------------------------------------------------------------------- 1 | from pyrelational.model_managers.abstract_model_manager import ModelManager 2 | from pyrelational.model_managers.ensemble_model_manager import ( 3 | EnsembleModelManager, 4 | LightningEnsembleModelManager, 5 | ) 6 | from pyrelational.model_managers.lightning_model_manager import LightningModelManager 7 | from pyrelational.model_managers.mcdropout_model_manager import ( 8 | LightningMCDropoutModelManager, 9 | MCDropoutModelManager, 10 | ) 11 | -------------------------------------------------------------------------------- /pyrelational/model_managers/model_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict 2 | 3 | import torch 4 | from lightning.pytorch.trainer.connectors.accelerator_connector import ( 5 | _AcceleratorConnector, 6 | ) 7 | 8 | 9 | def _determine_device(trainer_config: Dict[str, Any]) -> torch.device: 10 | """ 11 | Determine the torch device of the model from the arguments for the pytorch lightning trainer. 12 | 13 | :param trainer_config: configuration dictionary for a pytorch lightning Trainer 14 | :return: torch device object 15 | """ 16 | accelerator = _AcceleratorConnector( 17 | accelerator=trainer_config.get("accelerator", "cpu"), devices=trainer_config.get("devices", "auto") 18 | ) 19 | device: torch.device = accelerator.strategy.root_device 20 | return device 21 | -------------------------------------------------------------------------------- /pyrelational/oracles/__init__.py: -------------------------------------------------------------------------------- 1 | from pyrelational.oracles.abstract_oracle import Oracle 2 | from pyrelational.oracles.benchmark_oracle import BenchmarkOracle 3 | -------------------------------------------------------------------------------- /pyrelational/oracles/benchmark_oracle.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from
pyrelational.data_managers.data_manager import DataManager 4 | 5 | from .abstract_oracle import Oracle 6 | 7 | 8 | class BenchmarkOracle(Oracle): 9 | """ 10 | An oracle designed for evaluating strategies in R&D settings. 11 | It assumes that all the observations are sufficiently annotated and 12 | returns those annotations when queried. 13 | """ 14 | 15 | def __init__(self) -> None: 16 | super(BenchmarkOracle, self).__init__() 17 | 18 | def query_target_value(self, data_manager: DataManager, idx: int) -> Any: 19 | """Default method is to simply return the target already in the dataset. 20 | 21 | :param data_manager: reference to the data_manager which will load the observation if necessary 22 | :param idx: index of the observation for which we want to query an annotation 23 | 24 | :return: the output of the oracle (the target value already in the dataset) 25 | """ 26 | target_value = data_manager[idx][-1] 27 | return target_value 28 | -------------------------------------------------------------------------------- /pyrelational/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | from pyrelational.pipeline.pipeline import Pipeline 2 | -------------------------------------------------------------------------------- /pyrelational/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | import pyrelational.strategies.classification 2 | import pyrelational.strategies.regression 3 | import pyrelational.strategies.task_agnostic 4 | from pyrelational.strategies.abstract_strategy import Strategy 5 | -------------------------------------------------------------------------------- /pyrelational/strategies/classification/__init__.py: -------------------------------------------------------------------------------- 1 | from pyrelational.strategies.classification.classification_strategy import ( 2 | ClassificationStrategy, 3 | ) 4 | from pyrelational.strategies.classification.entropy_classification_strategy import ( 5 | EntropyClassificationStrategy, 6 | ) 7 | from pyrelational.strategies.classification.least_confidence_strategy import ( 8 | LeastConfidenceStrategy, 9 | ) 10 | from pyrelational.strategies.classification.marginal_confidence_strategy import ( 11 | MarginalConfidenceStrategy, 12 | ) 13 | from pyrelational.strategies.classification.ratio_confidence_strategy import ( 14 | RatioConfidenceStrategy, 15 | ) 16 | -------------------------------------------------------------------------------- /pyrelational/strategies/classification/classification_strategy.py: -------------------------------------------------------------------------------- 1 | """ClassificationStrategy class for active learning in classification tasks.""" 2 | 3 | import math 4 | from typing import Any, List 5 | 6 | import torch 7 | from torch import Tensor 8 | 9 | from pyrelational.data_managers import DataManager 10 | from pyrelational.model_managers import ModelManager 11 | from pyrelational.strategies.abstract_strategy import Strategy 12 | 13 | 14 | class ClassificationStrategy(Strategy): 15 | """A base active learning strategy class for classification.""" 16 | 17 | def __call__( 18 | self, num_annotate: int, data_manager: DataManager, model_manager: ModelManager[Any, Any] 19 | ) -> List[int]: 20 | """ 21 | Identify samples for labelling based on user-defined scoring and sampling function.
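A minimal usage sketch (``dm`` and ``mm`` below stand for assumed, pre-built data and model managers; the returned indices will vary with training):
    >>> strategy = EntropyClassificationStrategy()
    >>> strategy(num_annotate=5, data_manager=dm, model_manager=mm)  # doctest: +SKIP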
22 | 23 | :param num_annotate: number of samples to annotate 24 | :param data_manager: A pyrelational data manager 25 | which keeps track of what has been labelled and creates data loaders for 26 | active learning 27 | :param model_manager: A pyrelational model manager 28 | which wraps a user-defined ML model to handle instantiation, training, testing, 29 | as well as uncertainty quantification 30 | 31 | :return: list of indices to annotate 32 | """ 33 | output = self.train_and_infer(data_manager=data_manager, model_manager=model_manager).mean(0) 34 | if not torch.allclose(output.sum(1), torch.tensor(1.0)): 35 | output = softmax(output) 36 | uncertainty = self.scorer(output) 37 | return self.sampler(uncertainty, data_manager.u_indices, num_annotate) 38 | 39 | 40 | def softmax(scores: Tensor, base: float = math.e, axis: int = -1) -> Tensor: 41 | """Return the softmax of a tensor of scores. 42 | 43 | Converts a set of raw scores from a model (logits) into a 44 | probability distribution via softmax. 45 | 46 | The probability distribution will be a set of real numbers 47 | each in the range [0, 1] and summing to 1.0. 48 | 49 | Assumes the input is a pytorch tensor, e.g. tensor([1.0, 4.0, 2.0, 3.0]). 50 | 51 | :param scores: a pytorch tensor of any positive/negative real numbers. 52 | :param base: the base for the exponential (default e) 53 | :param axis: axis along which to apply the softmax 54 | 55 | :return: tensor of softmaxed scores 56 | """ 57 | exps = base ** scores.float() # exponential for each value in array 58 | sum_exps = torch.sum(exps, dim=axis, keepdim=True) # sum of all exponentials 59 | prob_dist: Tensor = exps / sum_exps # normalize exponentials 60 | return prob_dist 61 | -------------------------------------------------------------------------------- /pyrelational/strategies/classification/entropy_classification_strategy.py: -------------------------------------------------------------------------------- 1 | """Active learning using entropy-based confidence uncertainty measure. 2 | 3 | The score is computed over the classes of the posterior predictive distribution to 4 | choose which observations to propose to the oracle. 5 | """ 6 | 7 | from pyrelational.batch_mode_samplers import TopKSampler 8 | from pyrelational.informativeness import Entropy 9 | from pyrelational.strategies.classification.classification_strategy import ( 10 | ClassificationStrategy, 11 | ) 12 | 13 | 14 | class EntropyClassificationStrategy(ClassificationStrategy): 15 | """Implements Entropy Classification Strategy.""" 16 | 17 | def __init__(self, axis: int = -1): 18 | """Initialise the strategy with entropy scorer and deterministic sampler.""" 19 | super().__init__(Entropy(axis=axis), TopKSampler()) 20 | -------------------------------------------------------------------------------- /pyrelational/strategies/classification/least_confidence_strategy.py: -------------------------------------------------------------------------------- 1 | """Active learning using least confidence uncertainty measure.""" 2 | 3 | from pyrelational.batch_mode_samplers import TopKSampler 4 | from pyrelational.informativeness import LeastConfidence 5 | from pyrelational.strategies.classification.classification_strategy import ( 6 | ClassificationStrategy, 7 | ) 8 | 9 | 10 | class LeastConfidenceStrategy(ClassificationStrategy): 11 | """Implements Least Confidence Strategy. 12 | 13 | Unlabelled samples are scored and queried with the least-confidence classification scorer.
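Example (illustrative; ``dm`` and ``mm`` are assumed, pre-built data and model managers):
    >>> strategy = LeastConfidenceStrategy()
    >>> strategy.suggest(num_annotate=10, data_manager=dm, model_manager=mm)  # doctest: +SKIP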
14 | """ 15 | 16 | def __init__(self, axis: int = -1): 17 | """Initialize the strategy with the least confidence scorer and a deterministic scorer for classification.""" 18 | super().__init__(LeastConfidence(axis=axis), TopKSampler()) 19 | -------------------------------------------------------------------------------- /pyrelational/strategies/classification/marginal_confidence_strategy.py: -------------------------------------------------------------------------------- 1 | """Active learning using marginal confidence uncertainty measure.""" 2 | 3 | from pyrelational.batch_mode_samplers import TopKSampler 4 | from pyrelational.informativeness import MarginConfidence 5 | from pyrelational.strategies.classification.classification_strategy import ( 6 | ClassificationStrategy, 7 | ) 8 | 9 | 10 | class MarginalConfidenceStrategy(ClassificationStrategy): 11 | """Implements Marginal Confidence Strategy. 12 | 13 | Unlabelled samples are scored and queried based on the marginal confidence for classification scorer. 14 | """ 15 | 16 | def __init__(self, axis: int = -1): 17 | """Initialize the strategy with the marginal confidence scorer and a deterministic scorer for classification.""" 18 | super().__init__(MarginConfidence(axis=axis), TopKSampler()) 19 | -------------------------------------------------------------------------------- /pyrelational/strategies/classification/ratio_confidence_strategy.py: -------------------------------------------------------------------------------- 1 | """Active learning using ratio based confidence uncertainty measure.""" 2 | 3 | from pyrelational.batch_mode_samplers import TopKSampler 4 | from pyrelational.informativeness import RatioConfidence 5 | from pyrelational.strategies.classification.classification_strategy import ( 6 | ClassificationStrategy, 7 | ) 8 | 9 | 10 | class RatioConfidenceStrategy(ClassificationStrategy): 11 | """Implements Ratio Confidence Strategy. 12 | 13 | Unlabelled samples are scored and queried based on the ratio confidence for classification scorer. 
14 | """ 15 | 16 | def __init__(self, axis: int = -1): 17 | """Initialize the strategy with the ratio confidence scorer and a deterministic scorer for classification.""" 18 | super().__init__(RatioConfidence(axis=axis), TopKSampler()) 19 | -------------------------------------------------------------------------------- /pyrelational/strategies/regression/__init__.py: -------------------------------------------------------------------------------- 1 | """Regression strategies for active learning.""" 2 | 3 | from pyrelational.strategies.regression.bald_strategy import ( 4 | BALDStrategy, 5 | SoftBALDStrategy, 6 | ) 7 | from pyrelational.strategies.regression.expected_improvement_strategy import ( 8 | ExpectedImprovementStrategy, 9 | ) 10 | from pyrelational.strategies.regression.greedy_strategy import GreedyStrategy 11 | from pyrelational.strategies.regression.regression_strategy import RegressionStrategy 12 | from pyrelational.strategies.regression.thompson_sampling_strategy import ( 13 | ThompsonSamplingStrategy, 14 | ) 15 | from pyrelational.strategies.regression.upper_confidence_bound_strategy import ( 16 | UpperConfidenceBoundStrategy, 17 | ) 18 | from pyrelational.strategies.regression.variance_reduction_strategy import ( 19 | VarianceReductionStrategy, 20 | ) 21 | -------------------------------------------------------------------------------- /pyrelational/strategies/regression/bald_strategy.py: -------------------------------------------------------------------------------- 1 | """BALD Strategy for regression tasks.""" 2 | 3 | from pyrelational.batch_mode_samplers import ProbabilisticSampler, TopKSampler 4 | from pyrelational.informativeness import RegressionBald 5 | from pyrelational.strategies.regression.regression_strategy import RegressionStrategy 6 | 7 | 8 | class BALDStrategy(RegressionStrategy): 9 | """Implements BALD Strategy. 10 | 11 | Samples are queried based on mutual information score based on multiple estimator models. 12 | """ 13 | 14 | def __init__(self, axis: int = 0): 15 | """Initialise the strategy with bald scorer and deterministic sampler.""" 16 | super().__init__(RegressionBald(axis=axis), TopKSampler()) 17 | 18 | 19 | class SoftBALDStrategy(RegressionStrategy): 20 | """Implements soft BALD Strategy. 21 | 22 | Unlabelled samples are queried based on mutual information score based on 23 | multiple estimator models. In contrast to Bald the query is drawn from unlabelled pool based on probabilities 24 | derived from bald scores instead of using an argmax operation. 25 | """ 26 | 27 | def __init__(self, axis: int = 0): 28 | """Initialise the strategy with bald scorer and probabilistic sampler.""" 29 | super().__init__(RegressionBald(axis=axis), ProbabilisticSampler()) 30 | -------------------------------------------------------------------------------- /pyrelational/strategies/regression/expected_improvement_strategy.py: -------------------------------------------------------------------------------- 1 | """Implement Expected Improvement Strategy for regression tasks.""" 2 | 3 | from typing import Any, List 4 | 5 | import torch 6 | 7 | from pyrelational.batch_mode_samplers import TopKSampler 8 | from pyrelational.data_managers import DataManager 9 | from pyrelational.informativeness import ExpectedImprovement 10 | from pyrelational.model_managers import ModelManager 11 | from pyrelational.strategies.abstract_strategy import Strategy 12 | 13 | 14 | class ExpectedImprovementStrategy(Strategy): 15 | """Implement Expected Improvement Strategy. 
16 | 17 | Unlabelled samples are scored with the expected improvement scoring function. 18 | """ 19 | 20 | scorer: ExpectedImprovement 21 | 22 | def __init__(self, xi: float = 0.01, axis: int = 0) -> None: 23 | """Initialize the strategy with the expected improvement scorer and a deterministic sampler for regression.""" 24 | super().__init__(ExpectedImprovement(xi=xi, axis=axis), TopKSampler()) 25 | 26 | def __call__( 27 | self, num_annotate: int, data_manager: DataManager, model_manager: ModelManager[Any, Any] 28 | ) -> List[int]: 29 | """ 30 | Identify samples which need to be labelled. 31 | 32 | :param num_annotate: number of samples to annotate 33 | :param data_manager: A pyrelational data manager 34 | which keeps track of what has been labelled and creates data loaders for 35 | active learning 36 | :param model_manager: A pyrelational model manager 37 | which wraps a user-defined ML model to handle instantiation, training, testing, 38 | as well as uncertainty quantification 39 | 40 | :return: list of indices to annotate 41 | """ 42 | output = self.train_and_infer(data_manager=data_manager, model_manager=model_manager) 43 | max_label = torch.max(data_manager.get_sample_labels(data_manager.l_indices)) 44 | uncertainty = self.scorer(output, max_label=max_label) 45 | return self.sampler(uncertainty, data_manager.u_indices, num_annotate) 46 | -------------------------------------------------------------------------------- /pyrelational/strategies/regression/greedy_strategy.py: -------------------------------------------------------------------------------- 1 | """Greedy Strategy Module.""" 2 | 3 | from pyrelational.batch_mode_samplers import TopKSampler 4 | from pyrelational.informativeness import AverageScorer 5 | from pyrelational.strategies.regression.regression_strategy import RegressionStrategy 6 | 7 | 8 | class GreedyStrategy(RegressionStrategy): 9 | """Implements Greedy Strategy. 10 | 11 | Unlabelled samples are queried based on their predicted mean value by the model, 12 | i.e. samples with the highest predicted mean values are queried. 13 | """ 14 | 15 | def __init__(self, axis: int = 0): 16 | """Initialize the strategy with the mean prediction scorer and a deterministic sampler for regression.""" 17 | super().__init__(AverageScorer(axis=axis), TopKSampler()) 18 | -------------------------------------------------------------------------------- /pyrelational/strategies/regression/regression_strategy.py: -------------------------------------------------------------------------------- 1 | """Regression strategy class implementing __call__ logic.""" 2 | 3 | from typing import Any, List 4 | 5 | from pyrelational.data_managers import DataManager 6 | from pyrelational.model_managers import ModelManager 7 | from pyrelational.strategies.abstract_strategy import Strategy 8 | 9 | 10 | class RegressionStrategy(Strategy): 11 | """A base active learning strategy class for regression.""" 12 | 13 | def __call__( 14 | self, num_annotate: int, data_manager: DataManager, model_manager: ModelManager[Any, Any] 15 | ) -> List[int]: 16 | """ 17 | Identify samples for labelling based on user-defined scoring and sampling function.
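A minimal usage sketch (``dm`` and ``mm`` are assumed, pre-built data and model managers):
    >>> strategy = GreedyStrategy()
    >>> strategy(num_annotate=5, data_manager=dm, model_manager=mm)  # doctest: +SKIP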
18 | 19 | :param num_annotate: number of samples to annotate 20 | :param data_manager: A pyrelational data manager 21 | which keeps track of what has been labelled and creates data loaders for 22 | active learning 23 | :param model_manager: A pyrelational model manager 24 | which wraps a user-defined ML model to handle instantiation, training, testing, 25 | as well as uncertainty quantification 26 | 27 | :return: list of indices to annotate 28 | """ 29 | output = self.train_and_infer(data_manager=data_manager, model_manager=model_manager) 30 | scores = self.scorer(output) 31 | if scores.shape[0] != 1: 32 | scores = scores.squeeze(-1) 33 | return self.sampler(scores, data_manager.u_indices, num_annotate) 34 | -------------------------------------------------------------------------------- /pyrelational/strategies/regression/thompson_sampling_strategy.py: -------------------------------------------------------------------------------- 1 | """Thompson Sampling Strategy for Regression.""" 2 | 3 | from pyrelational.batch_mode_samplers import TopKSampler 4 | from pyrelational.informativeness import ThompsonSampling 5 | from pyrelational.strategies.regression.regression_strategy import RegressionStrategy 6 | 7 | 8 | class ThompsonSamplingStrategy(RegressionStrategy): 9 | """Implements Thompson Sampling Strategy. 10 | 11 | Unlabelled samples are scored and queried with the Thompson sampling scorer. 12 | """ 13 | 14 | def __init__(self, axis: int = 0): 15 | """Initialize the strategy with the Thompson sampling scorer and a deterministic sampler for regression.""" 16 | super().__init__(ThompsonSampling(axis=axis), TopKSampler()) 17 | -------------------------------------------------------------------------------- /pyrelational/strategies/regression/upper_confidence_bound_strategy.py: -------------------------------------------------------------------------------- 1 | """Upper Confidence Bound Strategy.""" 2 | 3 | from pyrelational.batch_mode_samplers import TopKSampler 4 | from pyrelational.informativeness import UpperConfidenceBound 5 | from pyrelational.strategies.regression.regression_strategy import RegressionStrategy 6 | 7 | 8 | class UpperConfidenceBoundStrategy(RegressionStrategy): 9 | """Implements Upper Confidence Bound Strategy. 10 | 11 | Unlabelled samples are scored and queried based on the UCB scorer. 12 | """ 13 | 14 | def __init__(self, kappa: float = 1.0, axis: int = 0): 15 | """Initialize the strategy with the UCB scorer and a deterministic sampler for regression. 16 | 17 | :param kappa: trade-off parameter between exploitation and exploration 18 | """ 19 | super().__init__(UpperConfidenceBound(kappa=kappa, axis=axis), TopKSampler()) 20 | -------------------------------------------------------------------------------- /pyrelational/strategies/regression/variance_reduction_strategy.py: -------------------------------------------------------------------------------- 1 | """Variance Reduction Strategy for regression tasks.""" 2 | 3 | from pyrelational.batch_mode_samplers import TopKSampler 4 | from pyrelational.informativeness import StandardDeviation 5 | from pyrelational.strategies.regression.regression_strategy import RegressionStrategy 6 | 7 | 8 | class VarianceReductionStrategy(RegressionStrategy): 9 | """Implements Variance Reduction Strategy. 10 | 11 | Unlabelled samples are queried based on their predicted variance by the model.
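Example (illustrative; the random tensor stands in for stacked stochastic predictions of shape (n_estimators, n_samples, 1)):
    >>> scorer = StandardDeviation(axis=0)
    >>> scores = scorer(torch.randn(10, 100, 1))  # per-sample spread over the 10 estimators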
12 | """ 13 | 14 | def __init__(self, axis: int = 0): 15 | """Initialize the strategy with the least confidence scorer and a deterministic scorer for regression.""" 16 | super().__init__(StandardDeviation(axis=axis), TopKSampler()) 17 | -------------------------------------------------------------------------------- /pyrelational/strategies/task_agnostic/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from pyrelational.strategies.task_agnostic.random_acquisition_strategy import ( 3 | RandomAcquisitionStrategy, 4 | ) 5 | from pyrelational.strategies.task_agnostic.relative_distance_strategy import ( 6 | RelativeDistanceStrategy, 7 | ) 8 | from pyrelational.strategies.task_agnostic.representative_sampling_strategy import ( 9 | RepresentativeSamplingStrategy, 10 | ) 11 | -------------------------------------------------------------------------------- /pyrelational/strategies/task_agnostic/random_acquisition_strategy.py: -------------------------------------------------------------------------------- 1 | """Defines and implements a random acquisition active learning strategy.""" 2 | 3 | from typing import List 4 | 5 | import numpy as np 6 | 7 | from pyrelational.data_managers import DataManager 8 | from pyrelational.strategies.abstract_strategy import Strategy 9 | 10 | 11 | class RandomAcquisitionStrategy(Strategy): 12 | """Implements RandomAcquisition whereby random samples from unlabelled set are chosen at each step.""" 13 | 14 | def __init__(self) -> None: 15 | """Override init method to do nothing. This strategy does not require any initialization.""" 16 | pass 17 | 18 | def __call__(self, num_annotate: int, data_manager: DataManager) -> List[int]: 19 | """ 20 | Identify samples for labelling based on random sampling. 21 | 22 | :param num_annotate: number of samples to annotate 23 | :param data_manager: A pyrelational data manager 24 | which keeps track of what has been labelled and creates data loaders for 25 | active learning 26 | 27 | :return: list of indices to annotate 28 | """ 29 | num_annotate = min(num_annotate, len(data_manager.u_indices)) 30 | ret: List[int] = np.random.choice(data_manager.u_indices, size=num_annotate, replace=False).tolist() 31 | return ret 32 | -------------------------------------------------------------------------------- /pyrelational/strategies/task_agnostic/relative_distance_strategy.py: -------------------------------------------------------------------------------- 1 | """Relative distance based active learning strategy.""" 2 | 3 | from typing import List 4 | 5 | from pyrelational.batch_mode_samplers import TopKSampler 6 | from pyrelational.data_managers import DataManager 7 | from pyrelational.informativeness import RelativeDistanceScorer 8 | from pyrelational.strategies.abstract_strategy import Strategy 9 | 10 | 11 | class RelativeDistanceStrategy(Strategy): 12 | """Diversity sampling based active learning strategy.""" 13 | 14 | scorer: RelativeDistanceScorer 15 | 16 | def __init__(self, metric: str = "euclidean"): 17 | """Initialise the strategy with a distance metric. 18 | 19 | :param metric: Name of distance metric to use. This should be supported by scikit-learn 20 | pairwise_distances function. 21 | """ 22 | self.metric = metric 23 | super().__init__(RelativeDistanceScorer(metric=metric), TopKSampler()) 24 | 25 | def __call__(self, num_annotate: int, data_manager: DataManager) -> List[int]: 26 | """Identify samples which need to be labelled. 
27 | 28 | :param num_annotate: number of samples to annotate 29 | :param data_manager: A pyrelational data manager 30 | which keeps track of what has been labelled and creates data loaders for 31 | active learning 32 | 33 | :return: list of indices to annotate 34 | """ 35 | scores = self.scorer(data_manager.get_unlabelled_loader(), data_manager.get_labelled_loader()) 36 | return self.sampler(scores, data_manager.u_indices, num_annotate) 37 | -------------------------------------------------------------------------------- /pyrelational/types.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from typing import Sized, TypeVar 3 | 4 | from torch.utils.data import Dataset 5 | 6 | T = TypeVar("T") 7 | 8 | 9 | class SizedDataset(Dataset[T], Sized, ABC): ... 10 | -------------------------------------------------------------------------------- /pyrelational/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.0.2" 2 | -------------------------------------------------------------------------------- /requirements/base_requirements.txt: -------------------------------------------------------------------------------- 1 | lightning>=1.8.6, <2.5.0 2 | numpy<2.0.0 3 | openpyxl>=3.0.9 4 | pandas>=1.3.0, <2.2.3 5 | pyreadr>=0.4.4 6 | requests==2.32.3 7 | scikit-learn>=1.5.1, <1.5.2 8 | tabulate>=0.7.0, <=0.9.0 9 | torch>=1.9.0, <2.3.0 10 | torchvision>=0.10.0, <0.18.0 11 | xlrd>=2.0.1 12 | rdkit==2023.9.5 13 | -------------------------------------------------------------------------------- /requirements/dev_requirements.txt: -------------------------------------------------------------------------------- 1 | black==24.4.2 2 | flake8==7.1.1 3 | flake8-bugbear==24.4.26 4 | isort==5.13.2 5 | mypy==1.11.2 6 | parameterized==0.9.0 7 | pre-commit==3.8.0 8 | protobuf>=3.19.0 9 | pytest==8.2.2 10 | pytest-cov 11 | setuptools>=59.5.0 12 | -r base_requirements.txt 13 | -------------------------------------------------------------------------------- /requirements/doc_requirements.txt: -------------------------------------------------------------------------------- 1 | -r dev_requirements.txt 2 | sphinx 3 | sphinx_rtd_theme -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | from setuptools import find_packages, setup 4 | 5 | """ 6 | pip install -e . 
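For a development install (illustrative; the pinned tool versions live in requirements/dev_requirements.txt):

    pip install -r requirements/dev_requirements.txt
    pip install -e .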
7 | """ 8 | 9 | setup_requires = ["pytest-runner"] 10 | tests_require = ["pytest", "pytest-cov", "mock"] 11 | 12 | with open("requirements/base_requirements.txt", "r") as req: 13 | install_requires = [line.strip() for line in req if line.strip()] 14 | 15 | with open("README.md", "r", encoding="utf-8") as fh: 16 | long_description = fh.read() 17 | 18 | version: Dict[str, str] = {} 19 | with open("pyrelational/version.py") as fp: 20 | exec(fp.read(), version) 21 | 22 | setup( 23 | name="pyrelational", 24 | description="Python tool box for quickly implementing active learning strategies", 25 | author="Relation Therapeutics", 26 | author_email="software@relationrx.com", 27 | long_description=long_description, 28 | long_description_content_type="text/markdown", 29 | url="https://github.com/RelationRx/pyrelational", 30 | packages=find_packages(), 31 | version=version["__version__"], 32 | setup_requires=setup_requires, 33 | tests_require=tests_require, 34 | install_requires=install_requires, 35 | python_requires=">=3.9", 36 | ) 37 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RelationRx/pyrelational/fb83cff64241344396617d3b7712f69c852133b4/tests/__init__.py -------------------------------------------------------------------------------- /tests/data_managers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RelationRx/pyrelational/fb83cff64241344396617d3b7712f69c852133b4/tests/data_managers/__init__.py -------------------------------------------------------------------------------- /tests/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RelationRx/pyrelational/fb83cff64241344396617d3b7712f69c852133b4/tests/datasets/__init__.py -------------------------------------------------------------------------------- /tests/datasets/test_benchmark_datamanager.py: -------------------------------------------------------------------------------- 1 | """Unit tests for benchmark datamanager functions 2 | """ 3 | 4 | from unittest import TestCase 5 | 6 | from pyrelational.datasets import BreastCancerDataset, DiabetesDataset 7 | from pyrelational.datasets.benchmark_datamanager import ( 8 | create_classification_cold_start, 9 | create_regression_cold_start, 10 | create_warm_start, 11 | ) 12 | 13 | 14 | class TestBenchmarkDataManager(TestCase): 15 | """Class containing unit tests for benchmark datamanager creation.""" 16 | 17 | def test_create_warm_start_classification(self) -> None: 18 | """Check shape correctness of dataset.""" 19 | dataset = BreastCancerDataset() 20 | train_indices = list(dataset.data_splits[0][0]) 21 | test_indices = list(dataset.data_splits[0][1]) 22 | dm = create_warm_start(dataset, train_indices=train_indices, test_indices=test_indices) 23 | self.assertEqual(len(dm), 569) 24 | 25 | def test_create_warm_start_regression(self) -> None: 26 | """Check shape correctness of dataset.""" 27 | dataset = DiabetesDataset() 28 | train_indices = list(dataset.data_splits[0][0]) 29 | test_indices = list(dataset.data_splits[0][1]) 30 | dm = create_warm_start(dataset, train_indices=train_indices, test_indices=test_indices) 31 | self.assertEqual(len(dm), 442) 32 | 33 | def test_create_classification_cold_start(self) -> None: 34 | """Check shape correctness of dataset.""" 35 | 
dataset = BreastCancerDataset() 36 | train_indices = list(dataset.data_splits[0][0]) 37 | test_indices = list(dataset.data_splits[0][1]) 38 | dm = create_classification_cold_start(dataset, train_indices=train_indices, test_indices=test_indices) 39 | self.assertEqual(len(dm), 569) 40 | self.assertEqual(len(dm.l_indices), 2) 41 | 42 | def test_create_regression_cold_start(self) -> None: 43 | """Check shape correctness of dataset.""" 44 | dataset = DiabetesDataset() 45 | train_indices = list(dataset.data_splits[0][0]) 46 | test_indices = list(dataset.data_splits[0][1]) 47 | dm = create_regression_cold_start(dataset, train_indices=train_indices, test_indices=test_indices) 48 | self.assertEqual(len(dm), 442) 49 | self.assertEqual(len(dm.l_indices), 2) 50 | -------------------------------------------------------------------------------- /tests/datasets/test_uci_datasets.py: -------------------------------------------------------------------------------- 1 | """Unit tests for uci dataset downloader 2 | """ 3 | 4 | import os 5 | import shutil 6 | from unittest import TestCase 7 | 8 | from parameterized import parameterized_class 9 | 10 | from pyrelational.datasets import UCIDatasets 11 | 12 | 13 | @parameterized_class([{"data_name": k} for k in UCIDatasets.datasets.keys()]) 14 | class TestUCIBenchmarkDatasets(TestCase): 15 | """Class containing unit tests on UCI benchmark datasets.""" 16 | 17 | def setUp(self) -> None: 18 | """Set up class.""" 19 | self.dataset = UCIDatasets(self.data_name, data_dir="test_data/", n_splits=10) 20 | 21 | def test_number_splits(self) -> None: 22 | """Check number of splits.""" 23 | dataset = UCIDatasets("glass", data_dir="test_data/", n_splits=10) 24 | self.assertEqual(dataset.n_splits, 10) 25 | self.assertEqual(len(dataset.data_splits), 10) 26 | 27 | def test_split_size(self): 28 | """Check size of train and test splits.""" 29 | split = self.dataset.get_split(train=True) 30 | self.assertEqual(len(split), len(self.dataset.data_splits[0][0])) 31 | 32 | split = self.dataset.get_split(train=False) 33 | self.assertEqual(len(split), len(self.dataset.data_splits[0][1])) 34 | 35 | def test_full_split_length(self) -> None: 36 | """Check full split length.""" 37 | split = self.dataset.get_full_split() 38 | self.assertEqual(len(split), len(self.dataset.data)) 39 | 40 | def test_get_simple_data(self) -> None: 41 | """Check size of returned simple dataset.""" 42 | x, y = self.dataset.get_data() 43 | self.assertEqual(len(x), len(self.dataset.data)) 44 | self.assertEqual(len(y), len(self.dataset.data)) 45 | 46 | def tearDown(self) -> None: 47 | """Tear down class.""" 48 | if os.path.exists("test_data/"): 49 | shutil.rmtree("test_data") 50 | -------------------------------------------------------------------------------- /tests/informativeness/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RelationRx/pyrelational/fb83cff64241344396617d3b7712f69c852133b4/tests/informativeness/__init__.py -------------------------------------------------------------------------------- /tests/model_managers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RelationRx/pyrelational/fb83cff64241344396617d3b7712f69c852133b4/tests/model_managers/__init__.py -------------------------------------------------------------------------------- /tests/model_managers/test_ensemble_model_manager.py: 
-------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | import pytest 4 | import torch 5 | 6 | from pyrelational.model_managers.ensemble_model_manager import ( 7 | LightningEnsembleModelManager, 8 | ) 9 | from tests.test_utils import BreastCancerClassifier, get_classification_dataset 10 | 11 | 12 | class TestEnsembleEstimator(TestCase): 13 | """Class containing unit tests for ensemble pyrelational model.""" 14 | 15 | def setUp(self) -> None: 16 | """Set up shared attributes""" 17 | self.num_estimators = 4 18 | self.model = LightningEnsembleModelManager( 19 | BreastCancerClassifier, {}, {"epochs": 1}, n_estimators=self.num_estimators 20 | ) 21 | self.dataset = get_classification_dataset() 22 | self.train_loader = self.dataset.get_train_loader() 23 | self.val_loader = self.dataset.get_validation_loader() 24 | 25 | def test_instantiation(self) -> None: 26 | """Check attributes at instantiation.""" 27 | self.assertEqual(self.model.__class__.__name__, "LightningEnsembleModelManager") 28 | self.assertIsNone(self.model._current_model) 29 | self.assertIsInstance(self.model.trainer_config, dict) 30 | self.assertIsInstance(self.model.model_config, dict) 31 | 32 | def test_fail_on_test_without_train(self) -> None: 33 | """Check error is raised when testing without training first.""" 34 | with pytest.raises(ValueError) as err: 35 | self.model.test(self.val_loader) 36 | self.assertEqual( 37 | str(err.value), "No current model, call 'train(train_loader, valid_loader)' to train the model first" 38 | ) 39 | 40 | def test_prediction(self) -> None: 41 | """Check dimension match with number of estimators or dataset size.""" 42 | self.model.train(self.train_loader) 43 | self.assertEqual(len(self.model._current_model), self.num_estimators) 44 | 45 | prediction = self.model(self.val_loader) 46 | self.assertEqual(prediction.size(0), self.num_estimators) 47 | self.assertEqual(prediction.size(1), len(self.dataset.validation_indices)) 48 | self.assertIsInstance(prediction, torch.Tensor) 49 | self.assertIsInstance(self.model.test(self.val_loader), dict) 50 | -------------------------------------------------------------------------------- /tests/model_managers/test_model_managers.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | from unittest import TestCase 3 | 4 | import torch 5 | from torch.utils.data import DataLoader 6 | 7 | from pyrelational.model_managers import ( 8 | LightningEnsembleModelManager, 9 | LightningMCDropoutModelManager, 10 | LightningModelManager, 11 | ) 12 | from tests.test_utils import DiabetesDataset, DiabetesRegressionModel 13 | 14 | 15 | class TestModelManager(TestCase): 16 | """Class containing unit tests for pyrelational models.""" 17 | 18 | def test_lightning_model(self) -> None: 19 | """ 20 | Check that 21 | 1) model is stored after training 22 | 2) output of test loop is a dictionary 23 | 3) shape of tensor output of __call__ 24 | """ 25 | train_loader, valid_loader, test_loader = get_loaders() 26 | model = LightningModelManager(DiabetesRegressionModel, {}, {"epochs": 3}) 27 | model.train(train_loader, valid_loader) 28 | self.assertIsNotNone(model._current_model) 29 | self.assertIsInstance(model.test(test_loader), dict) 30 | self.assertEqual(model(test_loader).size(0), len(test_loader.dataset)) 31 | 32 | def test_early_stopping_in_trainer_callbacks(self) -> None: 33 | """Check that EarlyStopping is one of the callbacks in a pyrelational 
LightningModelManager.""" 34 | model = LightningModelManager( 35 | DiabetesRegressionModel, {}, {"epochs": 3, "use_early_stopping": True, "patience": 10} 36 | ) 37 | trainer, _ = model.init_trainer() 38 | self.assertTrue(any(["EarlyStopping" in str(cb) for cb in trainer.callbacks])) 39 | 40 | 41 | def get_loaders() -> Tuple[DataLoader, DataLoader, DataLoader]: 42 | """Create train/val/test dataloaders from sklearn diabetes dataset.""" 43 | ds = DiabetesDataset() 44 | train_ds, valid_ds, test_ds = torch.utils.data.random_split(ds, [350, 50, 42]) 45 | 46 | train_loader = DataLoader(train_ds, batch_size=10) 47 | valid_loader = DataLoader(valid_ds, batch_size=10) 48 | test_loader = DataLoader(test_ds, batch_size=10) 49 | return train_loader, valid_loader, test_loader 50 | -------------------------------------------------------------------------------- /tests/oracles/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RelationRx/pyrelational/fb83cff64241344396617d3b7712f69c852133b4/tests/oracles/__init__.py -------------------------------------------------------------------------------- /tests/oracles/test_oracles.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from pyrelational.oracles.benchmark_oracle import BenchmarkOracle 4 | from tests.test_utils import get_classification_dataset 5 | 6 | 7 | class TestOracle(TestCase): 8 | """Class containing unit tests for oracles.""" 9 | 10 | def setUp(self) -> None: 11 | """Set up datamanager.""" 12 | self.datamanager = get_classification_dataset() 13 | 14 | def test_update_annotations(self) -> None: 15 | """Check update_annotations method updates unlabelled and labelled sets.""" 16 | random_u_sindex = self.datamanager.u_indices[0] 17 | len_dm_l = len(self.datamanager.l_indices) 18 | len_dm_u = len(self.datamanager.u_indices) 19 | 20 | BenchmarkOracle.update_annotations(self.datamanager, [random_u_sindex]) 21 | self.assertIn(random_u_sindex, self.datamanager.l_indices) 22 | self.assertGreater(len(self.datamanager.l_indices), len_dm_l) 23 | self.assertGreater(len_dm_u, len(self.datamanager.u_indices)) 24 | 25 | def test_query_target_value(self) -> None: 26 | """Check that query_target_value of the benchmark oracle returns correct values.""" 27 | oracle = BenchmarkOracle() 28 | value = oracle.query_target_value(self.datamanager, 0) 29 | self.assertEqual(value, self.datamanager[0][-1]) 30 | 31 | def test_update_target_value(self) -> None: 32 | """Check update_target_value method updates dataset correctly.""" 33 | BenchmarkOracle.update_target_value(self.datamanager, 0, 42) 34 | self.assertEqual(self.datamanager[0][-1], 42) 35 | 36 | def test_update_target_values(self) -> None: 37 | """Test that the update_target_values method changes all values in the dataset.""" 38 | ixs, vals = [0, 1, 2], [42, 42, 42] 39 | BenchmarkOracle.update_target_values(self.datamanager, ixs, vals) 40 | self.assertEqual([self.datamanager[i][-1] for i in ixs], vals) 41 | -------------------------------------------------------------------------------- /tests/pipeline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RelationRx/pyrelational/fb83cff64241344396617d3b7712f69c852133b4/tests/pipeline/__init__.py -------------------------------------------------------------------------------- /tests/samplers/test_samplers.py:
-------------------------------------------------------------------------------- 1 | """Tests for the samplers module.""" 2 | from unittest import TestCase 3 | 4 | import torch 5 | 6 | from pyrelational.batch_mode_samplers import ProbabilisticSampler, TopKSampler 7 | 8 | 9 | class TestSamplers(TestCase): 10 | """Collection of tests for samplers.""" 11 | 12 | def test_deterministic_sampler(self) -> None: 13 | """Test deterministic sampler.""" 14 | sampler = TopKSampler() 15 | query = sampler(torch.tensor([0.1, 3.0, 2.1]), [1, 2, 3], 1) 16 | self.assertEqual(len(query), 1) 17 | self.assertEqual(query, [2]) 18 | 19 | def test_probabilistic_sampler(self) -> None: 20 | """Test probabilistic sampler.""" 21 | sampler = ProbabilisticSampler() 22 | query = sampler(torch.tensor([0.1, 0.2, 0.7]), [1, 2, 3], 1) 23 | self.assertEqual(len(query), 1) 24 | -------------------------------------------------------------------------------- /tests/strategies/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RelationRx/pyrelational/fb83cff64241344396617d3b7712f69c852133b4/tests/strategies/__init__.py -------------------------------------------------------------------------------- /tests/strategies/_agnostic_strategy_test_cases.py: -------------------------------------------------------------------------------- 1 | from sklearn.cluster import AgglomerativeClustering 2 | 3 | from pyrelational.strategies.task_agnostic import ( 4 | RandomAcquisitionStrategy, 5 | RelativeDistanceStrategy, 6 | RepresentativeSamplingStrategy, 7 | ) 8 | 9 | TASK_AGNOSTIC_TEST_CASES = [ 10 | {"task_type": "regression", "strategy_class": RandomAcquisitionStrategy, "strategy_kwargs": {}}, 11 | {"task_type": "regression", "strategy_class": RelativeDistanceStrategy, "strategy_kwargs": {}}, 12 | { 13 | "task_type": "regression", 14 | "strategy_class": RelativeDistanceStrategy, 15 | "strategy_kwargs": {"metric": "cosine"}, 16 | }, 17 | { 18 | "task_type": "regression", 19 | "strategy_class": RepresentativeSamplingStrategy, 20 | "strategy_kwargs": {"clustering_method": "AffinityPropagation"}, 21 | }, 22 | { 23 | "task_type": "regression", 24 | "strategy_class": RepresentativeSamplingStrategy, 25 | "strategy_kwargs": {"clustering_method": AgglomerativeClustering(n_clusters=10)}, 26 | }, 27 | ] 28 | -------------------------------------------------------------------------------- /tests/strategies/_classification_strategy_test_cases.py: -------------------------------------------------------------------------------- 1 | from pyrelational.strategies.classification import ( 2 | EntropyClassificationStrategy, 3 | LeastConfidenceStrategy, 4 | MarginalConfidenceStrategy, 5 | RatioConfidenceStrategy, 6 | ) 7 | 8 | CLASSIFICATION_TEST_CASES = [ 9 | { 10 | "task_type": "classification", 11 | "strategy_class": EntropyClassificationStrategy, 12 | "strategy_kwargs": {}, 13 | }, 14 | { 15 | "task_type": "classification", 16 | "strategy_class": LeastConfidenceStrategy, 17 | "strategy_kwargs": {}, 18 | }, 19 | { 20 | "task_type": "classification", 21 | "strategy_class": MarginalConfidenceStrategy, 22 | "strategy_kwargs": {}, 23 | }, 24 | { 25 | "task_type": "classification", 26 | "strategy_class": RatioConfidenceStrategy, 27 | "strategy_kwargs": {}, 28 | }, 29 | ] 30 | -------------------------------------------------------------------------------- /tests/strategies/_regression_strategy_test_cases.py: --------------------------------------------------------------------------------
1 | from pyrelational.strategies.regression import ( 2 | BALDStrategy, 3 | ExpectedImprovementStrategy, 4 | GreedyStrategy, 5 | SoftBALDStrategy, 6 | ThompsonSamplingStrategy, 7 | UpperConfidenceBoundStrategy, 8 | VarianceReductionStrategy, 9 | ) 10 | 11 | REGRESSION_TEST_CASES = [ 12 | {"task_type": "regression", "strategy_class": BALDStrategy, "strategy_kwargs": {}}, 13 | {"task_type": "regression", "strategy_class": VarianceReductionStrategy, "strategy_kwargs": {}}, 14 | { 15 | "task_type": "regression", 16 | "strategy_class": ExpectedImprovementStrategy, 17 | "strategy_kwargs": {}, 18 | }, 19 | { 20 | "task_type": "regression", 21 | "strategy_class": GreedyStrategy, 22 | "strategy_kwargs": {}, 23 | }, 24 | { 25 | "task_type": "regression", 26 | "strategy_class": SoftBALDStrategy, 27 | "strategy_kwargs": {}, 28 | }, 29 | { 30 | "task_type": "regression", 31 | "strategy_class": ThompsonSamplingStrategy, 32 | "strategy_kwargs": {}, 33 | }, 34 | { 35 | "task_type": "regression", 36 | "strategy_class": UpperConfidenceBoundStrategy, 37 | "strategy_kwargs": {"kappa": 0.42}, 38 | }, 39 | ] 40 | -------------------------------------------------------------------------------- /tests/strategies/test_strategies.py: -------------------------------------------------------------------------------- 1 | """Unit tests for strategies.""" 2 | from typing import Any, Dict, Type 3 | from unittest import TestCase 4 | 5 | from parameterized import parameterized_class 6 | 7 | from pyrelational.model_managers.mcdropout_model_manager import ( 8 | LightningMCDropoutModelManager, 9 | ) 10 | from pyrelational.strategies import Strategy 11 | from tests.strategies._agnostic_strategy_test_cases import TASK_AGNOSTIC_TEST_CASES 12 | from tests.strategies._classification_strategy_test_cases import ( 13 | CLASSIFICATION_TEST_CASES, 14 | ) 15 | from tests.strategies._regression_strategy_test_cases import REGRESSION_TEST_CASES 16 | from tests.test_utils import ( 17 | BreastCancerClassifier, 18 | DiabetesRegressionModel, 19 | get_classification_dataset, 20 | get_regression_dataset, 21 | ) 22 | 23 | 24 | @parameterized_class(TASK_AGNOSTIC_TEST_CASES + CLASSIFICATION_TEST_CASES + REGRESSION_TEST_CASES) 25 | class TestStrategies(TestCase): 26 | """Class containing unit tests of strategies.""" 27 | 28 | task_type: str 29 | strategy_class: Type[Strategy] 30 | strategy_kwargs: Dict[str, Any] 31 | 32 | def setUp(self) -> None: 33 | """Define model and datamanager.""" 34 | if self.task_type == "regression": 35 | model_class = DiabetesRegressionModel 36 | self.datamanager = get_regression_dataset() 37 | else: 38 | model_class = BreastCancerClassifier 39 | self.datamanager = get_classification_dataset() 40 | self.model_manager = LightningMCDropoutModelManager( 41 | model_class, 42 | {"ensemble_size": 3}, 43 | {"epochs": 5}, 44 | ) 45 | self.strategy = self.strategy_class(**self.strategy_kwargs) 46 | 47 | def test_suggest(self) -> None: 48 | """Test that suggest returns the required number of sample indices.""" 49 | out = self.strategy.suggest(num_annotate=5, model_manager=self.model_manager, data_manager=self.datamanager) 50 | self.assertEqual(len(out), 5) 51 | 52 | def test_str_print(self) -> None: 53 | """Check str returns expected string.""" 54 | self.assertEqual(str(self.strategy), f"Strategy: {self.strategy.__class__.__name__}") 55 | --------------------------------------------------------------------------------