├── .flake8 ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── PULL_REQUEST_TEMPLATE.md ├── dependabot.yml └── workflows │ ├── code_coverage.yaml │ ├── docs.yml │ ├── linter.yml │ ├── pypi_upload.yml │ └── tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── benchmarking ├── README.md ├── airfoil │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── benchmarking_utils.py ├── breastcancer │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── checkerboard2x2 │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── checkerboard4x4 │ ├── data_manager.py │ ├── run.py │ └── visualisation.ipynb ├── classification_experiment_utils.py ├── concrete │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── creditcardfraud │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── diabetes │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── digitdataset │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── energy │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── fashionmnist │ ├── README.md │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── gaussianclouds │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── glass │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── mnist │ ├── data_manager.py │ ├── model.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── parkinsons │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── power │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── regression_experiment_utils.py ├── results_grid_analysis_ray_local.ipynb ├── seeds │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── striatum │ ├── data_manager.py │ ├── run.py │ └── visualisation.ipynb ├── synthclass1 │ ├── data_manager.py │ ├── run.py │ └── visualisation.ipynb ├── synthclass2 │ ├── data_manager.py │ ├── run.py │ └── visualisation.ipynb ├── synthclass3 │ ├── data_manager.py │ ├── run.py │ └── visualisation.ipynb ├── synthreg1 │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── synthreg2 │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── wine │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb └── yacht │ ├── data_manager.py │ ├── results.csv │ ├── run.py │ └── visualisation.ipynb ├── default_configs └── pyl_trainer_base_config.json ├── docs ├── Makefile ├── images │ └── pyrelational_overview.png ├── make.bat └── source │ ├── _static │ ├── data_indices_diagram.png │ └── theme.css │ ├── conf.py │ ├── index.rst │ ├── notes │ ├── activelearning.rst │ ├── al_pipeline.png │ ├── benchmark_datasets.rst │ ├── eval.png │ ├── installation.rst │ ├── performance_comparison.png │ ├── quick_start.rst │ ├── training.png │ ├── using_the_model_api.rst │ ├── using_your_own_data.rst │ └── using_your_own_strategy.rst │ └── reference │ ├── data.rst │ ├── datasets.rst │ ├── informativeness.rst │ ├── models.rst │ ├── oracles.rst │ ├── pipeline.rst │ └── strategies.rst ├── examples ├── demo │ ├── ensemble_uncertainty_classification.py │ ├── lightning_diversity_classification.py │ ├── 
lightning_diversity_regression.py │ ├── lightning_mixed_regression.py │ ├── lightning_representative_classification.py │ ├── mcdropout_uncertainty_classification.py │ ├── mcdropout_uncertainty_regression.py │ ├── model_badge.py │ ├── model_gaussianprocesses.py │ ├── scikit_estimator.py │ └── utils │ │ ├── datasets.py │ │ └── ml_models.py └── notebooks │ ├── gaussian_processes.ipynb │ └── introduction.ipynb ├── pyproject.toml ├── pyrelational ├── __init__.py ├── batch_mode_samplers │ ├── __init__.py │ └── _batch_mode_samplers.py ├── data_managers │ ├── __init__.py │ └── data_manager.py ├── datasets │ ├── __init__.py │ ├── base.py │ ├── benchmark_datamanager.py │ ├── classification │ │ ├── __init__.py │ │ ├── andrea_et_al.py │ │ ├── fashion_mnist.py │ │ ├── ksenia_et_al.py │ │ ├── mnist.py │ │ ├── scikit_learn.py │ │ ├── synthetic.py │ │ ├── uci.py │ │ └── utils.py │ ├── download_utils.py │ ├── drugcomb.py │ ├── regression │ │ ├── __init__.py │ │ ├── scikit_learn.py │ │ ├── synthetic.py │ │ ├── uci.py │ │ └── utils.py │ └── uci_datasets.py ├── informativeness │ ├── __init__.py │ ├── abstract_scorers.py │ ├── classification_scorers.py │ ├── decorators.py │ ├── regression_scorers.py │ └── task_agnostic_scorers.py ├── model_managers │ ├── __init__.py │ ├── abstract_model_manager.py │ ├── ensemble_model_manager.py │ ├── lightning_model_manager.py │ ├── mcdropout_model_manager.py │ └── model_utils.py ├── oracles │ ├── __init__.py │ ├── abstract_oracle.py │ └── benchmark_oracle.py ├── pipeline │ ├── __init__.py │ └── pipeline.py ├── strategies │ ├── __init__.py │ ├── abstract_strategy.py │ ├── classification │ │ ├── __init__.py │ │ ├── classification_strategy.py │ │ ├── entropy_classification_strategy.py │ │ ├── least_confidence_strategy.py │ │ ├── marginal_confidence_strategy.py │ │ └── ratio_confidence_strategy.py │ ├── regression │ │ ├── __init__.py │ │ ├── bald_strategy.py │ │ ├── expected_improvement_strategy.py │ │ ├── greedy_strategy.py │ │ ├── regression_strategy.py │ │ ├── thompson_sampling_strategy.py │ │ ├── upper_confidence_bound_strategy.py │ │ └── variance_reduction_strategy.py │ └── task_agnostic │ │ ├── __init__.py │ │ ├── random_acquisition_strategy.py │ │ ├── relative_distance_strategy.py │ │ └── representative_sampling_strategy.py ├── types.py └── version.py ├── requirements ├── base_requirements.txt ├── dev_requirements.txt └── doc_requirements.txt ├── setup.py └── tests ├── __init__.py ├── data_managers ├── __init__.py └── test_data_manager.py ├── datasets ├── __init__.py ├── test_benchmark_datamanager.py ├── test_classification_datasets.py ├── test_regression_datasets.py └── test_uci_datasets.py ├── informativeness ├── __init__.py └── test_informativeness_scores.py ├── model_managers ├── __init__.py ├── test_ensemble_model_manager.py ├── test_mc_dropout_model_manager.py └── test_model_managers.py ├── oracles ├── __init__.py └── test_oracles.py ├── pipeline ├── __init__.py └── test_pipeline.py ├── samplers └── test_samplers.py ├── strategies ├── __init__.py ├── _agnostic_strategy_test_cases.py ├── _classification_strategy_test_cases.py ├── _regression_strategy_test_cases.py └── test_strategies.py └── test_utils.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | select = C,E,F,W,B,B950,C950,E950,F950,W950,D,D950,I,I950,N,N950,O,O950,R,R950,S,S950,T,T950,U,U950,V,V950,X,X950,Z,Z950 4 | extend-ignore = D100 D104 E203 E701 W503 F401 5 | 
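For reference, a minimal sketch of running the same checks locally that the linter workflow runs in CI (assuming the dev requirements from `requirements/dev_requirements.txt` are installed); flake8 picks up this `.flake8` configuration automatically from the repository root:

```bash
# Lint with the repository's .flake8 settings (120-character lines, extended ignore list)
flake8 --exclude env
# Verify formatting without rewriting files, as in the CI linter workflow
black . --exclude env --check
```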
-------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "[BUG]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behaviour: 15 | 1. Operating system: 16 | 2. Python version: 17 | 3. Output of `pip freeze`: 18 | 4. Run config and sample file: 19 | 20 | **Expected behaviour** 21 | A clear and concise description of what you expected to happen. 22 | 23 | 24 | **Additional context** 25 | Add any other context about the problem here. 26 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "[Feature]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | **For the title of this PR:** please follow the grammatical rules of a usual publication title, without capitalisation (except for the first letter). 2 | 3 | The title should NOT CONTAIN CODE: no dots, no parentheses, no backticks, no brackets, etc. It needs to be distinctive (not detailed) and succinct (not lengthy). 4 | Details of this PR will go in the description. **For the description of this PR:** please replace every line in curly brackets ( { like this } ) with an appropriate description following the guidance. 5 | 6 | Finally, **please remove this paragraph**. 7 | 8 | ## What is the goal of this PR? 9 | 10 | { In the form of a paragraph (only use bullet points if strictly necessary), please describe the goal of this PR, why it is valuable to achieve, and reference the related GitHub issues. This section will be automatically compiled into the release notes, so please: 11 | - describe the impact of the change in this PR on the _user_ of this repository (e.g. end user, contributor, developer). 12 | - describe the new product behaviour in _present tense_, and the old behaviour and how it's been changed in _past tense_. 13 | - use the _Royal We_: _"We"_ made changes, not _"I"_ made changes. } 14 | 15 | ## What are the changes implemented in this PR? 16 | 17 | { Please explain what you implemented and why your changes are the best way to achieve the goal(s) above. Please describe every method, class and package, by explaining: 18 | - its responsibility, 19 | - how it's expected to behave, and 20 | - how it relates to the adjacent methods/classes/packages it interacts with.
21 | 22 | This would allow the reviewer to understand your intentions in the code much better. If you're adding new classes, make sure these explanations are also included in the class header comments. Last but not least, please reference the GitHub issues to be automatically closed, such as 'closes #number'. } 23 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # Basic dependabot.yml file with 2 | # minimum configuration for one package manager (pip) 3 | 4 | version: 2 5 | updates: 6 | # Enable version updates for python 7 | - package-ecosystem: "pip" 8 | # Look for requirements files in the `requirements` directory 9 | directory: "requirements/" 10 | # Check for updates once a day 11 | schedule: 12 | interval: "daily" 13 | # Labels on pull requests for version updates only 14 | labels: 15 | - "ci" 16 | pull-request-branch-name: 17 | # Separate sections of the branch name with a hyphen 18 | # for example, `dependabot-npm_and_yarn-next_js-acorn-6.4.1` 19 | separator: "-" 20 | # Allow up to 5 open pull requests for pip dependencies 21 | open-pull-requests-limit: 5 22 | -------------------------------------------------------------------------------- /.github/workflows/code_coverage.yaml: -------------------------------------------------------------------------------- 1 | name: Pytest Coverage 2 | on: 3 | push: 4 | branches: 5 | - main 6 | 7 | jobs: 8 | live-test: 9 | name: Test 10 | runs-on: ubuntu-latest 11 | permissions: write-all 12 | steps: 13 | - name: Checkout code 14 | uses: actions/checkout@v2 15 | - uses: actions/setup-python@v2 16 | with: 17 | python-version: "3.10" 18 | - name: Install basic dependencies 19 | run: | 20 | pip install --upgrade pip==21.3.0 21 | pip install pytest-cov 22 | pip install -r requirements/dev_requirements.txt 23 | - name: Build coverage file 24 | run: | 25 | python -m pytest --junitxml=pytest.xml --cov=pyrelational tests/ | tee pytest-coverage.txt 26 | 27 | - name: Pytest coverage comment 28 | id: coverageComment 29 | uses: MishaKav/pytest-coverage-comment@main 30 | with: 31 | pytest-coverage-path: pytest-coverage.txt 32 | junitxml-path: pytest.xml 33 | 34 | - name: Check the output coverage 35 | run: | 36 | echo "Coverage Percentage - ${{ steps.coverageComment.outputs.coverage }}" 37 | echo "Coverage Color - ${{ steps.coverageComment.outputs.color }}" 38 | echo "Coverage Html - ${{ steps.coverageComment.outputs.coverageHtml }}" 39 | echo "Coverage Warnings - ${{ steps.coverageComment.outputs.warnings }}" 40 | echo "Coverage Errors - ${{ steps.coverageComment.outputs.errors }}" 41 | echo "Coverage Failures - ${{ steps.coverageComment.outputs.failures }}" 42 | echo "Coverage Skipped - ${{ steps.coverageComment.outputs.skipped }}" 43 | echo "Coverage Tests - ${{ steps.coverageComment.outputs.tests }}" 44 | echo "Coverage Time - ${{ steps.coverageComment.outputs.time }}" 45 | echo "Not Success Test Info - ${{ steps.coverageComment.outputs.notSuccessTestInfo }}" 46 | 47 | - name: Create the Badge 48 | uses: schneegans/dynamic-badges-action@v1.0.0 49 | with: 50 | auth: ${{ secrets.GIST_TOKEN }} 51 | gistID: 99eba16a0a4fad7eadf98ef938afe38c 52 | filename: pytest-coverage-comment.json 53 | label: Test Coverage 54 | message: ${{ steps.coverageComment.outputs.coverage }} 55 | color: ${{ steps.coverageComment.outputs.color }} 56 | namedLogo: python 57 |
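A minimal sketch of reproducing the coverage run locally (assuming `pytest-cov` and the dev requirements are installed), mirroring the "Build coverage file" step above:

```bash
# Same invocation as the workflow: JUnit XML for the comment action, coverage measured over pyrelational/
python -m pytest --junitxml=pytest.xml --cov=pyrelational tests/ | tee pytest-coverage.txt
```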
-------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: Documentation 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | 9 | jobs: 10 | 11 | make_html: 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v2 16 | - name: Set up Python 3.10 17 | uses: actions/setup-python@v2 18 | with: 19 | python-version: "3.10" 20 | - name: Install main package 21 | run: | 22 | pip install -e . 23 | - name: Install internal dependencies 24 | run: | 25 | pip install sphinx 26 | pip install sphinx_rtd_theme 27 | - name: Build documentation 28 | run: | 29 | cd docs && make clean && make html 30 | -------------------------------------------------------------------------------- /.github/workflows/linter.yml: -------------------------------------------------------------------------------- 1 | name: lint-task 2 | # Run this workflow every time a new commit is pushed to your repository 3 | on: 4 | pull_request: 5 | 6 | jobs: 7 | 8 | # Set the job key. The key is displayed as the job name 9 | # when a job name is not provided 10 | run-lint-test: 11 | runs-on: ubuntu-latest 12 | name: lint 13 | # Name the Job 14 | steps: 15 | - name: Checkout code 16 | uses: actions/checkout@v2 17 | - uses: actions/setup-python@v2 18 | with: 19 | python-version: "3.10" 20 | - name: Install flake8 and plugins 21 | run: | 22 | pip install --upgrade pip==22.2 23 | python3 -m venv env 24 | source env/bin/activate 25 | pip install -r requirements/dev_requirements.txt 26 | - name: Run linter 27 | run: | 28 | source env/bin/activate 29 | flake8 --exclude env 30 | black . --exclude env --check 2>&1 >/dev/null 31 | - name: Run mypy 32 | run: | 33 | source env/bin/activate 34 | mypy pyrelational/ --allow-redefinition --disable-error-code import --disable-error-code no-untyped-call --disable-error-code no-redef --implicit-reexport --strict --install-types --non-interactive --ignore-missing-imports --follow-imports=silent 35 | - name: clean venv 36 | run: | 37 | rm -r env 38 | -------------------------------------------------------------------------------- /.github/workflows/pypi_upload.yml: -------------------------------------------------------------------------------- 1 | name: Upload to PyPI 2 | 3 | # Controls when the action will run.
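# Note: publishing requires the PYPI_TOKEN repository secret, which is passed to twine as TWINE_PASSWORD below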
4 | on: 5 | # Triggers the workflow when a release is created 6 | release: 7 | types: [created] 8 | 9 | # Allows you to run this workflow manually from the Actions tab 10 | workflow_dispatch: 11 | 12 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 13 | jobs: 14 | # This workflow contains a single job called "upload" 15 | upload: 16 | # The type of runner that the job will run on 17 | runs-on: ubuntu-latest 18 | 19 | # Steps represent a sequence of tasks that will be executed as part of the job 20 | steps: 21 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 22 | - uses: actions/checkout@v2 23 | 24 | # Sets up python 25 | - uses: actions/setup-python@v2 26 | with: 27 | python-version: 3.8 28 | 29 | # Install dependencies 30 | - name: "Installs dependencies" 31 | run: | 32 | python3 -m pip install --upgrade pip 33 | python3 -m pip install setuptools wheel twine 34 | 35 | # Build and upload to PyPI 36 | - name: "Builds and uploads to PyPI" 37 | run: | 38 | python3 setup.py sdist bdist_wheel 39 | python3 -m twine upload dist/* 40 | env: 41 | TWINE_USERNAME: __token__ 42 | TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} 43 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | # Run this workflow every time a new commit is pushed to your repository 4 | on: 5 | pull_request: 6 | 7 | jobs: 8 | 9 | # Set the job key. The key is displayed as the job name 10 | # when a job name is not provided 11 | run-tests: 12 | runs-on: ubuntu-latest 13 | strategy: 14 | matrix: 15 | python-version: ["3.9", "3.10", "3.11", "3.12"] 16 | 17 | name: Tests 18 | # Name the Job 19 | steps: 20 | - name: Checkout code 21 | uses: actions/checkout@v2 22 | - uses: actions/setup-python@v2 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install basic dependencies 26 | run: | 27 | pip install --upgrade pip 28 | pip install -r requirements/dev_requirements.txt 29 | pip install pytest-cov 30 | - name: Test with pytest 31 | run: | 32 | python -m pytest --cache-clear --cov=pyrelational tests > pytest-coverage.txt 33 | - name: Print error 34 | if: failure() 35 | run: | 36 | cat pytest-coverage.txt 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | .idea 3 | .vscode 4 | .DS_Store 5 | 6 | run_experiments.sh 7 | 8 | # Dev files 9 | deprecated/ 10 | examples/demo/experiment_logs/ 11 | experiment_logs/ 12 | test_data/ 13 | 14 | **/ray_benchmark_results/ 15 | 16 | # Checkpoints 17 | checkpoints/ 18 | 19 | # Ignoring MLflow generated files 20 | mlruns 21 | 22 | # Byte-compiled / optimized / DLL files 23 | __pycache__/ 24 | *.py[cod] 25 | *$py.class 26 | 27 | # C extensions 28 | *.so 29 | 30 | # Distribution / packaging 31 | .Python 32 | build/ 33 | develop-eggs/ 34 | dist/ 35 | downloads/ 36 | eggs/ 37 | .eggs/ 38 | lib/ 39 | lib64/ 40 | parts/ 41 | sdist/ 42 | var/ 43 | wheels/ 44 | pip-wheel-metadata/ 45 | share/python-wheels/ 46 | *.egg-info/ 47 | .installed.cfg 48 | *.egg 49 | MANIFEST 50 | 51 | # PyInstaller 52 | # Usually these files are written by a python script from a template 53 | # before PyInstaller builds the exe, so as to inject date/other info into it.
54 | *.manifest 55 | *.spec 56 | 57 | # Installer logs 58 | pip-log.txt 59 | pip-delete-this-directory.txt 60 | 61 | # Unit test / coverage reports 62 | htmlcov/ 63 | .tox/ 64 | .nox/ 65 | .coverage 66 | .coverage.* 67 | .cache 68 | nosetests.xml 69 | coverage.xml 70 | *.cover 71 | *.py,cover 72 | .hypothesis/ 73 | .pytest_cache/ 74 | 75 | # Translations 76 | *.mo 77 | *.pot 78 | 79 | # Django stuff: 80 | *.log 81 | local_settings.py 82 | db.sqlite3 83 | db.sqlite3-journal 84 | 85 | # Flask stuff: 86 | instance/ 87 | .webassets-cache 88 | 89 | # Scrapy stuff: 90 | .scrapy 91 | 92 | # Sphinx documentation 93 | docs/_build/ 94 | docs/build/ 95 | 96 | # PyBuilder 97 | target/ 98 | 99 | # Jupyter Notebook 100 | .ipynb_checkpoints 101 | 102 | # IPython 103 | profile_default/ 104 | ipython_config.py 105 | 106 | # pyenv 107 | .python-version 108 | 109 | # pipenv 110 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 111 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 112 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 113 | # install all needed dependencies. 114 | #Pipfile.lock 115 | 116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # Lightning logs 154 | **/lightning_logs 155 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.1.0 # Keep the version for general utility hooks 4 | hooks: 5 | - id: check-yaml 6 | - id: end-of-file-fixer 7 | exclude: examples/notebooks/ 8 | - id: trailing-whitespace 9 | - repo: local 10 | hooks: 11 | - id: black 12 | name: black 13 | entry: black 14 | language: system 15 | types: [python] 16 | - repo: local 17 | hooks: 18 | - id: flake8 19 | name: flake8 20 | entry: flake8 21 | language: system 22 | types: [python] 23 | files: ^pyrelational 24 | - repo: local 25 | hooks: 26 | - id: isort 27 | name: isort (python) 28 | entry: isort --profile=black 29 | language: system 30 | types: [python] 31 | - repo: local 32 | hooks: 33 | - id: mypy 34 | name: mypy 35 | entry: mypy 36 | language: system 37 | types: [python] 38 | exclude: | 39 | (?x)( 40 | ^tests/ | 41 | ^examples/ | 42 | ^docs/ 43 | ) 44 | args: [ 45 | --strict, 46 | --follow-imports=silent, 47 | --ignore-missing-imports, 48 | --allow-redefinition, 49 | --install-types, 50 | --non-interactive, 51 | --implicit-reexport, 52 | --allow-untyped-calls, 53 | --disable-error-code=no-redef, 54 | ] 55 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See 
https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-20.04 11 | tools: 12 | python: "3.10" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/source/conf.py 17 | 18 | # Install requirements 19 | python: 20 | install: 21 | - requirements: requirements/doc_requirements.txt 22 | -------------------------------------------------------------------------------- /benchmarking/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking overview 2 | 3 | The benchmark results for each dataset are located in `results.csv` within that dataset's subdirectory. These results are generated by a Python script in the same folder, currently named `run.py`. Each run script follows a similar pattern, containing a `trial` function which specifies how the PyRelationAL pipeline is constructed. This is done because we may want different model_managers, oracles, seeds, etc. to be run. The run script also defines the experiment names and the compute resources to be used for the trials. 4 | 5 | For each experiment, the DataManager is defined in a separate script, specifying the train, val, test splits along with the initial labelled and unlabelled indices in the queryable (train) pool. 6 | 7 | In current benchmarks, we leverage [Ray Tune's job scheduling](https://docs.ray.io/en/latest/tune/index.html) to efficiently distribute and manage the running of the jobs across available hardware. Each job creates an individual "trial", with its results and other logs sent to `results.csv`. After this, we can collate and analyze the trials at the trial or whole-experiment level by reading these results files in the `visualisation.ipynb` notebook. 8 | 9 | # TL;DR 10 | 11 | To re-run a benchmark experiment with the same selection of strategies as in the paper, run the following command from the project repository root: 12 | ```bash 13 | python -m benchmarking.<dataset>.run 14 | ``` 15 | 16 | - If adding a strategy to an existing benchmark: adjust `run.py`, then run it with `python -m benchmarking.<dataset>.run`. 17 | - If adding a new dataset: add a `<dataset>` folder under `benchmarking/`, then add the necessary scripts underneath following the examples in the repository. Most important in this case is `data_manager.py`, which will specify the train, validation, test, initial labelled, and initial unlabelled indices for the dataset. 18 | 19 | 20 | # Utilities 21 | We provide some generic benchmarking utils in `benchmarking_utils.py`, along with classification- and regression-specific utilities and model definitions in `classification_experiment_utils.py` and `regression_experiment_utils.py` respectively. 22 | 23 | - `benchmarking_utils.py`: Contains general utilities for processing the outputs of the Ray benchmarks. 24 | - `classification_experiment_utils.py`: Provides utilities and model definitions specific to classification tasks, including data preprocessing, model training, and evaluation functions. It also contains utilities for quickly calling classification specific AL strategies and parameter spaces for the experiments. It may be useful to add to these when trying new strategies. 25 | - `regression_experiment_utils.py`: Offers utilities and model definitions tailored for regression tasks, covering data handling, model training, and performance evaluation.
It also contains utilities for quickly calling regression specific AL strategies and parameter spaces for the experiments. It may be useful to add to these when trying new strategies. 26 | -------------------------------------------------------------------------------- /benchmarking/airfoil/data_manager.py: -------------------------------------------------------------------------------- 1 | """Benchmarking DataManager for the airfoil dataset 2 | """ 3 | 4 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 5 | 6 | import numpy as np 7 | import torch 8 | from numpy.typing import NDArray 9 | 10 | from pyrelational.data_managers import DataManager 11 | from pyrelational.datasets.regression.uci import UCIAirfoil 12 | 13 | 14 | def get_airfoil_data_manager() -> DataManager: 15 | ds = UCIAirfoil() 16 | train_ds, valid_ds, test_ds = torch.utils.data.random_split(ds, [1000, 100, 402]) 17 | train_indices = list(train_ds.indices) 18 | valid_indices = list(valid_ds.indices) 19 | test_indices = list(test_ds.indices) 20 | 21 | return DataManager( 22 | ds, 23 | train_indices=train_indices, 24 | validation_indices=valid_indices, 25 | test_indices=test_indices, 26 | labelled_indices=np.random.choice(train_indices, 1, replace=False).tolist(), 27 | loader_batch_size="full", 28 | loader_collate_fn=numpy_collate, 29 | ) 30 | 31 | 32 | def numpy_collate( 33 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 34 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 35 | """Collate function for a Pytorch to Numpy DataLoader""" 36 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 37 | -------------------------------------------------------------------------------- /benchmarking/airfoil/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.linear_model import ElasticNet 10 | from sklearn.metrics import auc 11 | 12 | from pyrelational.oracles import BenchmarkOracle 13 | from pyrelational.pipeline import Pipeline 14 | 15 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 16 | from ..regression_experiment_utils import ( 17 | GPR, 18 | EnsembleScikit, 19 | experiment_param_space, 20 | get_strategy_from_string, 21 | numpy_collate, 22 | ) 23 | from .data_manager import get_airfoil_data_manager 24 | 25 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 26 | 27 | 28 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[Any, np.float32, np.float64]]]]: 29 | seed = config["seed"] 30 | set_all_seeds(seed) 31 | strategy = get_strategy_from_string(config["strategy"]) 32 | data_manager = get_airfoil_data_manager() 33 | model_config: Dict[str, Any] = {} 34 | trainer_config: Dict[str, Any] = {} 35 | model_manager: GPR = GPR(model_config, trainer_config) 36 | oracle = BenchmarkOracle() 37 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 38 | 39 | # Annotating data step by step until the trainset is fully annotated 40 | pipeline.run(num_annotate=1) 41 | print(pipeline) 42 | 43 | iteration_metrics = [] 44 | for i in range(len(pipeline.performances)): 45 | if "test_metric" in pipeline.performances[i]: 46 | 
iteration_metrics.append(pipeline.performances[i]["test_metric"]) 47 | 48 | iteration_metrics = np.array(iteration_metrics) 49 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 50 | 51 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 52 | 53 | 54 | if __name__ == "__main__": 55 | EXPERIMENT_NAME = "results" 56 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 57 | 58 | trial = tune.with_resources(trial, {"cpu": 2}) 59 | tuner = tune.Tuner( 60 | trial, 61 | tune_config=tune.TuneConfig(num_samples=1), 62 | param_space=experiment_param_space, 63 | run_config=RunConfig( 64 | name=EXPERIMENT_NAME, 65 | storage_path=STORAGE_PATH, 66 | ), 67 | ) 68 | results_grid = tuner.fit() 69 | results_df = process_results_grid(results_grid=results_grid) 70 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 71 | -------------------------------------------------------------------------------- /benchmarking/breastcancer/data_manager.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | 3 | """Benchmarking DataManager for the Breast Cancer dataset 4 | """ 5 | 6 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 7 | 8 | import numpy as np 9 | import torch 10 | from numpy.typing import NDArray 11 | 12 | from pyrelational.data_managers import DataManager 13 | from pyrelational.datasets.classification.scikit_learn import BreastCancerDataset 14 | 15 | from ..classification_experiment_utils import ( 16 | make_class_stratified_train_val_test_split, 17 | pick_one_sample_per_class, 18 | ) 19 | 20 | 21 | def get_breastcancer_data_manager() -> DataManager: 22 | ds = BreastCancerDataset() 23 | train_indices, valid_indices, test_indices = make_class_stratified_train_val_test_split(ds, k=5) 24 | 25 | return DataManager( 26 | ds, 27 | train_indices=train_indices, 28 | validation_indices=valid_indices, 29 | test_indices=test_indices, 30 | labelled_indices=pick_one_sample_per_class(ds, train_indices), 31 | loader_batch_size="full", 32 | loader_collate_fn=numpy_collate, 33 | ) 34 | 35 | 36 | def numpy_collate( 37 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 38 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 39 | """Collate function for a Pytorch to Numpy DataLoader""" 40 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 41 | -------------------------------------------------------------------------------- /benchmarking/breastcancer/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.ensemble import RandomForestClassifier 10 | from sklearn.metrics import auc 11 | 12 | from pyrelational.oracles import BenchmarkOracle 13 | from pyrelational.pipeline import Pipeline 14 | 15 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 16 | from ..classification_experiment_utils import ( 17 | SKRFC, 18 | experiment_param_space, 19 | get_strategy_from_string, 20 | ) 21 | from .data_manager import get_breastcancer_data_manager 22 | 23 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 24 | 25 | 26 | def trial(config: Dict[str, Any]) -> Dict[str, 
Union[float, NDArray[Union[np.float32, np.float64]]]]: 27 | seed = config["seed"] 28 | set_all_seeds(seed) 29 | strategy = get_strategy_from_string(config["strategy"]) 30 | data_manager = get_breastcancer_data_manager() 31 | model_config = {"n_estimators": 10, "bootstrap": True} 32 | trainer_config: Dict[str, Any] = {} 33 | model_manager = SKRFC(RandomForestClassifier, model_config, trainer_config) 34 | oracle = BenchmarkOracle() 35 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 36 | 37 | # Annotating data step by step until the trainset is fully annotated 38 | pipeline.run(num_annotate=1) 39 | print(pipeline) 40 | 41 | iteration_metrics = [] 42 | for i in range(len(pipeline.performances)): 43 | if "test_metric" in pipeline.performances[i]: 44 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 45 | 46 | iteration_metrics = np.array(iteration_metrics) 47 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 48 | 49 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 50 | 51 | 52 | if __name__ == "__main__": 53 | EXPERIMENT_NAME = "results" 54 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 55 | 56 | trial = tune.with_resources(trial, {"cpu": 2}) 57 | tuner = tune.Tuner( 58 | trial, 59 | tune_config=tune.TuneConfig(num_samples=1), 60 | param_space=experiment_param_space, 61 | run_config=RunConfig( 62 | name=EXPERIMENT_NAME, 63 | storage_path=STORAGE_PATH, 64 | ), 65 | ) 66 | results_grid = tuner.fit() 67 | results_df = process_results_grid(results_grid=results_grid) 68 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 69 | -------------------------------------------------------------------------------- /benchmarking/checkerboard2x2/data_manager.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | 3 | """Benchmarking DataManager for the Checkerboard2x2 dataset 4 | """ 5 | import random 6 | import time 7 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 8 | 9 | import numpy as np 10 | import torch 11 | from numpy.typing import NDArray 12 | 13 | from pyrelational.data_managers import DataManager 14 | from pyrelational.datasets.classification.ksenia_et_al import Checkerboard2x2Dataset 15 | 16 | from ..classification_experiment_utils import ( 17 | make_class_stratified_train_val_test_split, 18 | pick_one_sample_per_class, 19 | ) 20 | 21 | 22 | def get_checkerboard2x2_data_manager() -> DataManager: 23 | # Add a random wait between 1 and 20 seconds to avoid race conditions 24 | # when creating the DataManager 25 | time.sleep(random.randint(1, 20)) 26 | ds = Checkerboard2x2Dataset() 27 | train_indices, valid_indices, test_indices = make_class_stratified_train_val_test_split(ds, k=5) 28 | 29 | return DataManager( 30 | ds, 31 | train_indices=train_indices, 32 | validation_indices=valid_indices, 33 | test_indices=test_indices, 34 | labelled_indices=pick_one_sample_per_class(ds, train_indices), 35 | loader_batch_size="full", 36 | loader_collate_fn=numpy_collate, 37 | ) 38 | 39 | 40 | def numpy_collate( 41 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 42 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 43 | """Collate function for a Pytorch to Numpy DataLoader""" 44 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)]
45 | -------------------------------------------------------------------------------- /benchmarking/checkerboard2x2/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.ensemble import RandomForestClassifier 10 | from sklearn.linear_model import LogisticRegression 11 | from sklearn.metrics import auc 12 | from sklearn.neighbors import KNeighborsClassifier 13 | from sklearn.neural_network import MLPClassifier 14 | from sklearn.svm import SVC 15 | 16 | from pyrelational.oracles import BenchmarkOracle 17 | from pyrelational.pipeline import Pipeline 18 | 19 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 20 | from ..classification_experiment_utils import ( 21 | SKRFC, 22 | LogisticRegressor, 23 | experiment_param_space, 24 | get_strategy_from_string, 25 | ) 26 | from .data_manager import get_checkerboard2x2_data_manager 27 | 28 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 29 | 30 | 31 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[np.float32, np.float64]]]]: 32 | seed = config["seed"] 33 | set_all_seeds(seed) 34 | strategy = get_strategy_from_string(config["strategy"]) 35 | data_manager = get_checkerboard2x2_data_manager() 36 | # Use the MLPClassifier defaults (no hyperparameters overridden) 37 | model_config: Dict[str, Any] = {} 38 | trainer_config: Dict[str, Any] = {} 39 | model_manager = LogisticRegressor(MLPClassifier, model_config, trainer_config) 40 | oracle = BenchmarkOracle() 41 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 42 | 43 | # Annotating data step by step until the trainset is fully annotated 44 | pipeline.run(num_annotate=1, num_iterations=150) 45 | print(pipeline) 46 | 47 | iteration_metrics = [] 48 | for i in range(len(pipeline.performances)): 49 | if "test_metric" in pipeline.performances[i]: 50 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 51 | 52 | iteration_metrics = np.array(iteration_metrics) 53 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 54 | 55 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 56 | 57 | 58 | if __name__ == "__main__": 59 | EXPERIMENT_NAME = "results" 60 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 61 | 62 | trial = tune.with_resources(trial, {"cpu": 4}) 63 | tuner = tune.Tuner( 64 | trial, 65 | tune_config=tune.TuneConfig(num_samples=1), 66 | param_space=experiment_param_space, 67 | run_config=RunConfig( 68 | name=EXPERIMENT_NAME, 69 | storage_path=STORAGE_PATH, 70 | ), 71 | ) 72 | results_grid = tuner.fit() 73 | results_df = process_results_grid(results_grid=results_grid) 74 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 75 | -------------------------------------------------------------------------------- /benchmarking/checkerboard4x4/data_manager.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | 3 | """Benchmarking DataManager for the Checkerboard4x4 dataset 4 | """ 5 | 6 | import random 7 | import time 8 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 9 | 10 | import numpy as np 11 | import torch 12 | from numpy.typing import NDArray
13 | 14 | from pyrelational.data_managers import DataManager 15 | from pyrelational.datasets.classification.ksenia_et_al import Checkerboard4x4Dataset 16 | 17 | from ..classification_experiment_utils import ( 18 | make_class_stratified_train_val_test_split, 19 | pick_one_sample_per_class, 20 | ) 21 | 22 | 23 | def get_checkerboard4x4_data_manager() -> DataManager: 24 | # Add a random wait between 1 and 20 seconds to avoid race conditions 25 | # when creating the DataManager 26 | time.sleep(random.randint(1, 20)) 27 | 28 | ds = Checkerboard4x4Dataset() 29 | train_indices, valid_indices, test_indices = make_class_stratified_train_val_test_split(ds, k=5) 30 | 31 | return DataManager( 32 | ds, 33 | train_indices=train_indices, 34 | validation_indices=valid_indices, 35 | test_indices=test_indices, 36 | # FIXME 37 | labelled_indices=pick_one_sample_per_class(ds, train_indices), 38 | loader_batch_size="full", 39 | loader_collate_fn=numpy_collate, 40 | ) 41 | 42 | 43 | def numpy_collate( 44 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 45 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 46 | """Collate function for a Pytorch to Numpy DataLoader""" 47 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 48 | -------------------------------------------------------------------------------- /benchmarking/concrete/data_manager.py: -------------------------------------------------------------------------------- 1 | """Benchmarking DataManager for the Concrete dataset 2 | """ 3 | 4 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 5 | 6 | import numpy as np 7 | import torch 8 | from numpy.typing import NDArray 9 | 10 | from pyrelational.data_managers import DataManager 11 | from pyrelational.datasets.regression.uci import UCIConcrete 12 | 13 | 14 | def get_concrete_data_manager() -> DataManager: 15 | ds = UCIConcrete() 16 | # UCIConcrete contains 1030 samples, split 800/30/200 below 17 | train_ds, valid_ds, test_ds = torch.utils.data.random_split(ds, [800, 30, 200]) 18 | train_indices = list(train_ds.indices) 19 | valid_indices = list(valid_ds.indices) 20 | test_indices = list(test_ds.indices) 21 | 22 | return DataManager( 23 | ds, 24 | train_indices=train_indices, 25 | validation_indices=valid_indices, 26 | test_indices=test_indices, 27 | labelled_indices=np.random.choice(train_indices, 1, replace=False).tolist(), 28 | loader_batch_size="full", 29 | loader_collate_fn=numpy_collate, 30 | ) 31 | 32 | 33 | def numpy_collate( 34 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 35 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 36 | """Collate function for a Pytorch to Numpy DataLoader""" 37 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 38 | -------------------------------------------------------------------------------- /benchmarking/concrete/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.linear_model import ElasticNet 10 | from sklearn.metrics import auc 11 | 12 | from pyrelational.oracles import BenchmarkOracle 13 | from pyrelational.pipeline import Pipeline 14 | 15 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds
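# Shared helpers: set_all_seeds fixes the RNG seeds for a trial, while process_results_grid and save_results_df collate the Ray Tune outputs into this dataset's results.csv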
16 | from ..regression_experiment_utils import ( 17 | EnsembleScikit, 18 | experiment_param_space, 19 | get_strategy_from_string, 20 | numpy_collate, 21 | ) 22 | from .data_manager import get_concrete_data_manager 23 | 24 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 25 | 26 | 27 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[Any, np.float32, np.float64]]]]: 28 | seed = config["seed"] 29 | set_all_seeds(seed) 30 | strategy = get_strategy_from_string(config["strategy"]) 31 | data_manager = get_concrete_data_manager() 32 | model_config: Dict[str, Any] = {"random_state": seed} 33 | trainer_config: Dict[str, Any] = {} 34 | model_manager: EnsembleScikit = EnsembleScikit(ElasticNet, 5, model_config, trainer_config) 35 | oracle = BenchmarkOracle() 36 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 37 | 38 | # Annotating data step by step until the trainset is fully annotated 39 | pipeline.run(num_annotate=1) 40 | print(pipeline) 41 | 42 | iteration_metrics = [] 43 | for i in range(len(pipeline.performances)): 44 | if "test_metric" in pipeline.performances[i]: 45 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 46 | 47 | iteration_metrics = np.array(iteration_metrics) 48 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 49 | 50 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 51 | 52 | 53 | if __name__ == "__main__": 54 | EXPERIMENT_NAME = "results" 55 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 56 | 57 | trial = tune.with_resources(trial, {"cpu": 2}) 58 | tuner = tune.Tuner( 59 | trial, 60 | tune_config=tune.TuneConfig(num_samples=1), 61 | param_space=experiment_param_space, 62 | run_config=RunConfig( 63 | name=EXPERIMENT_NAME, 64 | storage_path=STORAGE_PATH, 65 | ), 66 | ) 67 | results_grid = tuner.fit() 68 | results_df = process_results_grid(results_grid=results_grid) 69 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 70 | -------------------------------------------------------------------------------- /benchmarking/creditcardfraud/data_manager.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | 3 | """Benchmarking DataManager for the creditcardfraud dataset 4 | """ 5 | 6 | import random 7 | import time 8 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 9 | 10 | import numpy as np 11 | import torch 12 | from numpy.typing import NDArray 13 | 14 | from pyrelational.data_managers import DataManager 15 | from pyrelational.datasets.classification.andrea_et_al import CreditCardDataset 16 | 17 | from ..classification_experiment_utils import ( 18 | make_class_stratified_train_val_test_split, 19 | pick_one_sample_per_class, 20 | ) 21 | 22 | 23 | def get_creditcard_data_manager() -> DataManager: 24 | # Add a random wait between 1 and 50 seconds to avoid race conditions 25 | # when creating the DataManager 26 | time.sleep(random.randint(1, 50)) 27 | 28 | ds = CreditCardDataset() 29 | train_indices, valid_indices, test_indices = make_class_stratified_train_val_test_split(ds, k=5) 30 | 31 | return DataManager( 32 | ds, 33 | train_indices=train_indices, 34 | validation_indices=valid_indices, 35 | test_indices=test_indices, 36 | labelled_indices=pick_one_sample_per_class(ds, train_indices), 37 | loader_batch_size="full", 38 | loader_collate_fn=numpy_collate, 39 | ) 40 |
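# loader_batch_size="full" makes each DataLoader yield the whole split as one batch, which suits the scikit-learn style models used in these benchmarks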
41 | 42 | def numpy_collate( 43 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 44 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 45 | """Collate function for a Pytorch to Numpy DataLoader""" 46 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 47 | -------------------------------------------------------------------------------- /benchmarking/creditcardfraud/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.ensemble import RandomForestClassifier 10 | from sklearn.metrics import auc 11 | 12 | from pyrelational.oracles import BenchmarkOracle 13 | from pyrelational.pipeline import Pipeline 14 | 15 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 16 | from ..classification_experiment_utils import ( 17 | SKRFC, 18 | experiment_param_space, 19 | get_strategy_from_string, 20 | ) 21 | from .data_manager import get_creditcard_data_manager 22 | 23 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 24 | 25 | 26 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[np.float32, np.float64]]]]: 27 | seed = config["seed"] 28 | set_all_seeds(seed) 29 | strategy = get_strategy_from_string(config["strategy"]) 30 | data_manager = get_creditcard_data_manager() 31 | model_config = {"n_estimators": 10, "bootstrap": True} 32 | trainer_config: Dict[str, Any] = {} 33 | model_manager = SKRFC(RandomForestClassifier, model_config, trainer_config) 34 | oracle = BenchmarkOracle() 35 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 36 | 37 | # Annotating data step by step until the trainset is fully annotated 38 | pipeline.run(num_annotate=1, num_iterations=250) 39 | print(pipeline) 40 | 41 | iteration_metrics = [] 42 | for i in range(len(pipeline.performances)): 43 | if "test_metric" in pipeline.performances[i]: 44 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 45 | 46 | iteration_metrics = np.array(iteration_metrics) 47 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 48 | 49 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 50 | 51 | 52 | if __name__ == "__main__": 53 | EXPERIMENT_NAME = "results" 54 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 55 | 56 | trial = tune.with_resources(trial, {"cpu": 2}) 57 | tuner = tune.Tuner( 58 | trial, 59 | tune_config=tune.TuneConfig(num_samples=1), 60 | param_space=experiment_param_space, 61 | run_config=RunConfig( 62 | name=EXPERIMENT_NAME, 63 | storage_path=STORAGE_PATH, 64 | ), 65 | ) 66 | results_grid = tuner.fit() 67 | results_df = process_results_grid(results_grid=results_grid) 68 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 69 | -------------------------------------------------------------------------------- /benchmarking/diabetes/data_manager.py: -------------------------------------------------------------------------------- 1 | """Benchmarking DataManager for the diabetes dataset 2 | """ 3 | 4 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 5 | 6 | import numpy as np 7 | import torch 8 | from 
numpy.typing import NDArray 9 | 10 | from pyrelational.data_managers import DataManager 11 | from pyrelational.datasets.regression.scikit_learn import DiabetesDataset 12 | 13 | 14 | def get_diabetes_data_manager() -> DataManager: 15 | ds = DiabetesDataset() 16 | train_ds, valid_ds, test_ds = torch.utils.data.random_split(ds, [300, 42, 100]) 17 | train_indices = list(train_ds.indices) 18 | valid_indices = list(valid_ds.indices) 19 | test_indices = list(test_ds.indices) 20 | 21 | return DataManager( 22 | ds, 23 | train_indices=train_indices, 24 | validation_indices=valid_indices, 25 | test_indices=test_indices, 26 | labelled_indices=np.random.choice(train_indices, 30, replace=False).tolist(), 27 | loader_batch_size="full", 28 | loader_collate_fn=numpy_collate, 29 | ) 30 | 31 | 32 | def numpy_collate( 33 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 34 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 35 | """Collate function for a Pytorch to Numpy DataLoader""" 36 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 37 | -------------------------------------------------------------------------------- /benchmarking/diabetes/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.linear_model import ElasticNet 10 | from sklearn.metrics import auc 11 | from sklearn.neural_network import MLPRegressor 12 | 13 | from pyrelational.oracles import BenchmarkOracle 14 | from pyrelational.pipeline import Pipeline 15 | 16 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 17 | from ..regression_experiment_utils import ( 18 | EnsembleScikit, 19 | experiment_param_space, 20 | get_strategy_from_string, 21 | numpy_collate, 22 | ) 23 | from .data_manager import get_diabetes_data_manager 24 | 25 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 26 | 27 | 28 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[Any, np.float32, np.float64]]]]: 29 | seed = config["seed"] 30 | set_all_seeds(seed) 31 | strategy = get_strategy_from_string(config["strategy"]) 32 | data_manager = get_diabetes_data_manager() 33 | model_config: Dict[str, Any] = {"random_state": seed} 34 | trainer_config: Dict[str, Any] = {} 35 | model_manager: EnsembleScikit = EnsembleScikit(ElasticNet, 5, model_config, trainer_config) 36 | oracle = BenchmarkOracle() 37 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 38 | 39 | # Annotating data step by step until the trainset is fully annotated 40 | pipeline.run(num_annotate=1) 41 | print(pipeline) 42 | 43 | iteration_metrics = [] 44 | for i in range(len(pipeline.performances)): 45 | if "test_metric" in pipeline.performances[i]: 46 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 47 | 48 | iteration_metrics = np.array(iteration_metrics) 49 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 50 | 51 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 52 | 53 | 54 | if __name__ == "__main__": 55 | EXPERIMENT_NAME = "results" 56 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 57 | 58 | trial = tune.with_resources(trial, {"cpu": 
2}) 59 | tuner = tune.Tuner( 60 | trial, 61 | tune_config=tune.TuneConfig(num_samples=1), 62 | param_space=experiment_param_space, 63 | run_config=RunConfig( 64 | name=EXPERIMENT_NAME, 65 | storage_path=STORAGE_PATH, 66 | ), 67 | ) 68 | results_grid = tuner.fit() 69 | results_df = process_results_grid(results_grid=results_grid) 70 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 71 | -------------------------------------------------------------------------------- /benchmarking/digitdataset/data_manager.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | 3 | """Benchmarking DataManager for the Digit dataset in Sklearn 4 | """ 5 | import random 6 | import time 7 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 8 | 9 | import numpy as np 10 | import torch 11 | from numpy.typing import NDArray 12 | 13 | from pyrelational.data_managers import DataManager 14 | from pyrelational.datasets.classification.scikit_learn import DigitDataset 15 | 16 | from ..classification_experiment_utils import ( 17 | make_class_stratified_train_val_test_split, 18 | pick_one_sample_per_class, 19 | ) 20 | 21 | 22 | def get_digitdataset_data_manager() -> DataManager: 23 | # Add a random wait between 1 and 10 seconds to avoid race conditions 24 | # when creating the DataManager 25 | time.sleep(random.randint(1, 10)) 26 | ds = DigitDataset() 27 | train_indices, valid_indices, test_indices = make_class_stratified_train_val_test_split(ds, k=5) 28 | return DataManager( 29 | ds, 30 | train_indices=train_indices, 31 | validation_indices=valid_indices, 32 | test_indices=test_indices, 33 | labelled_indices=pick_one_sample_per_class(ds, train_indices), 34 | loader_batch_size="full", 35 | loader_collate_fn=numpy_collate, 36 | ) 37 | 38 | 39 | def numpy_collate( 40 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 41 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 42 | """Collate function for a Pytorch to Numpy DataLoader""" 43 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 44 | -------------------------------------------------------------------------------- /benchmarking/digitdataset/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.ensemble import RandomForestClassifier 10 | from sklearn.metrics import auc 11 | 12 | from pyrelational.oracles import BenchmarkOracle 13 | from pyrelational.pipeline import Pipeline 14 | 15 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 16 | from ..classification_experiment_utils import ( 17 | SKRFC, 18 | experiment_param_space, 19 | get_strategy_from_string, 20 | ) 21 | from .data_manager import get_digitdataset_data_manager 22 | 23 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 24 | 25 | 26 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[np.float32, np.float64]]]]: 27 | seed = config["seed"] 28 | set_all_seeds(seed) 29 | strategy = get_strategy_from_string(config["strategy"]) 30 | data_manager = get_digitdataset_data_manager() 31 | model_config = {"n_estimators": 10, "bootstrap": True} 32 | trainer_config: Dict[str, Any] = {} 33 | 
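# SKRFC wraps the scikit-learn estimator class and its config so the pipeline can re-instantiate and re-fit the model at each active learning iteration (see classification_experiment_utils.py)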
model_manager = SKRFC(RandomForestClassifier, model_config, trainer_config) 34 | oracle = BenchmarkOracle() 35 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 36 | 37 | # Annotating data step by step until the trainset is fully annotated 38 | pipeline.run(num_annotate=1) 39 | print(pipeline) 40 | 41 | iteration_metrics = [] 42 | for i in range(len(pipeline.performances)): 43 | if "test_metric" in pipeline.performances[i]: 44 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 45 | 46 | iteration_metrics = np.array(iteration_metrics) 47 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 48 | 49 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 50 | 51 | 52 | if __name__ == "__main__": 53 | EXPERIMENT_NAME = "results" 54 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 55 | 56 | trial = tune.with_resources(trial, {"cpu": 4}) 57 | tuner = tune.Tuner( 58 | trial, 59 | tune_config=tune.TuneConfig(num_samples=1), 60 | param_space=experiment_param_space, 61 | run_config=RunConfig( 62 | name=EXPERIMENT_NAME, 63 | storage_path=STORAGE_PATH, 64 | ), 65 | ) 66 | results_grid = tuner.fit() 67 | results_df = process_results_grid(results_grid=results_grid) 68 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 69 | -------------------------------------------------------------------------------- /benchmarking/energy/data_manager.py: -------------------------------------------------------------------------------- 1 | """Benchmarking DataManager for the energy dataset 2 | """ 3 | 4 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 5 | 6 | import numpy as np 7 | import torch 8 | from numpy.typing import NDArray 9 | 10 | from pyrelational.data_managers import DataManager 11 | from pyrelational.datasets.regression.uci import UCIEnergy 12 | 13 | 14 | def get_energy_data_manager() -> DataManager: 15 | ds = UCIEnergy() 16 | train_ds, valid_ds, test_ds = torch.utils.data.random_split(ds, [400, 100, 268]) 17 | train_indices = list(train_ds.indices) 18 | valid_indices = list(valid_ds.indices) 19 | test_indices = list(test_ds.indices) 20 | 21 | return DataManager( 22 | ds, 23 | train_indices=train_indices, 24 | validation_indices=valid_indices, 25 | test_indices=test_indices, 26 | labelled_indices=np.random.choice(train_indices, 1, replace=False).tolist(), 27 | loader_batch_size="full", 28 | loader_collate_fn=numpy_collate, 29 | ) 30 | 31 | 32 | def numpy_collate( 33 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 34 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 35 | """Collate function for a Pytorch to Numpy DataLoader""" 36 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 37 | -------------------------------------------------------------------------------- /benchmarking/energy/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.metrics import auc 10 | 11 | from pyrelational.oracles import BenchmarkOracle 12 | from pyrelational.pipeline import Pipeline 13 | 14 | from ..benchmarking_utils import process_results_grid, save_results_df, 
set_all_seeds 15 | from ..regression_experiment_utils import ( 16 | GPR, 17 | experiment_param_space, 18 | get_strategy_from_string, 19 | numpy_collate, 20 | ) 21 | from .data_manager import get_energy_data_manager 22 | 23 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 24 | 25 | 26 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[Any, np.float32, np.float64]]]]: 27 | seed = config["seed"] 28 | set_all_seeds(seed) 29 | strategy = get_strategy_from_string(config["strategy"]) 30 | data_manager = get_energy_data_manager() 31 | model_config: Dict[str, Any] = {} 32 | trainer_config: Dict[str, Any] = {} 33 | model_manager: GPR = GPR(model_config, trainer_config) 34 | oracle = BenchmarkOracle() 35 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 36 | 37 | # Annotating data step by step until the trainset is fully annotated 38 | pipeline.run(num_annotate=1) 39 | print(pipeline) 40 | 41 | iteration_metrics = [] 42 | for i in range(len(pipeline.performances)): 43 | if "test_metric" in pipeline.performances[i]: 44 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 45 | 46 | iteration_metrics = np.array(iteration_metrics) 47 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 48 | 49 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 50 | 51 | 52 | if __name__ == "__main__": 53 | EXPERIMENT_NAME = "results" 54 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 55 | 56 | trial = tune.with_resources(trial, {"cpu": 2}) 57 | tuner = tune.Tuner( 58 | trial, 59 | tune_config=tune.TuneConfig(num_samples=1), 60 | param_space=experiment_param_space, 61 | run_config=RunConfig( 62 | name=EXPERIMENT_NAME, 63 | storage_path=STORAGE_PATH, 64 | ), 65 | ) 66 | results_grid = tuner.fit() 67 | results_df = process_results_grid(results_grid=results_grid) 68 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 69 | -------------------------------------------------------------------------------- /benchmarking/fashionmnist/README.md: -------------------------------------------------------------------------------- 1 | The benchmark script may need to be run twice: on the first run, Ray's parallel trials may race to download the dataset at the same time, which can raise errors; once the dataset is cached locally, subsequent runs complete without issue.
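One way to avoid the race altogether is to warm the dataset cache in the driver process before launching the Ray trials. A minimal sketch (it only assumes, as the data manager in this folder does, that instantiating the dataset class triggers the download):

```python
# Download and cache FashionMNIST once in the driver process; Ray trials
# started afterwards will then find the files already on disk.
from pyrelational.datasets.classification.fashion_mnist import FashionMNIST

FashionMNIST()  # first instantiation downloads, later ones reuse the cache
```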
2 | -------------------------------------------------------------------------------- /benchmarking/fashionmnist/data_manager.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | 3 | """Benchmarking DataManager for the Zalando FashionMNIST dataset 4 | """ 5 | import random 6 | import time 7 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 8 | 9 | import numpy as np 10 | import torch 11 | from numpy.typing import NDArray 12 | 13 | from pyrelational.data_managers import DataManager 14 | from pyrelational.datasets.classification.fashion_mnist import FashionMNIST 15 | 16 | from ..classification_experiment_utils import ( 17 | make_class_stratified_train_val_test_split, 18 | pick_one_sample_per_class, 19 | ) 20 | 21 | 22 | def get_fashionMnist_data_manager() -> DataManager: 23 | # Add a random wait between 1 and 10 seconds to avoid race conditions 24 | # when creating the DataManager 25 | time.sleep(random.randint(1, 10)) 26 | ds = FashionMNIST() 27 | train_indices, valid_indices, test_indices = make_class_stratified_train_val_test_split(ds, k=5) 28 | return DataManager( 29 | ds, 30 | train_indices=train_indices, 31 | validation_indices=valid_indices, 32 | test_indices=test_indices, 33 | labelled_indices=pick_one_sample_per_class(ds, train_indices), 34 | loader_batch_size="full", 35 | loader_collate_fn=numpy_collate, 36 | ) 37 | 38 | 39 | def numpy_collate( 40 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 41 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 42 | """Collate function for a Pytorch to Numpy DataLoader""" 43 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 44 | -------------------------------------------------------------------------------- /benchmarking/fashionmnist/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.ensemble import RandomForestClassifier 10 | from sklearn.metrics import auc 11 | 12 | from pyrelational.oracles import BenchmarkOracle 13 | from pyrelational.pipeline import Pipeline 14 | 15 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 16 | from ..classification_experiment_utils import ( 17 | SKRFC, 18 | experiment_param_space, 19 | get_strategy_from_string, 20 | ) 21 | from .data_manager import get_fashionMnist_data_manager 22 | 23 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 24 | 25 | 26 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[np.float32, np.float64]]]]: 27 | seed = config["seed"] 28 | set_all_seeds(seed) 29 | strategy = get_strategy_from_string(config["strategy"]) 30 | data_manager = get_fashionMnist_data_manager() 31 | model_config = {"n_estimators": 10, "bootstrap": True} 32 | trainer_config: Dict[str, Any] = {} 33 | model_manager = SKRFC(RandomForestClassifier, model_config, trainer_config) 34 | oracle = BenchmarkOracle() 35 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 36 | 37 | # Annotating data step by step until the trainset is fully annotated 38 | pipeline.run(num_annotate=10, num_iterations=250) 39 | print(pipeline) 40 | 41 | iteration_metrics = [] 42 | for i in 
range(len(pipeline.performances)): 43 | if "test_metric" in pipeline.performances[i]: 44 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 45 | 46 | iteration_metrics = np.array(iteration_metrics) 47 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 48 | 49 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 50 | 51 | 52 | if __name__ == "__main__": 53 | EXPERIMENT_NAME = "results" 54 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 55 | 56 | trial = tune.with_resources(trial, {"cpu": 4}) 57 | tuner = tune.Tuner( 58 | trial, 59 | tune_config=tune.TuneConfig(num_samples=1), 60 | param_space=experiment_param_space, 61 | run_config=RunConfig( 62 | name=EXPERIMENT_NAME, 63 | storage_path=STORAGE_PATH, 64 | ), 65 | ) 66 | results_grid = tuner.fit() 67 | results_df = process_results_grid(results_grid=results_grid) 68 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 69 | -------------------------------------------------------------------------------- /benchmarking/gaussianclouds/data_manager.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | 3 | """Benchmarking DataManager for the GaussianClouds dataset 4 | """ 5 | import random 6 | import time 7 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 8 | 9 | import numpy as np 10 | import torch 11 | from numpy.typing import NDArray 12 | 13 | from pyrelational.data_managers import DataManager 14 | from pyrelational.datasets.classification.ksenia_et_al import GaussianCloudsDataset 15 | 16 | from ..classification_experiment_utils import ( 17 | make_class_stratified_train_val_test_split, 18 | pick_one_sample_per_class, 19 | ) 20 | 21 | 22 | def get_gaussianclouds_data_manager() -> DataManager: 23 | # Add a random wait between 1 and 10 seconds to avoid race conditions 24 | # when creating the DataManager 25 | time.sleep(random.randint(1, 10)) 26 | ds = GaussianCloudsDataset() 27 | train_indices, valid_indices, test_indices = make_class_stratified_train_val_test_split(ds, k=5) 28 | 29 | return DataManager( 30 | ds, 31 | train_indices=train_indices, 32 | validation_indices=valid_indices, 33 | test_indices=test_indices, 34 | # FIXME 35 | labelled_indices=pick_one_sample_per_class(ds, train_indices), 36 | loader_batch_size="full", 37 | loader_collate_fn=numpy_collate, 38 | ) 39 | 40 | 41 | def numpy_collate( 42 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 43 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 44 | """Collate function for a Pytorch to Numpy DataLoader""" 45 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 46 | -------------------------------------------------------------------------------- /benchmarking/gaussianclouds/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.ensemble import RandomForestClassifier 10 | from sklearn.metrics import auc 11 | 12 | from pyrelational.oracles import BenchmarkOracle 13 | from pyrelational.pipeline import Pipeline 14 | 15 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 16 | from 
..classification_experiment_utils import ( 17 | SKRFC, 18 | experiment_param_space, 19 | get_strategy_from_string, 20 | ) 21 | from .data_manager import get_gaussianclouds_data_manager 22 | 23 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 24 | 25 | 26 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[np.float32, np.float64]]]]: 27 | seed = config["seed"] 28 | set_all_seeds(seed) 29 | strategy = get_strategy_from_string(config["strategy"]) 30 | data_manager = get_gaussianclouds_data_manager() 31 | model_config = {"n_estimators": 10, "bootstrap": True} 32 | trainer_config: Dict[str, Any] = {} 33 | model_manager = SKRFC(RandomForestClassifier, model_config, trainer_config) 34 | oracle = BenchmarkOracle() 35 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 36 | 37 | # Annotating data step by step until the trainset is fully annotated 38 | pipeline.run(num_annotate=1, num_iterations=250) 39 | print(pipeline) 40 | 41 | iteration_metrics = [] 42 | for i in range(len(pipeline.performances)): 43 | if "test_metric" in pipeline.performances[i]: 44 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 45 | 46 | iteration_metrics = np.array(iteration_metrics) 47 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 48 | 49 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 50 | 51 | 52 | if __name__ == "__main__": 53 | EXPERIMENT_NAME = "results" 54 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 55 | 56 | trial = tune.with_resources(trial, {"cpu": 4}) 57 | tuner = tune.Tuner( 58 | trial, 59 | tune_config=tune.TuneConfig(num_samples=1), 60 | param_space=experiment_param_space, 61 | run_config=RunConfig( 62 | name=EXPERIMENT_NAME, 63 | storage_path=STORAGE_PATH, 64 | ), 65 | ) 66 | results_grid = tuner.fit() 67 | results_df = process_results_grid(results_grid=results_grid) 68 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 69 | -------------------------------------------------------------------------------- /benchmarking/glass/data_manager.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | 3 | """Benchmarking DataManager for the glass dataset 4 | """ 5 | import random 6 | import time 7 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 8 | 9 | import numpy as np 10 | import torch 11 | from numpy.typing import NDArray 12 | 13 | from pyrelational.data_managers import DataManager 14 | from pyrelational.datasets.classification.uci import UCIGlass 15 | 16 | from ..classification_experiment_utils import ( 17 | make_class_stratified_train_val_test_split, 18 | pick_one_sample_per_class, 19 | ) 20 | 21 | 22 | def get_glass_data_manager() -> DataManager: 23 | # Add a random wait between 1 and 10 seconds to avoid race conditions 24 | # when creating the DataManager 25 | time.sleep(random.randint(1, 10)) 26 | ds = UCIGlass() 27 | train_indices, valid_indices, test_indices = make_class_stratified_train_val_test_split(ds, k=5) 28 | return DataManager( 29 | ds, 30 | train_indices=train_indices, 31 | validation_indices=valid_indices, 32 | test_indices=test_indices, 33 | labelled_indices=pick_one_sample_per_class(ds, train_indices), 34 | loader_batch_size="full", 35 | loader_collate_fn=numpy_collate, 36 | ) 37 | 38 | 39 | def numpy_collate( 40 | batch: List[Union[torch.Tensor, NDArray[Union[Any, 
np.float32, np.float64]]]] 41 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 42 | """Collate function for a Pytorch to Numpy DataLoader""" 43 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 44 | -------------------------------------------------------------------------------- /benchmarking/glass/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.ensemble import RandomForestClassifier 10 | from sklearn.metrics import auc 11 | 12 | from pyrelational.oracles import BenchmarkOracle 13 | from pyrelational.pipeline import Pipeline 14 | 15 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 16 | from ..classification_experiment_utils import ( 17 | SKRFC, 18 | experiment_param_space, 19 | get_strategy_from_string, 20 | ) 21 | from .data_manager import get_glass_data_manager 22 | 23 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 24 | 25 | 26 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[np.float32, np.float64]]]]: 27 | seed = config["seed"] 28 | set_all_seeds(seed) 29 | strategy = get_strategy_from_string(config["strategy"]) 30 | data_manager = get_glass_data_manager() 31 | model_config = {"n_estimators": 10, "bootstrap": True} 32 | trainer_config: Dict[str, Any] = {} 33 | model_manager = SKRFC(RandomForestClassifier, model_config, trainer_config) 34 | oracle = BenchmarkOracle() 35 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 36 | 37 | # Annotating data step by step until the trainset is fully annotated 38 | pipeline.run(num_annotate=1) 39 | print(pipeline) 40 | 41 | iteration_metrics = [] 42 | for i in range(len(pipeline.performances)): 43 | if "test_metric" in pipeline.performances[i]: 44 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 45 | 46 | iteration_metrics = np.array(iteration_metrics) 47 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 48 | 49 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 50 | 51 | 52 | if __name__ == "__main__": 53 | EXPERIMENT_NAME = "results" 54 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 55 | 56 | trial = tune.with_resources(trial, {"cpu": 4}) 57 | tuner = tune.Tuner( 58 | trial, 59 | tune_config=tune.TuneConfig(num_samples=1), 60 | param_space=experiment_param_space, 61 | run_config=RunConfig( 62 | name=EXPERIMENT_NAME, 63 | storage_path=STORAGE_PATH, 64 | ), 65 | ) 66 | results_grid = tuner.fit() 67 | results_df = process_results_grid(results_grid=results_grid) 68 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 69 | -------------------------------------------------------------------------------- /benchmarking/mnist/data_manager.py: -------------------------------------------------------------------------------- 1 | """Data manager for MNIST dataset. 2 | 3 | We follow the setup in the BatchBald paper: https://arxiv.org/abs/1906.08158. 
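Example (an illustrative sketch of how the helper defined below is typically called; ``percentage_val`` and ``labelled_size`` are the parameters documented in its signature, nothing here is invented beyond the call itself):

    >>> dm = get_mnist_datamanager(percentage_val=0.1, labelled_size=20)  # doctest: +SKIP

With 20 class-stratified initial labels, the rest of the training split forms the unlabelled pool, and a stratified 10% of the training indices is held out for validation.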
4 | """ 5 | 6 | from sklearn.model_selection import train_test_split 7 | 8 | from pyrelational.data_managers import DataManager 9 | from pyrelational.datasets.classification import MNIST 10 | 11 | 12 | def get_mnist_datamanager( 13 | percentage_val: float = 0.1, 14 | labelled_size: int = 20, 15 | random_state: int = 42, 16 | data_dir: str = "/tmp/", 17 | ) -> DataManager: 18 | """Instantiate data manager for MNIST dataset. 19 | 20 | :param percentage_val: size in percentage of the validation split, defaults to 0.1 21 | :param labelled_size: number of initial labelled sample, defaults to 20 22 | :param random_state: random seed, defaults to 42 23 | :param data_dir: directory where to download the data, defaults to "/tmp/" 24 | :return: MNIST pyrelational data manager. 25 | """ 26 | dataset = MNIST(data_dir=data_dir) 27 | train_ixs, test_ixs = dataset.data_splits[0] 28 | 29 | unlabelled_ixs, val_ixs = train_test_split( 30 | train_ixs, 31 | test_size=percentage_val, 32 | random_state=random_state, 33 | stratify=dataset.y[train_ixs], 34 | ) 35 | 36 | labelled_ixs, unlabelled_ixs = train_test_split( 37 | unlabelled_ixs, 38 | train_size=labelled_size, 39 | random_state=random_state, 40 | stratify=dataset.y[unlabelled_ixs], 41 | ) 42 | 43 | data_manager = DataManager( 44 | dataset=dataset, 45 | labelled_indices=labelled_ixs.tolist(), 46 | unlabelled_indices=unlabelled_ixs.tolist(), 47 | validation_indices=val_ixs.tolist(), 48 | test_indices=test_ixs.tolist(), 49 | ) 50 | return data_manager 51 | -------------------------------------------------------------------------------- /benchmarking/parkinsons/data_manager.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | 3 | """Benchmarking DataManager for the Parkinsons dataset 4 | """ 5 | import random 6 | import time 7 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 8 | 9 | import numpy as np 10 | import torch 11 | from numpy.typing import NDArray 12 | 13 | from pyrelational.data_managers import DataManager 14 | from pyrelational.datasets.classification.uci import UCIParkinsons 15 | 16 | from ..classification_experiment_utils import ( 17 | make_class_stratified_train_val_test_split, 18 | pick_one_sample_per_class, 19 | ) 20 | 21 | 22 | def get_parkinsons_data_manager() -> DataManager: 23 | # Add a random wait between 1 and 10 seconds to avoid race conditions 24 | # when creating the DataManager 25 | time.sleep(random.randint(1, 10)) 26 | ds = UCIParkinsons() 27 | train_indices, valid_indices, test_indices = make_class_stratified_train_val_test_split(ds, k=5) 28 | return DataManager( 29 | ds, 30 | train_indices=train_indices, 31 | validation_indices=valid_indices, 32 | test_indices=test_indices, 33 | labelled_indices=pick_one_sample_per_class(ds, train_indices), 34 | loader_batch_size="full", 35 | loader_collate_fn=numpy_collate, 36 | ) 37 | 38 | 39 | def numpy_collate( 40 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 41 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 42 | """Collate function for a Pytorch to Numpy DataLoader""" 43 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 44 | -------------------------------------------------------------------------------- /benchmarking/parkinsons/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy 
as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.ensemble import RandomForestClassifier 10 | from sklearn.metrics import auc 11 | 12 | from pyrelational.oracles import BenchmarkOracle 13 | from pyrelational.pipeline import Pipeline 14 | 15 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 16 | from ..classification_experiment_utils import ( 17 | SKRFC, 18 | experiment_param_space, 19 | get_strategy_from_string, 20 | ) 21 | from .data_manager import get_parkinsons_data_manager 22 | 23 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 24 | 25 | 26 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[np.float32, np.float64]]]]: 27 | seed = config["seed"] 28 | set_all_seeds(seed) 29 | strategy = get_strategy_from_string(config["strategy"]) 30 | data_manager = get_parkinsons_data_manager() 31 | model_config = {"n_estimators": 10, "bootstrap": True} 32 | trainer_config: Dict[str, Any] = {} 33 | model_manager = SKRFC(RandomForestClassifier, model_config, trainer_config) 34 | oracle = BenchmarkOracle() 35 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 36 | 37 | # Annotating data step by step until the trainset is fully annotated 38 | pipeline.run(num_annotate=1) 39 | print(pipeline) 40 | 41 | iteration_metrics = [] 42 | for i in range(len(pipeline.performances)): 43 | if "test_metric" in pipeline.performances[i]: 44 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 45 | 46 | iteration_metrics = np.array(iteration_metrics) 47 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 48 | 49 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 50 | 51 | 52 | if __name__ == "__main__": 53 | EXPERIMENT_NAME = "results" 54 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 55 | 56 | trial = tune.with_resources(trial, {"cpu": 4}) 57 | tuner = tune.Tuner( 58 | trial, 59 | tune_config=tune.TuneConfig(num_samples=1), 60 | param_space=experiment_param_space, 61 | run_config=RunConfig( 62 | name=EXPERIMENT_NAME, 63 | storage_path=STORAGE_PATH, 64 | ), 65 | ) 66 | results_grid = tuner.fit() 67 | results_df = process_results_grid(results_grid=results_grid) 68 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 69 | -------------------------------------------------------------------------------- /benchmarking/power/data_manager.py: -------------------------------------------------------------------------------- 1 | """Benchmarking DataManager for the Power dataset 2 | """ 3 | 4 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 5 | 6 | import numpy as np 7 | import torch 8 | from numpy.typing import NDArray 9 | 10 | from pyrelational.data_managers import DataManager 11 | from pyrelational.datasets.regression.uci import UCIPower 12 | 13 | 14 | def get_power_data_manager() -> DataManager: 15 | ds = UCIPower() 16 | print(len(ds)) 17 | train_ds, valid_ds, test_ds = torch.utils.data.random_split(ds, [7900, 100, 1568]) 18 | train_indices = list(train_ds.indices) 19 | valid_indices = list(valid_ds.indices) 20 | test_indices = list(test_ds.indices) 21 | 22 | return DataManager( 23 | ds, 24 | train_indices=train_indices, 25 | validation_indices=valid_indices, 26 | test_indices=test_indices, 27 | labelled_indices=np.random.choice(train_indices, 1, 
replace=False).tolist(), 28 | loader_batch_size="full", 29 | loader_collate_fn=numpy_collate, 30 | ) 31 | 32 | 33 | def numpy_collate( 34 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 35 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 36 | """Collate function for a Pytorch to Numpy DataLoader""" 37 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 38 | -------------------------------------------------------------------------------- /benchmarking/power/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | import random 4 | import time 5 | from typing import Any, Dict, Union 6 | 7 | import numpy as np 8 | from numpy.typing import NDArray 9 | from ray import tune 10 | from ray.train import RunConfig 11 | from sklearn.linear_model import ElasticNet 12 | from sklearn.metrics import auc 13 | 14 | from pyrelational.oracles import BenchmarkOracle 15 | from pyrelational.pipeline import Pipeline 16 | 17 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 18 | from ..regression_experiment_utils import ( 19 | EnsembleScikit, 20 | experiment_param_space, 21 | get_strategy_from_string, 22 | numpy_collate, 23 | ) 24 | from .data_manager import get_power_data_manager 25 | 26 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 27 | 28 | 29 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[Any, np.float32, np.float64]]]]: 30 | time.sleep(random.uniform(40, 120)) 31 | seed = config["seed"] 32 | set_all_seeds(seed) 33 | strategy = get_strategy_from_string(config["strategy"]) 34 | data_manager = get_power_data_manager() 35 | model_config: Dict[str, Any] = {"random_state": seed} 36 | trainer_config: Dict[str, Any] = {} 37 | model_manager: EnsembleScikit = EnsembleScikit(ElasticNet, 5, model_config, trainer_config) 38 | oracle = BenchmarkOracle() 39 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 40 | 41 | # Annotating data step by step until the trainset is fully annotated 42 | pipeline.run(num_annotate=1, num_iterations=500) 43 | print(pipeline) 44 | 45 | iteration_metrics = [] 46 | for i in range(len(pipeline.performances)): 47 | if "test_metric" in pipeline.performances[i]: 48 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 49 | 50 | iteration_metrics = np.array(iteration_metrics) 51 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 52 | 53 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 54 | 55 | 56 | if __name__ == "__main__": 57 | EXPERIMENT_NAME = "results" 58 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 59 | 60 | trial = tune.with_resources(trial, {"cpu": 2}) 61 | tuner = tune.Tuner( 62 | trial, 63 | tune_config=tune.TuneConfig(num_samples=1), 64 | param_space=experiment_param_space, 65 | run_config=RunConfig( 66 | name=EXPERIMENT_NAME, 67 | storage_path=STORAGE_PATH, 68 | ), 69 | ) 70 | results_grid = tuner.fit() 71 | results_df = process_results_grid(results_grid=results_grid) 72 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 73 | -------------------------------------------------------------------------------- /benchmarking/seeds/data_manager.py: -------------------------------------------------------------------------------- 1 | # 
type: ignore 2 | 3 | """Benchmarking DataManager for the seeds dataset 4 | """ 5 | 6 | import random 7 | import time 8 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 9 | 10 | import numpy as np 11 | import torch 12 | from numpy.typing import NDArray 13 | 14 | from pyrelational.data_managers import DataManager 15 | from pyrelational.datasets.classification.uci import UCISeeds 16 | 17 | from ..classification_experiment_utils import ( 18 | make_class_stratified_train_val_test_split, 19 | pick_one_sample_per_class, 20 | ) 21 | 22 | 23 | def get_seeds_data_manager() -> DataManager: 24 | # Add a random wait between 1 and 10 seconds to avoid race conditions 25 | # when creating the DataManager 26 | time.sleep(random.randint(1, 10)) 27 | ds = UCISeeds() 28 | train_indices, valid_indices, test_indices = make_class_stratified_train_val_test_split(ds, k=5) 29 | return DataManager( 30 | ds, 31 | train_indices=train_indices, 32 | validation_indices=valid_indices, 33 | test_indices=test_indices, 34 | labelled_indices=pick_one_sample_per_class(ds, train_indices), 35 | loader_batch_size="full", 36 | loader_collate_fn=numpy_collate, 37 | ) 38 | 39 | 40 | def numpy_collate( 41 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 42 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 43 | """Collate function for a Pytorch to Numpy DataLoader""" 44 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 45 | -------------------------------------------------------------------------------- /benchmarking/seeds/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.ensemble import RandomForestClassifier 10 | from sklearn.metrics import auc 11 | 12 | from pyrelational.oracles import BenchmarkOracle 13 | from pyrelational.pipeline import Pipeline 14 | 15 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 16 | from ..classification_experiment_utils import ( 17 | SKRFC, 18 | experiment_param_space, 19 | get_strategy_from_string, 20 | ) 21 | from .data_manager import get_seeds_data_manager 22 | 23 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 24 | 25 | 26 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[np.float32, np.float64]]]]: 27 | seed = config["seed"] 28 | set_all_seeds(seed) 29 | strategy = get_strategy_from_string(config["strategy"]) 30 | data_manager = get_seeds_data_manager() 31 | model_config = {"n_estimators": 10, "bootstrap": True} 32 | trainer_config: Dict[str, Any] = {} 33 | model_manager = SKRFC(RandomForestClassifier, model_config, trainer_config) 34 | oracle = BenchmarkOracle() 35 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 36 | 37 | # Annotating data step by step until the trainset is fully annotated 38 | pipeline.run(num_annotate=1) 39 | print(pipeline) 40 | 41 | iteration_metrics = [] 42 | for i in range(len(pipeline.performances)): 43 | if "test_metric" in pipeline.performances[i]: 44 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 45 | 46 | iteration_metrics = np.array(iteration_metrics) 47 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 48 | 
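    # The scalar "score" returned below is the area under the learning curve of
    # test metrics across active-learning iterations, computed by sklearn's
    # trapezoidal-rule auc(). For an accuracy-style metric, a larger area means
    # the strategy reached strong performance in fewer annotation steps; e.g.
    # auc([0, 1, 2], [0.5, 0.7, 0.8]) == (0.5 + 0.7) / 2 + (0.7 + 0.8) / 2 == 1.35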
49 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 50 | 51 | 52 | if __name__ == "__main__": 53 | EXPERIMENT_NAME = "results" 54 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 55 | 56 | trial = tune.with_resources(trial, {"cpu": 2}) 57 | tuner = tune.Tuner( 58 | trial, 59 | tune_config=tune.TuneConfig(num_samples=1), 60 | param_space=experiment_param_space, 61 | run_config=RunConfig( 62 | name=EXPERIMENT_NAME, 63 | storage_path=STORAGE_PATH, 64 | ), 65 | ) 66 | results_grid = tuner.fit() 67 | results_df = process_results_grid(results_grid=results_grid) 68 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 69 | -------------------------------------------------------------------------------- /benchmarking/striatum/data_manager.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | 3 | """Benchmarking DataManager for the Striatum dataset 4 | """ 5 | 6 | import random 7 | import time 8 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 9 | 10 | import numpy as np 11 | import torch 12 | from numpy.typing import NDArray 13 | 14 | from pyrelational.data_managers import DataManager 15 | from pyrelational.datasets.classification.ksenia_et_al import StriatumDataset 16 | 17 | from ..classification_experiment_utils import pick_one_sample_per_class 18 | 19 | 20 | def get_stratium_data_manager() -> DataManager: 21 | # Add a random wait between 15 and 50 seconds to avoid race conditions 22 | # when creating the DataManager 23 | time.sleep(random.randint(15, 50)) 24 | 25 | ds = StriatumDataset() 26 | 27 | train_ds, valid_ds, test_ds = torch.utils.data.random_split(ds, [9900, 100, 10000]) 28 | train_indices = list(train_ds.indices) 29 | valid_indices = list(valid_ds.indices) 30 | test_indices = list(test_ds.indices) 31 | 32 | return DataManager( 33 | ds, 34 | train_indices=train_indices, 35 | validation_indices=valid_indices, 36 | test_indices=test_indices, 37 | # FIXME 38 | labelled_indices=pick_one_sample_per_class(ds, train_indices), 39 | loader_batch_size="full", 40 | loader_collate_fn=numpy_collate, 41 | ) 42 | 43 | 44 | def numpy_collate( 45 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 46 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 47 | """Collate function for a Pytorch to Numpy DataLoader""" 48 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 49 | -------------------------------------------------------------------------------- /benchmarking/striatum/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.ensemble import RandomForestClassifier 10 | from sklearn.metrics import auc 11 | 12 | from pyrelational.oracles import BenchmarkOracle 13 | from pyrelational.pipeline import Pipeline 14 | 15 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 16 | from ..classification_experiment_utils import ( 17 | SKRFC, 18 | experiment_param_space, 19 | get_strategy_from_string, 20 | ) 21 | from .data_manager import get_stratium_data_manager 22 | 23 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 24 | 25 | 26 | def trial(config: Dict[str, 
Any]) -> Dict[str, Union[float, NDArray[Union[np.float32, np.float64]]]]: 27 | seed = config["seed"] 28 | set_all_seeds(seed) 29 | strategy = get_strategy_from_string(config["strategy"]) 30 | data_manager = get_stratium_data_manager() 31 | model_config = {"n_estimators": 10, "bootstrap": True} 32 | trainer_config: Dict[str, Any] = {} 33 | model_manager = SKRFC(RandomForestClassifier, model_config, trainer_config) 34 | oracle = BenchmarkOracle() 35 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 36 | 37 | # Annotating data step by step until the trainset is fully annotated 38 | pipeline.run(num_annotate=10, num_iterations=250) 39 | print(pipeline) 40 | 41 | iteration_metrics = [] 42 | for i in range(len(pipeline.performances)): 43 | if "test_metric" in pipeline.performances[i]: 44 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 45 | 46 | iteration_metrics = np.array(iteration_metrics) 47 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 48 | 49 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 50 | 51 | 52 | if __name__ == "__main__": 53 | EXPERIMENT_NAME = "results" 54 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 55 | 56 | trial = tune.with_resources(trial, {"cpu": 4}) 57 | tuner = tune.Tuner( 58 | trial, 59 | tune_config=tune.TuneConfig(num_samples=1), 60 | param_space=experiment_param_space, 61 | run_config=RunConfig( 62 | name=EXPERIMENT_NAME, 63 | storage_path=STORAGE_PATH, 64 | ), 65 | ) 66 | results_grid = tuner.fit() 67 | results_df = process_results_grid(results_grid=results_grid) 68 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 69 | -------------------------------------------------------------------------------- /benchmarking/synthclass1/data_manager.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | 3 | """Benchmarking DataManager for the SynthClass1 dataset 4 | """ 5 | import random 6 | import time 7 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 8 | 9 | import numpy as np 10 | import torch 11 | from numpy.typing import NDArray 12 | 13 | from pyrelational.data_managers import DataManager 14 | from pyrelational.datasets.classification.synthetic import SynthClass1 15 | 16 | from ..classification_experiment_utils import ( 17 | make_class_stratified_train_val_test_split, 18 | pick_one_sample_per_class, 19 | ) 20 | 21 | 22 | def get_synthclass1_data_manager(seed: int) -> DataManager: 23 | # Add a random wait between 1 and 10 seconds to avoid race conditions 24 | # when creating the DataManager 25 | time.sleep(random.randint(1, 10)) 26 | ds = SynthClass1(random_seed=seed) 27 | train_indices, valid_indices, test_indices = make_class_stratified_train_val_test_split(ds, k=5) 28 | 29 | return DataManager( 30 | ds, 31 | train_indices=train_indices, 32 | validation_indices=valid_indices, 33 | test_indices=test_indices, 34 | # FIXME 35 | labelled_indices=pick_one_sample_per_class(ds, train_indices), 36 | loader_batch_size="full", 37 | loader_collate_fn=numpy_collate, 38 | ) 39 | 40 | 41 | def numpy_collate( 42 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 43 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 44 | """Collate function for a Pytorch to Numpy DataLoader""" 45 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in 
samples]) for samples in zip(*batch)] 46 | -------------------------------------------------------------------------------- /benchmarking/synthclass1/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.ensemble import RandomForestClassifier 10 | from sklearn.linear_model import LogisticRegression 11 | from sklearn.metrics import auc 12 | from sklearn.neural_network import MLPClassifier 13 | 14 | from pyrelational.oracles import BenchmarkOracle 15 | from pyrelational.pipeline import Pipeline 16 | 17 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 18 | from ..classification_experiment_utils import ( 19 | SKRFC, 20 | LogisticRegressor, 21 | experiment_param_space, 22 | get_strategy_from_string, 23 | ) 24 | from .data_manager import get_synthclass1_data_manager 25 | 26 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 27 | 28 | 29 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[np.float32, np.float64]]]]: 30 | seed = config["seed"] 31 | set_all_seeds(seed) 32 | strategy = get_strategy_from_string(config["strategy"]) 33 | data_manager = get_synthclass1_data_manager(seed=seed) 34 | # model_config = {"n_estimators": 3, "bootstrap": True, "max_depth": 3}  # unused random-forest config, superseded by the MLPClassifier config below 35 | model_config = {"random_state": seed, "hidden_layer_sizes": (128, 64), "early_stopping": True} 36 | trainer_config: Dict[str, Any] = {} 37 | model_manager = LogisticRegressor(MLPClassifier, model_config, trainer_config) 38 | oracle = BenchmarkOracle() 39 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 40 | 41 | # Annotating data step by step until the trainset is fully annotated 42 | pipeline.run(num_annotate=1) 43 | print(pipeline) 44 | 45 | iteration_metrics = [] 46 | for i in range(len(pipeline.performances)): 47 | if "test_metric" in pipeline.performances[i]: 48 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 49 | 50 | iteration_metrics = np.array(iteration_metrics) 51 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 52 | 53 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 54 | 55 | 56 | if __name__ == "__main__": 57 | EXPERIMENT_NAME = "results" 58 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 59 | 60 | trial = tune.with_resources(trial, {"cpu": 4}) 61 | tuner = tune.Tuner( 62 | trial, 63 | tune_config=tune.TuneConfig(num_samples=1), 64 | param_space=experiment_param_space, 65 | run_config=RunConfig( 66 | name=EXPERIMENT_NAME, 67 | storage_path=STORAGE_PATH, 68 | ), 69 | ) 70 | results_grid = tuner.fit() 71 | results_df = process_results_grid(results_grid=results_grid) 72 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 73 | -------------------------------------------------------------------------------- /benchmarking/synthclass2/data_manager.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | 3 | """Benchmarking DataManager for the SynthClass2 dataset 4 | """ 5 | import random 6 | import time 7 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 8 | 9 | import numpy as np 10 | import torch 11 | from numpy.typing import 
NDArray 12 | 13 | from pyrelational.data_managers import DataManager 14 | from pyrelational.datasets.classification.synthetic import SynthClass2 15 | 16 | from ..classification_experiment_utils import ( 17 | make_class_stratified_train_val_test_split, 18 | pick_one_sample_per_class, 19 | ) 20 | 21 | 22 | def get_synthclass2_data_manager(seed: int) -> DataManager: 23 | # Add a random wait between 1 and 10 seconds to avoid race conditions 24 | # when creating the DataManager 25 | time.sleep(random.randint(1, 10)) 26 | ds = SynthClass2(random_seed=seed) 27 | train_indices, valid_indices, test_indices = make_class_stratified_train_val_test_split(ds, k=5) 28 | 29 | return DataManager( 30 | ds, 31 | train_indices=train_indices, 32 | validation_indices=valid_indices, 33 | test_indices=test_indices, 34 | # FIXME 35 | labelled_indices=pick_one_sample_per_class(ds, train_indices), 36 | loader_batch_size="full", 37 | loader_collate_fn=numpy_collate, 38 | ) 39 | 40 | 41 | def numpy_collate( 42 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 43 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 44 | """Collate function for a Pytorch to Numpy DataLoader""" 45 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 46 | -------------------------------------------------------------------------------- /benchmarking/synthclass2/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.ensemble import RandomForestClassifier 10 | from sklearn.linear_model import LogisticRegression 11 | from sklearn.metrics import auc 12 | from sklearn.neural_network import MLPClassifier 13 | 14 | from pyrelational.oracles import BenchmarkOracle 15 | from pyrelational.pipeline import Pipeline 16 | 17 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 18 | from ..classification_experiment_utils import ( 19 | SKRFC, 20 | LogisticRegressor, 21 | experiment_param_space, 22 | get_strategy_from_string, 23 | ) 24 | from .data_manager import get_synthclass2_data_manager 25 | 26 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 27 | 28 | 29 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[np.float32, np.float64]]]]: 30 | seed = config["seed"] 31 | set_all_seeds(seed) 32 | strategy = get_strategy_from_string(config["strategy"]) 33 | data_manager = get_synthclass2_data_manager(seed=seed) 34 | # model_config = {"n_estimators": 3, "bootstrap": True, "max_depth": 3}  # unused random-forest config, superseded by the MLPClassifier config below 35 | model_config = {"random_state": seed, "hidden_layer_sizes": (128, 64), "early_stopping": True} 36 | trainer_config: Dict[str, Any] = {} 37 | model_manager = LogisticRegressor(MLPClassifier, model_config, trainer_config) 38 | oracle = BenchmarkOracle() 39 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 40 | 41 | # Annotating data step by step until the trainset is fully annotated 42 | pipeline.run(num_annotate=1) 43 | print(pipeline) 44 | 45 | iteration_metrics = [] 46 | for i in range(len(pipeline.performances)): 47 | if "test_metric" in pipeline.performances[i]: 48 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 49 | 50 | iteration_metrics = np.array(iteration_metrics) 51 | 
score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 52 | 53 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 54 | 55 | 56 | if __name__ == "__main__": 57 | EXPERIMENT_NAME = "results" 58 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 59 | 60 | trial = tune.with_resources(trial, {"cpu": 4}) 61 | tuner = tune.Tuner( 62 | trial, 63 | tune_config=tune.TuneConfig(num_samples=1), 64 | param_space=experiment_param_space, 65 | run_config=RunConfig( 66 | name=EXPERIMENT_NAME, 67 | storage_path=STORAGE_PATH, 68 | ), 69 | ) 70 | results_grid = tuner.fit() 71 | results_df = process_results_grid(results_grid=results_grid) 72 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 73 | -------------------------------------------------------------------------------- /benchmarking/synthclass3/data_manager.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | 3 | """Benchmarking DataManager for the SynthClass3 dataset 4 | """ 5 | import random 6 | import time 7 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 8 | 9 | import numpy as np 10 | import torch 11 | from numpy.typing import NDArray 12 | 13 | from pyrelational.data_managers import DataManager 14 | from pyrelational.datasets.classification.synthetic import SynthClass3 15 | 16 | from ..classification_experiment_utils import ( 17 | make_class_stratified_train_val_test_split, 18 | pick_one_sample_per_class, 19 | ) 20 | 21 | 22 | def get_synthclass3_data_manager(seed: int) -> DataManager: 23 | ds = SynthClass3(random_seed=seed) 24 | train_indices, valid_indices, test_indices = make_class_stratified_train_val_test_split(ds, k=5) 25 | 26 | return DataManager( 27 | ds, 28 | train_indices=train_indices, 29 | validation_indices=valid_indices, 30 | test_indices=test_indices, 31 | # FIXME 32 | labelled_indices=pick_one_sample_per_class(ds, train_indices), 33 | loader_batch_size="full", 34 | loader_collate_fn=numpy_collate, 35 | ) 36 | 37 | 38 | def numpy_collate( 39 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 40 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 41 | """Collate function for a Pytorch to Numpy DataLoader""" 42 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 43 | -------------------------------------------------------------------------------- /benchmarking/synthreg1/data_manager.py: -------------------------------------------------------------------------------- 1 | """Benchmarking DataManager for the SynthReg1 dataset 2 | """ 3 | 4 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 5 | 6 | import numpy as np 7 | import torch 8 | from numpy.typing import NDArray 9 | 10 | from pyrelational.data_managers import DataManager 11 | from pyrelational.datasets.regression.synthetic import SynthReg1 12 | 13 | 14 | def get_synthreg1_data_manager() -> DataManager: 15 | ds = SynthReg1() 16 | train_ds, valid_ds, test_ds = torch.utils.data.random_split(ds, [400, 50, 550]) 17 | train_indices = list(train_ds.indices) 18 | valid_indices = list(valid_ds.indices) 19 | test_indices = list(test_ds.indices) 20 | 21 | return DataManager( 22 | ds, 23 | train_indices=train_indices, 24 | validation_indices=valid_indices, 25 | test_indices=test_indices, 26 | labelled_indices=np.random.choice(train_indices, 1, replace=False).tolist(), 27 | 
loader_batch_size="full", 28 | loader_collate_fn=numpy_collate, 29 | ) 30 | 31 | 32 | def numpy_collate( 33 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 34 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 35 | """Collate function for a Pytorch to Numpy DataLoader""" 36 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 37 | -------------------------------------------------------------------------------- /benchmarking/synthreg1/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.linear_model import ElasticNet 10 | from sklearn.metrics import auc 11 | from sklearn.neural_network import MLPRegressor 12 | 13 | from pyrelational.oracles import BenchmarkOracle 14 | from pyrelational.pipeline import Pipeline 15 | 16 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 17 | from ..regression_experiment_utils import ( 18 | EnsembleScikit, 19 | experiment_param_space, 20 | get_strategy_from_string, 21 | numpy_collate, 22 | ) 23 | from .data_manager import get_synthreg1_data_manager 24 | 25 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 26 | 27 | 28 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[Any, np.float32, np.float64]]]]: 29 | seed = config["seed"] 30 | set_all_seeds(seed) 31 | strategy = get_strategy_from_string(config["strategy"]) 32 | data_manager = get_synthreg1_data_manager() 33 | model_config: Dict[str, Any] = { 34 | "random_state": seed, 35 | "max_iter": 1000, 36 | "hidden_layer_sizes": (32, 8, 4), 37 | "learning_rate_init": 3e-4, 38 | } 39 | trainer_config: Dict[str, Any] = {} 40 | model_manager: EnsembleScikit = EnsembleScikit(MLPRegressor, 10, model_config, trainer_config) 41 | oracle = BenchmarkOracle() 42 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 43 | 44 | # Annotating data step by step until the trainset is fully annotated 45 | pipeline.run(num_annotate=1, num_iterations=200) 46 | print(pipeline) 47 | 48 | iteration_metrics = [] 49 | for i in range(len(pipeline.performances)): 50 | if "test_metric" in pipeline.performances[i]: 51 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 52 | 53 | iteration_metrics = np.array(iteration_metrics) 54 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 55 | 56 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 57 | 58 | 59 | if __name__ == "__main__": 60 | EXPERIMENT_NAME = "results" 61 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 62 | 63 | trial = tune.with_resources(trial, {"cpu": 2}) 64 | tuner = tune.Tuner( 65 | trial, 66 | tune_config=tune.TuneConfig(num_samples=1), 67 | param_space=experiment_param_space, 68 | run_config=RunConfig( 69 | name=EXPERIMENT_NAME, 70 | storage_path=STORAGE_PATH, 71 | ), 72 | ) 73 | results_grid = tuner.fit() 74 | results_df = process_results_grid(results_grid=results_grid) 75 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 76 | -------------------------------------------------------------------------------- /benchmarking/synthreg2/data_manager.py: 
-------------------------------------------------------------------------------- 1 | """Benchmarking DataManager for the SynthReg2 dataset 2 | """ 3 | 4 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 5 | 6 | import numpy as np 7 | import torch 8 | from numpy.typing import NDArray 9 | 10 | from pyrelational.data_managers import DataManager 11 | from pyrelational.datasets.regression.synthetic import SynthReg2 12 | 13 | 14 | def get_synthreg2_data_manager() -> DataManager: 15 | ds = SynthReg2() 16 | train_ds, valid_ds, test_ds = torch.utils.data.random_split(ds, [400, 50, 550]) 17 | train_indices = list(train_ds.indices) 18 | valid_indices = list(valid_ds.indices) 19 | test_indices = list(test_ds.indices) 20 | 21 | return DataManager( 22 | ds, 23 | train_indices=train_indices, 24 | validation_indices=valid_indices, 25 | test_indices=test_indices, 26 | labelled_indices=np.random.choice(train_indices, 1, replace=False).tolist(), 27 | loader_batch_size="full", 28 | loader_collate_fn=numpy_collate, 29 | ) 30 | 31 | 32 | def numpy_collate( 33 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 34 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 35 | """Collate function for a Pytorch to Numpy DataLoader""" 36 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 37 | -------------------------------------------------------------------------------- /benchmarking/synthreg2/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.linear_model import ElasticNet 10 | from sklearn.metrics import auc 11 | from sklearn.neural_network import MLPRegressor 12 | 13 | from pyrelational.oracles import BenchmarkOracle 14 | from pyrelational.pipeline import Pipeline 15 | 16 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 17 | from ..regression_experiment_utils import ( 18 | EnsembleScikit, 19 | experiment_param_space, 20 | get_strategy_from_string, 21 | numpy_collate, 22 | ) 23 | from .data_manager import get_synthreg2_data_manager 24 | 25 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 26 | 27 | 28 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[Any, np.float32, np.float64]]]]: 29 | seed = config["seed"] 30 | set_all_seeds(seed) 31 | strategy = get_strategy_from_string(config["strategy"]) 32 | data_manager = get_synthreg2_data_manager() 33 | model_config: Dict[str, Any] = { 34 | "random_state": seed, 35 | "max_iter": 1000, 36 | "hidden_layer_sizes": (32, 8, 4), 37 | "learning_rate_init": 3e-4, 38 | } 39 | trainer_config: Dict[str, Any] = {} 40 | model_manager: EnsembleScikit = EnsembleScikit(MLPRegressor, 10, model_config, trainer_config) 41 | oracle = BenchmarkOracle() 42 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 43 | 44 | # Annotating data step by step until the trainset is fully annotated 45 | pipeline.run(num_annotate=1, num_iterations=200) 46 | print(pipeline) 47 | 48 | iteration_metrics = [] 49 | for i in range(len(pipeline.performances)): 50 | if "test_metric" in pipeline.performances[i]: 51 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 52 | 53 | iteration_metrics = 
np.array(iteration_metrics)
54 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics)
55 |
56 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics}
57 |
58 |
59 | if __name__ == "__main__":
60 | EXPERIMENT_NAME = "results"
61 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results")
62 |
63 | trial = tune.with_resources(trial, {"cpu": 2})
64 | tuner = tune.Tuner(
65 | trial,
66 | tune_config=tune.TuneConfig(num_samples=1),
67 | param_space=experiment_param_space,
68 | run_config=RunConfig(
69 | name=EXPERIMENT_NAME,
70 | storage_path=STORAGE_PATH,
71 | ),
72 | )
73 | results_grid = tuner.fit()
74 | results_df = process_results_grid(results_grid=results_grid)
75 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME)
76 | -------------------------------------------------------------------------------- /benchmarking/wine/data_manager.py: -------------------------------------------------------------------------------- 1 | """Benchmarking DataManager for the wine dataset
2 | """
3 |
4 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union
5 |
6 | import numpy as np
7 | import torch
8 | from numpy.typing import NDArray
9 |
10 | from pyrelational.data_managers import DataManager
11 | from pyrelational.datasets.regression.uci import UCIWine
12 |
13 |
14 | def get_wine_data_manager() -> DataManager:
15 | ds = UCIWine()
16 |
17 | train_ds, valid_ds, test_ds = torch.utils.data.random_split(ds, [1000, 100, 498])
18 | train_indices = list(train_ds.indices)
19 | valid_indices = list(valid_ds.indices)
20 | test_indices = list(test_ds.indices)
21 |
22 | return DataManager(
23 | ds,
24 | train_indices=train_indices,
25 | validation_indices=valid_indices,
26 | test_indices=test_indices,
27 | labelled_indices=np.random.choice(train_indices, 1, replace=False).tolist(),
28 | loader_batch_size="full",
29 | loader_collate_fn=numpy_collate,
30 | )
31 |
32 |
33 | def numpy_collate(
34 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]]
35 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]:
36 | """Collate function for a Pytorch to Numpy DataLoader"""
37 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)]
38 | -------------------------------------------------------------------------------- /benchmarking/wine/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore
2 | import os
3 | from typing import Any, Dict, Union
4 |
5 | import numpy as np
6 | from numpy.typing import NDArray
7 | from ray import tune
8 | from ray.train import RunConfig
9 | from sklearn.linear_model import ElasticNet
10 | from sklearn.metrics import auc
11 |
12 | from pyrelational.oracles import BenchmarkOracle
13 | from pyrelational.pipeline import Pipeline
14 |
15 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds
16 | from ..regression_experiment_utils import (
17 | EnsembleScikit,
18 | experiment_param_space,
19 | get_strategy_from_string,
20 | numpy_collate,
21 | )
22 | from .data_manager import get_wine_data_manager
23 |
24 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
25 |
26 |
27 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[Any, np.float32, np.float64]]]]:
28 | seed = config["seed"]
29 | set_all_seeds(seed)
30 | strategy = get_strategy_from_string(config["strategy"])
31 |
data_manager = get_wine_data_manager() 32 | model_config: Dict[str, Any] = {"random_state": seed} 33 | trainer_config: Dict[str, Any] = {} 34 | model_manager: EnsembleScikit = EnsembleScikit(ElasticNet, 5, model_config, trainer_config) 35 | oracle = BenchmarkOracle() 36 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 37 | 38 | # Annotating data step by step until the trainset is fully annotated 39 | pipeline.run(num_annotate=1, num_iterations=200) 40 | print(pipeline) 41 | 42 | iteration_metrics = [] 43 | for i in range(len(pipeline.performances)): 44 | if "test_metric" in pipeline.performances[i]: 45 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 46 | 47 | iteration_metrics = np.array(iteration_metrics) 48 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 49 | 50 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 51 | 52 | 53 | if __name__ == "__main__": 54 | EXPERIMENT_NAME = "results" 55 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 56 | 57 | trial = tune.with_resources(trial, {"cpu": 2}) 58 | tuner = tune.Tuner( 59 | trial, 60 | tune_config=tune.TuneConfig(num_samples=1), 61 | param_space=experiment_param_space, 62 | run_config=RunConfig( 63 | name=EXPERIMENT_NAME, 64 | storage_path=STORAGE_PATH, 65 | ), 66 | ) 67 | results_grid = tuner.fit() 68 | results_df = process_results_grid(results_grid=results_grid) 69 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 70 | -------------------------------------------------------------------------------- /benchmarking/yacht/data_manager.py: -------------------------------------------------------------------------------- 1 | """Benchmarking DataManager for the Yacht dataset 2 | """ 3 | 4 | from typing import Any, Dict, List, Optional, Type, TypeVar, Union 5 | 6 | import numpy as np 7 | import torch 8 | from numpy.typing import NDArray 9 | 10 | from pyrelational.data_managers import DataManager 11 | from pyrelational.datasets.regression.uci import UCIYacht 12 | 13 | 14 | def get_yacht_data_manager() -> DataManager: 15 | ds = UCIYacht() 16 | train_ds, valid_ds, test_ds = torch.utils.data.random_split(ds, [180, 20, 106]) 17 | train_indices = list(train_ds.indices) 18 | valid_indices = list(valid_ds.indices) 19 | test_indices = list(test_ds.indices) 20 | 21 | return DataManager( 22 | ds, 23 | train_indices=train_indices, 24 | validation_indices=valid_indices, 25 | test_indices=test_indices, 26 | labelled_indices=np.random.choice(train_indices, 20, replace=False).tolist(), 27 | loader_batch_size="full", 28 | loader_collate_fn=numpy_collate, 29 | ) 30 | 31 | 32 | def numpy_collate( 33 | batch: List[Union[torch.Tensor, NDArray[Union[Any, np.float32, np.float64]]]] 34 | ) -> List[NDArray[Union[Any, np.float32, np.float64]]]: 35 | """Collate function for a Pytorch to Numpy DataLoader""" 36 | return [np.stack([b.numpy() if isinstance(b, torch.Tensor) else b for b in samples]) for samples in zip(*batch)] 37 | -------------------------------------------------------------------------------- /benchmarking/yacht/run.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | import os 3 | from typing import Any, Dict, Union 4 | 5 | import numpy as np 6 | from numpy.typing import NDArray 7 | from ray import tune 8 | from ray.train import RunConfig 9 | from sklearn.linear_model import ElasticNet 10 
| from sklearn.metrics import auc 11 | from sklearn.neural_network import MLPRegressor 12 | 13 | from pyrelational.oracles import BenchmarkOracle 14 | from pyrelational.pipeline import Pipeline 15 | 16 | from ..benchmarking_utils import process_results_grid, save_results_df, set_all_seeds 17 | from ..regression_experiment_utils import ( 18 | EnsembleScikit, 19 | experiment_param_space, 20 | get_strategy_from_string, 21 | numpy_collate, 22 | ) 23 | from .data_manager import get_yacht_data_manager 24 | 25 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 26 | 27 | 28 | def trial(config: Dict[str, Any]) -> Dict[str, Union[float, NDArray[Union[Any, np.float32, np.float64]]]]: 29 | seed = config["seed"] 30 | set_all_seeds(seed) 31 | strategy = get_strategy_from_string(config["strategy"]) 32 | data_manager = get_yacht_data_manager() 33 | model_config: Dict[str, Any] = { 34 | "random_state": seed, 35 | "max_iter": 1000, 36 | "hidden_layer_sizes": (32, 8, 4), 37 | "early_stopping": True, 38 | "learning_rate_init": 3e-4, 39 | } 40 | trainer_config: Dict[str, Any] = {} 41 | model_manager: EnsembleScikit = EnsembleScikit(MLPRegressor, 10, model_config, trainer_config) 42 | oracle = BenchmarkOracle() 43 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 44 | 45 | # Annotating data step by step until the trainset is fully annotated 46 | pipeline.run(num_annotate=1) 47 | print(pipeline) 48 | 49 | iteration_metrics = [] 50 | for i in range(len(pipeline.performances)): 51 | if "test_metric" in pipeline.performances[i]: 52 | iteration_metrics.append(pipeline.performances[i]["test_metric"]) 53 | 54 | iteration_metrics = np.array(iteration_metrics) 55 | score_area_under_curve = auc(np.arange(len(iteration_metrics)), iteration_metrics) 56 | 57 | return {"score": score_area_under_curve, "iteration_metrics": iteration_metrics} 58 | 59 | 60 | if __name__ == "__main__": 61 | EXPERIMENT_NAME = "results" 62 | STORAGE_PATH = os.path.join(os.getcwd(), "ray_benchmark_results") 63 | 64 | trial = tune.with_resources(trial, {"cpu": 2}) 65 | tuner = tune.Tuner( 66 | trial, 67 | tune_config=tune.TuneConfig(num_samples=1), 68 | param_space=experiment_param_space, 69 | run_config=RunConfig( 70 | name=EXPERIMENT_NAME, 71 | storage_path=STORAGE_PATH, 72 | ), 73 | ) 74 | results_grid = tuner.fit() 75 | results_df = process_results_grid(results_grid=results_grid) 76 | save_results_df(results_df=results_df, storage_path=SCRIPT_DIR, experiment_name=EXPERIMENT_NAME) 77 | -------------------------------------------------------------------------------- /default_configs/pyl_trainer_base_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "epochs": 100, 3 | "period_eval": 1, 4 | "checkpoints_dir": "experiment_logs/", 5 | "checkpoints_name": "run", 6 | "monitor_metric_name": "loss", 7 | "monitor_metric_mode": "min", 8 | "use_early_stopping": false, 9 | "patience": 100, 10 | "save_top_k": 1 11 | } 12 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 
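# For example, "make html" renders the documentation into build/html, and
# extra sphinx-build flags can be passed through SPHINXOPTS, e.g.
# make html SPHINXOPTS="-W" to turn warnings into errors.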
6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SPHINXPROJ = PyRelationAL 9 | SOURCEDIR = source 10 | BUILDDIR = build 11 | 12 | # Put it first so that "make" without argument is like "make help". 13 | help: 14 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 15 | 16 | .PHONY: help Makefile 17 | 18 | # Catch-all target: route all unknown targets to Sphinx using the new 19 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 20 | %: Makefile 21 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 22 | -------------------------------------------------------------------------------- /docs/images/pyrelational_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RelationRx/pyrelational/fb83cff64241344396617d3b7712f69c852133b4/docs/images/pyrelational_overview.png -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/_static/data_indices_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RelationRx/pyrelational/fb83cff64241344396617d3b7712f69c852133b4/docs/source/_static/data_indices_diagram.png -------------------------------------------------------------------------------- /docs/source/_static/theme.css: -------------------------------------------------------------------------------- 1 | .wy-nav-content { 2 | min-width: 100% !important; 3 | } 4 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | import datetime 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
14 | #
15 | import os
16 | import sys
17 |
18 | import sphinx_rtd_theme
19 |
20 | sys.path.insert(0, os.path.abspath("../.."))
21 | import pyrelational # noqa: E402
22 |
23 | # -- Project information -----------------------------------------------------
24 |
25 | project = "PyRelationAL"
26 | author = "Relation Therapeutics"
27 | copyright = f"{datetime.datetime.now().year}, {author}"
28 |
29 | # The full version, including alpha/beta/rc tags
30 | release = pyrelational.__version__
31 |
32 |
33 | # -- General configuration ---------------------------------------------------
34 |
35 | # Add any Sphinx extension module names here, as strings. They can be
36 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
37 | # ones.
38 | extensions = [
39 | "sphinx.ext.autodoc",
40 | "sphinx.ext.napoleon",
41 | "sphinx.ext.mathjax",
42 | "sphinx.ext.viewcode",
43 | ]
44 |
45 | source_suffix = ".rst"
46 | master_doc = "index"
47 | autoclass_content = "both"
48 | add_module_names = False
49 |
50 | # Add any paths that contain templates here, relative to this directory.
51 | templates_path = ["_templates"]
52 |
53 | # List of patterns, relative to source directory, that match files and
54 | # directories to ignore when looking for source files.
55 | # This pattern also affects html_static_path and html_extra_path.
56 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
57 |
58 |
59 | # -- Options for HTML output -------------------------------------------------
60 |
61 | # The theme to use for HTML and HTML Help pages. See the documentation for
62 | # a list of builtin themes.
63 | #
64 | html_theme = "sphinx_rtd_theme"
65 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
66 | html_css_files = ["theme.css"]
67 |
68 |
69 | # Add any paths that contain custom static files (such as style sheets) here,
70 | # relative to this directory. They are copied after the builtin static files,
71 | # so a file named "default.css" will overwrite the builtin "default.css".
72 | html_static_path = ["_static"]
73 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. PyRelationAL documentation master file, created by
2 | sphinx-quickstart on Thu Jun 17 15:33:16 2021.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | :github_url: https://github.com/RelationRx/pyrelational
7 |
8 | Welcome to PyRelationAL's documentation!
9 | ========================================
10 |
11 | **PyRelationAL** is a Python active learning library developed by `Relation Therapeutics `_ for
12 | rapidly implementing active learning pipelines, from data management and model development (including Bayesian approximation) to creating novel active learning strategies.
13 |
14 | .. toctree::
15 | :maxdepth: 1
16 | :caption: Notes
17 |
18 | notes/activelearning
19 | notes/installation
20 | notes/quick_start
21 | notes/using_your_own_data
22 | notes/using_the_model_api
23 | notes/using_your_own_strategy
24 | notes/benchmark_datasets
25 |
26 | ..
toctree::
27 | :glob:
28 | :maxdepth: 2
29 | :caption: Package modules
30 |
31 | reference/data.rst
32 | reference/datasets.rst
33 | reference/models.rst
34 | reference/informativeness.rst
35 | reference/strategies.rst
36 | reference/oracles.rst
37 | reference/pipeline.rst
38 |
39 | Indices and tables
40 | ==================
41 |
42 | * :ref:`genindex`
43 | * :ref:`modindex`
44 | * :ref:`search`
45 |
46 |
47 | If the library is useful for your work, please consider citing **PyRelationAL**.
48 |
49 | .. code-block:: latex
50 |
51 | @misc{pyrelational,
52 | title={PyRelationAL},
53 | author={Relation Therapeutics},
54 | year={2021},
55 | publisher = {GitHub},
56 | journal = {GitHub repository},
57 | howpublished = {\url{https://github.com/RelationRx/pyrelational}}
58 | }
59 | -------------------------------------------------------------------------------- /docs/source/notes/al_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RelationRx/pyrelational/fb83cff64241344396617d3b7712f69c852133b4/docs/source/notes/al_pipeline.png -------------------------------------------------------------------------------- /docs/source/notes/eval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RelationRx/pyrelational/fb83cff64241344396617d3b7712f69c852133b4/docs/source/notes/eval.png -------------------------------------------------------------------------------- /docs/source/notes/installation.rst: -------------------------------------------------------------------------------- 1 | Installation
2 | ============
3 |
4 | We recommend installing our library inside a dedicated
5 | environment rather than as a root user, using for example
6 | `Anaconda `__.
7 | To use the library, **you will need Python 3.8 or newer**.
8 |
9 | Installation via Pip Wheels
10 | ---------------------------
11 |
12 | You can install the PyRelationAL library directly using pip:
13 |
14 | ::
15 |
16 | pip install pyrelational
17 |
18 | Installation from Source
19 | ------------------------
20 |
21 | Alternatively, you can install PyRelationAL directly from source:
22 |
23 | 1. Install the required packages (quote the specifiers so your shell does not interpret ``>=`` as a redirect):
24 |
25 | ::
26 |
27 | pip install "numpy>=1.20"
28 | pip install "pandas>=1.3"
29 | pip install "pytorch-lightning>=1.5"
30 | pip install "torch>=1.9.0"
31 | pip install "scikit-learn>=1.0.2"
32 |
33 | 2.
Install the additional packages needed to run our examples:
34 |
35 | ::
36 |
37 | pip install "torchvision>=0.10.0"
38 | pip install "gpytorch>=1.4"
39 | -------------------------------------------------------------------------------- /docs/source/notes/performance_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RelationRx/pyrelational/fb83cff64241344396617d3b7712f69c852133b4/docs/source/notes/performance_comparison.png -------------------------------------------------------------------------------- /docs/source/notes/training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RelationRx/pyrelational/fb83cff64241344396617d3b7712f69c852133b4/docs/source/notes/training.png -------------------------------------------------------------------------------- /docs/source/reference/data.rst: -------------------------------------------------------------------------------- 1 | pyrelational.data_managers
2 | ===========================
3 |
4 | Data Manager
5 | --------------------------------------
6 |
7 | .. automodule:: pyrelational.data_managers.data_manager
8 | :members:
9 | :undoc-members:
10 | :show-inheritance:
11 | -------------------------------------------------------------------------------- /docs/source/reference/datasets.rst: -------------------------------------------------------------------------------- 1 | pyrelational.datasets
2 | =====================
3 |
4 |
5 | Classification datasets
6 | -----------------------
7 |
8 | The following classes provide a variety of classic classification datasets that have been used in various active learning papers. Each behaves like a standard PyTorch Dataset.
9 |
10 | .. automodule:: pyrelational.datasets.classification
11 | :members:
12 | :undoc-members:
13 | :show-inheritance:
14 |
15 | Regression datasets
16 | -----------------------
17 |
18 | The following classes provide a variety of classic regression datasets that have been used in various active learning papers. Each behaves like a standard PyTorch Dataset.
19 |
20 | .. automodule:: pyrelational.datasets.regression
21 | :members:
22 | :undoc-members:
23 | :show-inheritance:
24 |
25 |
26 | Benchmark DataManager
27 | ---------------------
28 |
29 | The following functions accept the datasets defined in this package to produce DataManagers containing labelling initialisations that correspond to cold- and warm-start active learning tasks. These can be used for quickly benchmarking strategies.
30 |
31 | .. automodule:: pyrelational.datasets.benchmark_datamanager
32 | :members:
33 | :undoc-members:
34 | :show-inheritance:
35 | -------------------------------------------------------------------------------- /docs/source/reference/informativeness.rst: -------------------------------------------------------------------------------- 1 | pyrelational.informativeness
2 | ============================
3 |
4 | Informativeness functions for regression tasks
5 | ----------------------------------------------
6 |
7 | .. automodule:: pyrelational.informativeness.regression_scorers
8 | :members:
9 | :undoc-members:
10 | :show-inheritance:
11 |
12 | Informativeness functions for classification tasks
13 | --------------------------------------------------
14 |
15 | ..
automodule:: pyrelational.informativeness.classification_scorers
16 | :members:
17 | :undoc-members:
18 | :show-inheritance:
19 |
20 | Task agnostic informativeness functions
21 | ----------------------------------------
22 |
23 | .. automodule:: pyrelational.informativeness.task_agnostic_scorers
24 | :members:
25 | :undoc-members:
26 | :show-inheritance:
27 | -------------------------------------------------------------------------------- /docs/source/reference/models.rst: -------------------------------------------------------------------------------- 1 | pyrelational.model_managers
2 | ===========================
3 |
4 | Abstract Model Manager
5 | -----------------------------------------
6 |
7 | .. automodule:: pyrelational.model_managers.abstract_model_manager
8 | :members:
9 | :undoc-members:
10 | :special-members: __call__
11 | :show-inheritance:
12 |
13 | Pytorch Lightning Model
14 | -------------------------------------------
15 |
16 | .. automodule:: pyrelational.model_managers.lightning_model_manager
17 | :members:
18 | :undoc-members:
19 | :special-members: __call__
20 | :show-inheritance:
21 |
22 | Ensemble Models
23 | ------------------------------------------
24 |
25 | .. automodule:: pyrelational.model_managers.ensemble_model_manager
26 | :members:
27 | :undoc-members:
28 | :special-members: __call__
29 | :show-inheritance:
30 |
31 | MCDropout Models
32 | -------------------------------------------
33 |
34 | .. automodule:: pyrelational.model_managers.mcdropout_model_manager
35 | :members:
36 | :undoc-members:
37 | :special-members: __call__
38 | :show-inheritance:
39 | -------------------------------------------------------------------------------- /docs/source/reference/oracles.rst: -------------------------------------------------------------------------------- 1 | pyrelational.oracles
2 | ====================
3 |
4 | Abstract Oracle
5 | --------------------------------------------
6 |
7 | .. automodule:: pyrelational.oracles.abstract_oracle
8 | :members:
9 | :undoc-members:
10 | :show-inheritance:
11 |
12 | Benchmark Oracle
13 | ---------------------------------------------
14 |
15 | .. automodule:: pyrelational.oracles.benchmark_oracle
16 | :members:
17 | :undoc-members:
18 | :show-inheritance:
19 | -------------------------------------------------------------------------------- /docs/source/reference/pipeline.rst: -------------------------------------------------------------------------------- 1 | pyrelational.pipeline
2 | =====================
3 |
4 | Pipeline
5 | -------------------------------------
6 |
7 | .. automodule:: pyrelational.pipeline.pipeline
8 | :members:
9 | :undoc-members:
10 | :show-inheritance:
11 | -------------------------------------------------------------------------------- /examples/demo/ensemble_uncertainty_classification.py: -------------------------------------------------------------------------------- 1 | """
2 | This is a toy self-contained example of active learning on a classification
3 | task with the active learning library
4 |
5 | It illustrates the ensemble method.
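The ensemble members are trained independently and their averaged class
probabilities feed the acquisition score. As a minimal illustrative sketch of
the least-confidence score used below (not part of this script; ``member_probs``
is a hypothetical list of per-member class-probability tensors):

    probs = torch.stack(member_probs).mean(dim=0)  # average over ensemble members
    least_confidence = 1.0 - probs.max(dim=-1).values  # higher = more uncertain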
6 | """ 7 | 8 | # User imports 9 | import logging 10 | 11 | import torch 12 | from torchvision import datasets, transforms 13 | 14 | # Dataset and machine learning model 15 | from utils.ml_models import MnistClassification 16 | 17 | # Active Learning package 18 | from pyrelational.data_managers import DataManager 19 | from pyrelational.model_managers import LightningEnsembleModelManager 20 | from pyrelational.oracles import BenchmarkOracle 21 | from pyrelational.pipeline import Pipeline 22 | from pyrelational.strategies.classification import LeastConfidenceStrategy 23 | 24 | # dataset 25 | dataset = datasets.FashionMNIST(root="data", train=True, download=True, transform=transforms.ToTensor()) 26 | 27 | train_ds, val_ds, test_ds = torch.utils.data.random_split(dataset, [9000, 26000, 25000]) 28 | train_indices = train_ds.indices 29 | val_indices = val_ds.indices 30 | test_indices = test_ds.indices 31 | 32 | # model manager 33 | model_manager = LightningEnsembleModelManager( 34 | model_class=MnistClassification, model_config={}, trainer_config={"epochs": 5}, n_estimators=5 35 | ) 36 | 37 | # data_manager and defining strategy 38 | data_manager = DataManager( 39 | dataset=dataset, 40 | train_indices=train_indices, 41 | validation_indices=val_indices, 42 | test_indices=test_indices, 43 | loader_batch_size=1000, 44 | label_attr="targets", 45 | ) 46 | 47 | # Set up active learning pipeline 48 | strategy = LeastConfidenceStrategy() 49 | oracle = BenchmarkOracle() 50 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 51 | 52 | # Remove lightning prints 53 | logging.getLogger("lightning.pytorch").setLevel(logging.ERROR) 54 | 55 | # See performance with the full trainset labelled 56 | pipeline.compute_theoretical_performance() 57 | 58 | # New data to be annotated, followed by an update of the data_manager and model 59 | to_annotate = pipeline.step(num_annotate=1000) 60 | pipeline.query(indices=to_annotate) 61 | 62 | # Annotating data step by step until the trainset is fully annotated 63 | pipeline.run(num_annotate=1000) 64 | 65 | # Pretty printed summary of the components in the pipeline along with annotation/performance history 66 | print(pipeline) 67 | -------------------------------------------------------------------------------- /examples/demo/lightning_diversity_classification.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a toy self-contained example of active learning on a classification 3 | task with the active learning library 4 | 5 | Here we give an example of defining your own custom AL strategy 6 | """ 7 | 8 | import logging 9 | 10 | import torch 11 | 12 | # Dataset and machine learning model 13 | from utils.datasets import BreastCancerDataset 14 | from utils.ml_models import BreastCancerClassification 15 | 16 | # Active Learning package 17 | from pyrelational.data_managers import DataManager 18 | from pyrelational.model_managers import LightningModelManager 19 | from pyrelational.oracles import BenchmarkOracle 20 | from pyrelational.pipeline import Pipeline 21 | from pyrelational.strategies.task_agnostic.relative_distance_strategy import ( 22 | RelativeDistanceStrategy, 23 | ) 24 | 25 | # Obtain dataset and set up labelled and unlabelled subsets 26 | dataset = BreastCancerDataset() 27 | train_ds, val_ds, test_ds = torch.utils.data.random_split(dataset, [500, 30, 39]) 28 | train_indices = train_ds.indices 29 | val_indices = val_ds.indices 30 | test_indices = 
test_ds.indices 31 | 32 | # Instantiate model_manager 33 | model_manager = LightningModelManager( 34 | model_class=BreastCancerClassification, model_config={}, trainer_config={"epochs": 4} 35 | ) 36 | 37 | # data_manager and defining strategy 38 | data_manager = DataManager( 39 | dataset=dataset, 40 | train_indices=train_indices, 41 | validation_indices=val_indices, 42 | test_indices=test_indices, 43 | hit_ratio_at=5, 44 | ) 45 | 46 | # Setup 47 | strategy = RelativeDistanceStrategy() 48 | oracle = BenchmarkOracle() 49 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 50 | 51 | # Remove lightning prints 52 | logging.getLogger("lightning.pytorch").setLevel(logging.ERROR) 53 | 54 | # performance with the full trainset labelled 55 | pipeline.compute_theoretical_performance() 56 | 57 | # New data to be annotated, followed by an update of the data_manager and model 58 | to_annotate = pipeline.step(num_annotate=100) 59 | pipeline.query(indices=to_annotate) 60 | 61 | # Annotating data step by step until the trainset is fully annotated 62 | pipeline.run(num_annotate=100) 63 | 64 | # Pretty printed summary of the components in the pipeline along with annotation/performance history 65 | print(pipeline) 66 | -------------------------------------------------------------------------------- /examples/demo/lightning_diversity_regression.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a toy self-contained example of active learning on a regression 3 | task with the active learning library 4 | 5 | Here we give an example of defining your own custom AL strategy 6 | """ 7 | 8 | import logging 9 | 10 | import torch 11 | 12 | # Dataset and machine learning model 13 | from utils.datasets import DiabetesDataset 14 | from utils.ml_models import DiabetesRegression 15 | 16 | # Active Learning package 17 | from pyrelational.data_managers import DataManager 18 | from pyrelational.model_managers import LightningModelManager 19 | from pyrelational.oracles import BenchmarkOracle 20 | from pyrelational.pipeline import Pipeline 21 | from pyrelational.strategies.task_agnostic.relative_distance_strategy import ( 22 | RelativeDistanceStrategy, 23 | ) 24 | 25 | # dataset 26 | dataset = DiabetesDataset() 27 | train_ds, val_ds, test_ds = torch.utils.data.random_split(dataset, [400, 22, 20]) 28 | train_indices = train_ds.indices 29 | val_indices = val_ds.indices 30 | test_indices = test_ds.indices 31 | 32 | # model_manager 33 | model_manager = LightningModelManager(model_class=DiabetesRegression, model_config={}, trainer_config={"epochs": 4}) 34 | 35 | # data_manager and defining strategy 36 | data_manager = DataManager( 37 | dataset=dataset, 38 | train_indices=train_indices, 39 | validation_indices=val_indices, 40 | test_indices=test_indices, 41 | hit_ratio_at=5, 42 | ) 43 | 44 | # Setup pipeline 45 | strategy = RelativeDistanceStrategy() 46 | oracle = BenchmarkOracle() 47 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 48 | 49 | 50 | # Remove lightning prints 51 | logging.getLogger("lightning.pytorch").setLevel(logging.ERROR) 52 | 53 | # performance with the full trainset labelled 54 | pipeline.compute_theoretical_performance() 55 | 56 | # New data to be annotated, followed by an update of the data_manager and model 57 | to_annotate = pipeline.step(num_annotate=100) 58 | pipeline.query(indices=to_annotate) 59 | 60 | # Annotating data step by step 
until the trainset is fully annotated
61 | pipeline.run(num_annotate=100)
62 | print(pipeline)
63 | -------------------------------------------------------------------------------- /examples/demo/lightning_mixed_regression.py: -------------------------------------------------------------------------------- 1 | """
2 | TO DO: illustrate with dataset
3 | """
4 |
5 | import torch
6 |
7 | from pyrelational.batch_mode_samplers import TopKSampler
8 | from pyrelational.informativeness import StandardDeviation
9 | from pyrelational.strategies.regression.regression_strategy import RegressionStrategy
10 | from pyrelational.strategies.task_agnostic.representative_sampling_strategy import (
11 | representative_sampling,
12 | )
13 |
14 |
15 | class MixedStrategy(RegressionStrategy):
16 | """Implements a strategy that combines a standard-deviation uncertainty scorer with representative sampling.
17 | To this end, ``oversample_factor`` (default 10) times more samples than requested are selected with the uncertainty scorer;
18 | this shortlist is then reduced to the requested size via representative sampling.
19 | """
20 |
21 | def __init__(self, clustering_method: str, oversample_factor: int = 10):
22 | super().__init__(StandardDeviation(), TopKSampler())
23 | self.clustering_method = clustering_method
24 | self.oversample_factor = oversample_factor
25 |
26 | def __call__(self, num_annotate, data_manager, model_manager):
27 | ixs = super().__call__(num_annotate * self.oversample_factor, data_manager, model_manager)
28 | subquery = torch.stack(data_manager.get_sample_feature_vectors(ixs))
29 | new_ixs = representative_sampling(subquery, num_annotate, self.clustering_method)
30 | return [ixs[i] for i in new_ixs]
31 | -------------------------------------------------------------------------------- /examples/demo/lightning_representative_classification.py: -------------------------------------------------------------------------------- 1 | """
2 | This is a toy self-contained example of active learning on a classification
3 | task with the active learning library
4 |
5 | This example illustrates the Representative Sampling strategy.
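Instead of querying points the model is uncertain about, representative
sampling clusters the unlabelled pool and queries exemplars that summarise it.
A minimal sketch of the idea (hypothetical, not the library's internals;
``unlabelled_features`` is an assumed (n_samples, n_features) array):

    from sklearn.cluster import AffinityPropagation
    clustering = AffinityPropagation().fit(unlabelled_features)
    queries = clustering.cluster_centers_indices_  # one exemplar per cluster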
6 | """ 7 | 8 | import logging 9 | 10 | import torch 11 | 12 | # Dataset and machine learning model 13 | from utils.datasets import BreastCancerDataset 14 | from utils.ml_models import BreastCancerClassification 15 | 16 | # Active Learning package 17 | from pyrelational.data_managers import DataManager 18 | from pyrelational.model_managers import LightningModelManager 19 | from pyrelational.oracles import BenchmarkOracle 20 | from pyrelational.pipeline import Pipeline 21 | from pyrelational.strategies.task_agnostic.representative_sampling_strategy import ( 22 | RepresentativeSamplingStrategy, 23 | ) 24 | 25 | # dataset 26 | dataset = BreastCancerDataset() 27 | train_ds, val_ds, test_ds = torch.utils.data.random_split(dataset, [500, 30, 39]) 28 | train_indices = train_ds.indices 29 | val_indices = val_ds.indices 30 | test_indices = test_ds.indices 31 | 32 | # model_manager 33 | model_manager = LightningModelManager( 34 | model_class=BreastCancerClassification, model_config={}, trainer_config={"epochs": 4} 35 | ) 36 | 37 | # data_manager and defining strategy 38 | data_manager = DataManager( 39 | dataset=dataset, 40 | train_indices=train_indices, 41 | validation_indices=val_indices, 42 | test_indices=test_indices, 43 | loader_batch_size=100, 44 | ) 45 | 46 | # Setup 47 | strategy = RepresentativeSamplingStrategy(clustering_method="AffinityPropagation") 48 | oracle = BenchmarkOracle() 49 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 50 | 51 | # Remove lightning prints 52 | logging.getLogger("lightning.pytorch").setLevel(logging.ERROR) 53 | 54 | # performance with the full trainset labelled 55 | pipeline.compute_theoretical_performance() 56 | 57 | # New data to be annotated, followed by an update of the data_manager and model 58 | to_annotate = pipeline.step(num_annotate=100) 59 | pipeline.query(indices=to_annotate) 60 | 61 | # Annotating data step by step until the trainset is fully annotated 62 | pipeline.run(num_annotate=100) 63 | print(pipeline) 64 | -------------------------------------------------------------------------------- /examples/demo/mcdropout_uncertainty_classification.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a toy self-contained example of active learning on a classification 3 | task with the active learning library 4 | 5 | This example will use uncertainty arising from the standard deviation of the 6 | predictive distribution obtained via MCDropout 7 | """ 8 | 9 | import logging 10 | 11 | import torch 12 | 13 | # Pytorch 14 | from torchvision import datasets, transforms 15 | 16 | # Dataset and machine learning model 17 | from utils.ml_models import MnistClassification 18 | 19 | # Active Learning package 20 | from pyrelational.data_managers import DataManager 21 | from pyrelational.model_managers import LightningMCDropoutModelManager 22 | from pyrelational.oracles import BenchmarkOracle 23 | from pyrelational.pipeline import Pipeline 24 | from pyrelational.strategies.classification import LeastConfidenceStrategy 25 | 26 | # dataset 27 | dataset = datasets.FashionMNIST(root="data", train=True, download=True, transform=transforms.ToTensor()) 28 | train_ds, val_ds, test_ds = torch.utils.data.random_split(dataset, [9000, 25000, 26000]) 29 | train_indices = train_ds.indices 30 | val_indices = val_ds.indices 31 | test_indices = test_ds.indices 32 | 33 | # model 34 | model_manager = LightningMCDropoutModelManager( 35 | model_class=MnistClassification, 
model_config={"dropout": 0.2}, trainer_config={"epochs": 4} 36 | ) 37 | 38 | # data_manager and defining strategy 39 | data_manager = DataManager( 40 | dataset=dataset, 41 | train_indices=train_indices, 42 | validation_indices=val_indices, 43 | test_indices=test_indices, 44 | loader_batch_size=1000, 45 | label_attr="targets", 46 | ) 47 | 48 | strategy = LeastConfidenceStrategy() 49 | oracle = BenchmarkOracle() 50 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 51 | 52 | # Remove lightning prints 53 | logging.getLogger("lightning.pytorch").setLevel(logging.ERROR) 54 | 55 | # performance with the full trainset labelled 56 | pipeline.compute_theoretical_performance() 57 | 58 | # New data to be annotated, followed by an update of the data_manager and model 59 | to_annotate = pipeline.step(num_annotate=1000) 60 | pipeline.query(indices=to_annotate) 61 | 62 | # Annotating data step by step until the trainset is fully annotated 63 | pipeline.run(num_annotate=1000) 64 | print(pipeline) 65 | -------------------------------------------------------------------------------- /examples/demo/mcdropout_uncertainty_regression.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a toy self-contained example of active learning on a regression 3 | task with the active learning library 4 | 5 | This example will use uncertainty arising from the standard deviation of the 6 | predictive distribution obtained via MCDropout 7 | """ 8 | 9 | import logging 10 | 11 | import torch 12 | 13 | # Dataset and machine learning model 14 | from utils.datasets import DiabetesDataset 15 | from utils.ml_models import DiabetesRegression 16 | 17 | # Active Learning package 18 | from pyrelational.data_managers import DataManager 19 | from pyrelational.model_managers import LightningMCDropoutModelManager 20 | from pyrelational.oracles import BenchmarkOracle 21 | from pyrelational.pipeline import Pipeline 22 | from pyrelational.strategies.regression import VarianceReductionStrategy 23 | 24 | # dataset 25 | dataset = DiabetesDataset() 26 | train_ds, val_ds, test_ds = torch.utils.data.random_split(dataset, [400, 22, 20]) 27 | train_indices = train_ds.indices 28 | val_indices = val_ds.indices 29 | test_indices = test_ds.indices 30 | 31 | # model_manager 32 | model_manager = LightningMCDropoutModelManager( 33 | model_class=DiabetesRegression, model_config={}, trainer_config={"epochs": 4} 34 | ) 35 | 36 | # data_manager and defining strategy 37 | data_manager = DataManager( 38 | dataset=dataset, train_indices=train_indices, validation_indices=val_indices, test_indices=test_indices 39 | ) 40 | 41 | 42 | strategy = VarianceReductionStrategy() 43 | oracle = BenchmarkOracle() 44 | pipeline = Pipeline(data_manager=data_manager, model_manager=model_manager, strategy=strategy, oracle=oracle) 45 | 46 | # Remove lightning prints 47 | logging.getLogger("lightning.pytorch").setLevel(logging.ERROR) 48 | 49 | # performance with the full trainset labelled 50 | pipeline.compute_theoretical_performance() 51 | 52 | # New data to be annotated, followed by an update of the data_manager and model 53 | to_annotate = pipeline.step(num_annotate=100) 54 | pipeline.query(indices=to_annotate) 55 | 56 | # Annotating data step by step until the trainset is fully annotated 57 | pipeline.run(num_annotate=100) 58 | print(pipeline) 59 | -------------------------------------------------------------------------------- /examples/demo/utils/datasets.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Simple datasets in PyTorch to use in examples 3 | """ 4 | 5 | import torch 6 | from sklearn.datasets import load_breast_cancer, load_diabetes 7 | from torch.utils.data import Dataset 8 | 9 | 10 | class DiabetesDataset(Dataset): 11 | """A small regression dataset for examples""" 12 | 13 | def __init__(self): 14 | # Load the diabetes dataset 15 | diabetes_X, diabetes_y = load_diabetes(return_X_y=True) 16 | self.x = torch.FloatTensor(diabetes_X) 17 | self.y = torch.FloatTensor(diabetes_y) 18 | 19 | def __len__(self): 20 | return self.x.shape[0] 21 | 22 | def __getitem__(self, idx): 23 | return self.x[idx], self.y[idx] 24 | 25 | 26 | class BreastCancerDataset(Dataset): 27 | """A small classification dataset for examples""" 28 | 29 | def __init__(self): 30 | super(BreastCancerDataset, self).__init__() 31 | sk_x, sk_y = load_breast_cancer(return_X_y=True) 32 | self.x = torch.FloatTensor(sk_x) 33 | self.y = torch.LongTensor(sk_y) 34 | 35 | def __len__(self): 36 | return self.x.shape[0] 37 | 38 | def __getitem__(self, idx): 39 | return self.x[idx], self.y[idx] 40 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 120 3 | target-version = ['py38', 'py39'] 4 | include = '\.pyi?$' 5 | exclude = ''' 6 | /( 7 | )/ 8 | ''' 9 | -------------------------------------------------------------------------------- /pyrelational/__init__.py: -------------------------------------------------------------------------------- 1 | import pyrelational.data_managers 2 | import pyrelational.informativeness 3 | import pyrelational.model_managers 4 | import pyrelational.pipeline 5 | import pyrelational.strategies 6 | from pyrelational.version import __version__ 7 | -------------------------------------------------------------------------------- /pyrelational/batch_mode_samplers/__init__.py: -------------------------------------------------------------------------------- 1 | """Query samplers module.""" 2 | 3 | from pyrelational.batch_mode_samplers._batch_mode_samplers import ( 4 | BatchModeSampler, 5 | ProbabilisticSampler, 6 | TopKSampler, 7 | ) 8 | -------------------------------------------------------------------------------- /pyrelational/batch_mode_samplers/_batch_mode_samplers.py: -------------------------------------------------------------------------------- 1 | """Collection of samplers for active learning strategies.""" 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import List 5 | 6 | import torch 7 | from torch import Tensor 8 | 9 | 10 | class BatchModeSampler(ABC): 11 | """Abstract sampler class.""" 12 | 13 | @abstractmethod 14 | def __call__(self, scores: Tensor, indices: List[int], num_samples: int) -> List[int]: 15 | """Sample a subset of indices based on the scores. 16 | 17 | This method should be implemented by the subclasses. 18 | :return: List of sampled indices. 19 | """ 20 | pass 21 | 22 | 23 | class TopKSampler(BatchModeSampler): 24 | """Deterministic sampler based on the top-k scores.""" 25 | 26 | def __call__(self, scores: Tensor, indices: List[int], num_samples: int) -> List[int]: 27 | """Sample the top-k indices based on the scores. 28 | 29 | :return: List of sampled indices. 
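Illustrative example:

    >>> import torch
    >>> TopKSampler()(torch.tensor([0.1, 0.9, 0.5]), indices=[7, 8, 9], num_samples=2)
    [8, 9]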
30 | """ 31 | ixs = torch.argsort(scores, descending=True).tolist() 32 | return [indices[i] for i in ixs[:num_samples]] 33 | 34 | 35 | class ProbabilisticSampler(BatchModeSampler): 36 | """Probabilistic sampler based on the scores.""" 37 | 38 | def __call__(self, scores: Tensor, indices: List[int], num_samples: int) -> List[int]: 39 | """Sample a subset of indices deriving a distribution from the scores. 40 | 41 | :return: List of sampled indices. 42 | """ 43 | num_samples = min(num_samples, len(indices)) 44 | return [indices[i] for i in torch.multinomial(scores, num_samples, replacement=False).tolist()] 45 | -------------------------------------------------------------------------------- /pyrelational/data_managers/__init__.py: -------------------------------------------------------------------------------- 1 | from pyrelational.data_managers.data_manager import DataManager 2 | -------------------------------------------------------------------------------- /pyrelational/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from pyrelational.datasets.classification import ( 2 | MNIST, 3 | BreastCancerDataset, 4 | Checkerboard2x2Dataset, 5 | Checkerboard4x4Dataset, 6 | CreditCardDataset, 7 | DigitDataset, 8 | FashionMNIST, 9 | GaussianCloudsDataset, 10 | StriatumDataset, 11 | SynthClass1, 12 | SynthClass2, 13 | SynthClass3, 14 | UCIGlass, 15 | UCIParkinsons, 16 | UCISeeds, 17 | ) 18 | from pyrelational.datasets.drugcomb import DrugCombDataset 19 | from pyrelational.datasets.regression import ( 20 | DiabetesDataset, 21 | SynthReg1, 22 | SynthReg2, 23 | UCIAirfoil, 24 | UCIConcrete, 25 | UCIEnergy, 26 | UCIPower, 27 | UCIWine, 28 | UCIYacht, 29 | ) 30 | from pyrelational.datasets.uci_datasets import UCIDatasets 31 | -------------------------------------------------------------------------------- /pyrelational/datasets/base.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | from torch import Tensor 4 | from torch.utils.data import Dataset 5 | 6 | 7 | class BaseDataset(Dataset[Tuple[Tensor, Tensor]]): 8 | """A base class for all datasets to inherit from. 9 | 10 | :param n_splits: Number of splits for cross-validation. 11 | :param random_seed: Seed for random number generator for reproducibility. 12 | """ 13 | 14 | x: Tensor 15 | y: Tensor 16 | 17 | def __init__(self, n_splits: int = 3, random_seed: int = 42): 18 | """Initialize the BaseDataset with the number of splits and a seed for reproducibility. 19 | 20 | :param n_splits: Number of splits for stratified k-fold. 21 | :param random_seed: Random seed for reproducibility. 22 | """ 23 | super(BaseDataset, self).__init__() 24 | self.n_splits = n_splits 25 | self.random_seed = random_seed 26 | 27 | def __len__(self) -> int: 28 | """Return the total number of samples in the dataset. 29 | 30 | :return: Total number of samples. 31 | """ 32 | return len(self.x) 33 | 34 | def __getitem__(self, idx: int) -> Tuple[Tensor, Tensor]: 35 | """Fetch the sample and its corresponding label at the given index. 36 | 37 | :param idx: Index of the sample to retrieve. 38 | :return: Tuple containing the sample and its label. 
39 | """ 40 | return self.x[idx], self.y[idx] 41 | -------------------------------------------------------------------------------- /pyrelational/datasets/classification/__init__.py: -------------------------------------------------------------------------------- 1 | from pyrelational.datasets.classification.andrea_et_al import CreditCardDataset 2 | from pyrelational.datasets.classification.fashion_mnist import FashionMNIST 3 | from pyrelational.datasets.classification.ksenia_et_al import ( 4 | Checkerboard2x2Dataset, 5 | Checkerboard4x4Dataset, 6 | GaussianCloudsDataset, 7 | StriatumDataset, 8 | ) 9 | from pyrelational.datasets.classification.mnist import MNIST 10 | from pyrelational.datasets.classification.scikit_learn import ( 11 | BreastCancerDataset, 12 | DigitDataset, 13 | ) 14 | from pyrelational.datasets.classification.synthetic import ( 15 | SynthClass1, 16 | SynthClass2, 17 | SynthClass3, 18 | ) 19 | from pyrelational.datasets.classification.uci import UCIGlass, UCIParkinsons, UCISeeds 20 | -------------------------------------------------------------------------------- /pyrelational/datasets/classification/andrea_et_al.py: -------------------------------------------------------------------------------- 1 | import pyreadr 2 | import torch 3 | 4 | from pyrelational.datasets.base import BaseDataset 5 | from pyrelational.datasets.download_utils import download_file 6 | 7 | from .utils import create_splits, remap_to_int 8 | 9 | 10 | class CreditCardDataset(BaseDataset): 11 | """Credit card fraud dataset, highly unbalanced and challenging. 12 | 13 | From Andrea Dal Pozzolo, Olivier Caelen, Reid A. Johnson, and Gianluca Bontempi. 14 | Calibrating probability with undersampling for unbalanced classification. In 2015 15 | IEEE Symposium Series on Computational Intelligence, pages 159–166, 2015. 
16 |
17 | We use the original data from http://www.ulb.ac.be/di/map/adalpozz/data/creditcard.Rdata
18 | processed using pyreadr.
19 |
20 | :param data_dir: path where to save the raw data, defaults to /tmp/
21 | :param n_splits: an int describing the number of class stratified
22 | splits to compute
23 | :param random_seed: random seed for reproducibility on splits
24 | """
25 |
26 | raw_url = "http://www.ulb.ac.be/di/map/adalpozz/data/creditcard.Rdata"
27 |
28 | def __init__(self, data_dir: str = "/tmp/", n_splits: int = 5, random_seed: int = 0):
29 | super().__init__(n_splits=n_splits, random_seed=random_seed)
30 | self.data_dir = data_dir
31 | self.n_splits = n_splits
32 | self._load_dataset()
33 |
34 | def _load_dataset(self) -> None:
35 | download_file(self.raw_url, self.data_dir)
36 | file_name = self.raw_url.split("/")[-1]
37 | data = pyreadr.read_r(self.data_dir + file_name)
38 |
39 | data = data["creditcard"]
40 | data.reset_index(inplace=True)
41 | xcols = data.columns[1:-1]
42 | self.x = torch.from_numpy(data[xcols].to_numpy()).float()
43 | self.y = remap_to_int(torch.from_numpy(data["Class"].to_numpy().astype(int)))
44 | self.data_splits = create_splits(self.x, self.y, self.n_splits, self.random_seed)
45 | -------------------------------------------------------------------------------- /pyrelational/datasets/classification/fashion_mnist.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple
2 |
3 | import torch
4 | from torch import Tensor
5 | from torch.utils.data import ConcatDataset
6 | from torchvision import datasets, transforms
7 |
8 | from pyrelational.datasets.base import BaseDataset
9 | from pyrelational.datasets.classification.utils import create_splits
10 |
11 |
12 | class FashionMNIST(BaseDataset):
13 | """Fashion MNIST dataset class that handles downloading, transforming, and loading Fashion MNIST data.
14 |
15 | This dataset includes images from 10 categories of clothing, each represented as a 28x28 grayscale image.
16 | :param data_dir: Directory to store or read the Fashion MNIST data.
17 | :param n_splits: Number of stratified splits for the dataset.
18 | :param random_seed: Seed for random number generator for reproducibility.
19 | """
20 |
21 | def __init__(self, data_dir: str = "/tmp/", n_splits: int = 5, random_seed: int = 1234):
22 | """Instantiate the FashionMNIST dataset class.
23 |
24 | :param data_dir: directory where to download the data, defaults to "/tmp/"
25 | :param n_splits: number of splits to generate, defaults to 5
26 | :param random_seed: random seed, defaults to 1234
27 | """
28 | super().__init__(n_splits=n_splits, random_seed=random_seed)
29 | self.data_dir = data_dir
30 | self._load_data()
31 |
32 | def _load_data(self) -> None:
33 | """Load the Fashion MNIST dataset from torchvision datasets.
34 |
35 | We apply a transformation to convert images to tensors, and concatenate the train and test datasets into
36 | a single dataset for unified handling.
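Each 28x28 image is flattened into a 784-dimensional vector so the data
can be consumed by non-convolutional models.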
37 | """ 38 | train_dataset = datasets.FashionMNIST( 39 | root=self.data_dir, train=True, download=True, transform=transforms.ToTensor() 40 | ) 41 | test_dataset = datasets.FashionMNIST( 42 | root=self.data_dir, train=False, download=True, transform=transforms.ToTensor() 43 | ) 44 | 45 | # Concatenate the train and test datasets 46 | self.full_dataset: ConcatDataset[Tuple[Tensor, Tensor]] = ConcatDataset([train_dataset, test_dataset]) 47 | self.x = torch.stack([(self.full_dataset[i][0]).flatten() for i in range(len(self.full_dataset))]) 48 | self.y = torch.stack([torch.tensor(self.full_dataset[i][1]) for i in range(len(self.full_dataset))]) 49 | 50 | # Create splits for cross-validation 51 | self.data_splits = create_splits(self.x, self.y, self.n_splits, self.random_seed) 52 | -------------------------------------------------------------------------------- /pyrelational/datasets/classification/mnist.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import numpy as np 4 | import torch 5 | from torch import Tensor 6 | from torch.utils.data import ConcatDataset 7 | from torchvision import datasets, transforms 8 | 9 | from pyrelational.datasets.base import BaseDataset 10 | 11 | 12 | class MNIST(BaseDataset): 13 | """ 14 | MNIST dataset class that handles downloading, transforming, and loading MNIST data. 15 | 16 | :param data_dir: Directory to store or read the MNIST data. 17 | :param n_splits: Number of stratified splits for the dataset. 18 | :param random_seed: Seed for random number generator for reproducibility. 19 | """ 20 | 21 | def __init__(self, data_dir: str = "/tmp/", random_seed: int = 1234): 22 | """Instantiate the MNIST dataset class. 23 | 24 | :param data_dir: directory where to download the data, defaults to "/tmp/" 25 | :param random_seed: random seed, defaults to 1234 26 | """ 27 | super().__init__(random_seed=random_seed) 28 | self.data_dir = data_dir 29 | self._load_data() 30 | 31 | def _load_data(self) -> None: 32 | """Load the MNIST dataset from torchvision datasets. 33 | 34 | We apply the standard transformation with tensor conversation and normalisation. 35 | We concatenate the train and test datasets into a single dataset for unified handling, but 36 | we keep the same fixed test set. 
37 | """ 38 | transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]) 39 | train_dataset = datasets.MNIST(root=self.data_dir, train=True, download=True, transform=transform) 40 | test_dataset = datasets.MNIST(root=self.data_dir, train=False, download=True, transform=transform) 41 | 42 | # Concatenate the train and test datasets 43 | self.full_dataset: ConcatDataset[Tuple[Tensor, Tensor]] = ConcatDataset([train_dataset, test_dataset]) 44 | self.x = torch.stack([(self.full_dataset[i][0]) for i in range(len(self.full_dataset))]) 45 | self.y = torch.stack([torch.tensor(self.full_dataset[i][1]) for i in range(len(self.full_dataset))]) 46 | 47 | # Create splits for cross-validation 48 | train_ix, test_ix = np.arange(len(train_dataset)), np.arange(len(test_dataset)) + len(train_dataset) 49 | self.data_splits = [(train_ix, test_ix)] 50 | -------------------------------------------------------------------------------- /pyrelational/datasets/classification/scikit_learn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from sklearn.datasets import load_breast_cancer, load_digits 3 | 4 | from pyrelational.datasets.base import BaseDataset 5 | 6 | from .utils import create_splits 7 | 8 | 9 | class BreastCancerDataset(BaseDataset): 10 | """ 11 | UCI ML Breast Cancer Wisconsin (Diagnostic) dataset handler. 12 | 13 | This dataset features measurements from digitized images of breast mass and uses these features to classify 14 | the observations as benign or malignant. 15 | 16 | :param n_splits: Number of stratified splits for cross-validation. 17 | :param random_seed: Seed for random number generator for reproducibility. 18 | """ 19 | 20 | def __init__(self, n_splits: int = 5, random_seed: int = 0): 21 | super().__init__(n_splits=n_splits, random_seed=random_seed) 22 | self._load_data() 23 | 24 | def _load_data(self) -> None: 25 | """ 26 | Load and preprocess the Breast Cancer dataset. This method handles the conversion of the dataset into tensors 27 | suitable for model input and sets up splits. 28 | """ 29 | data, labels = load_breast_cancer(return_X_y=True) 30 | self.x = torch.tensor(data, dtype=torch.float) 31 | self.y = torch.tensor(labels, dtype=torch.long) 32 | self.data_splits = create_splits(self.x, self.y, self.n_splits, self.random_seed) 33 | 34 | 35 | class DigitDataset(BaseDataset): 36 | """UCI ML hand-written digits datasets 37 | 38 | From C. Kaynak (1995) Methods of Combining Multiple Classifiers and 39 | Their Applications to Handwritten Digit Recognition, MSc Thesis, 40 | Institute of Graduate Studies in Science and Engineering, Bogazici 41 | University. 42 | 43 | :param n_splits: an int describing the number of class stratified 44 | splits to compute 45 | :param random_seed: int setting the random seed for reproducibility 46 | """ 47 | 48 | def __init__(self, n_splits: int = 5, random_seed: int = 0): 49 | super().__init__(n_splits=n_splits, random_seed=random_seed) 50 | self._load_data() 51 | 52 | def _load_data(self) -> None: 53 | """ 54 | Load and preprocess the Digit dataset. This method handles the conversion of the dataset into tensors 55 | suitable for model input and sets up splits. 
56 | """ 57 | sk_x, sk_y = load_digits(return_X_y=True) 58 | self.x = torch.FloatTensor(sk_x) 59 | self.y = torch.LongTensor(sk_y) 60 | self.data_splits = create_splits(self.x, self.y, self.n_splits, self.random_seed) 61 | -------------------------------------------------------------------------------- /pyrelational/datasets/classification/uci.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from pyrelational.datasets.base import BaseDataset 4 | from pyrelational.datasets.uci_datasets import UCIDatasets 5 | 6 | from .utils import remap_to_int 7 | 8 | 9 | class UCIClassification(BaseDataset): 10 | """ 11 | A generic class for handling UCI datasets, providing mechanisms to download, preprocess, and split the dataset. 12 | 13 | :param name: Identifier for the UCI dataset. 14 | :param data_dir: Directory where datasets are stored or will be downloaded. 15 | :param n_splits: Number of stratified splits for cross-validation. 16 | :param random_seed: Random seed for reproducibility of splits. 17 | """ 18 | 19 | def __init__(self, name: str, data_dir: str = "/tmp/", n_splits: int = 5, random_seed: int = 0): 20 | super().__init__(n_splits=n_splits, random_seed=random_seed) 21 | self.data_dir = data_dir 22 | self.dataset = UCIDatasets(name=name, data_dir=data_dir, n_splits=n_splits, random_seed=random_seed) 23 | self._load_data() 24 | 25 | def _load_data(self) -> None: 26 | """ 27 | Load and preprocess the dataset. This involves loading the data using UCIDatasets, 28 | possibly transforming it, and preparing it for model training. 29 | """ 30 | data, labels = self.dataset.get_data() 31 | self.x = torch.tensor(data, dtype=torch.float) 32 | self.y = torch.tensor(labels, dtype=torch.long) 33 | self.y = remap_to_int(self.y) 34 | self.name = self.dataset.name 35 | self.data_splits = self.dataset.data_splits 36 | 37 | 38 | class UCIGlass(UCIClassification): 39 | """ 40 | UCI Glass dataset for classification tasks. 41 | 42 | Inherits from UCIClassification and uses its mechanisms to load and preprocess the Glass dataset specifically. 43 | """ 44 | 45 | def __init__(self, data_dir: str = "/tmp/", n_splits: int = 5, random_seed: int = 0): 46 | super().__init__(name="glass", data_dir=data_dir, n_splits=n_splits, random_seed=random_seed) 47 | 48 | 49 | class UCIParkinsons(UCIClassification): 50 | """ 51 | UCI Parkinsons dataset for classification tasks. 52 | 53 | Inherits from UCIClassification and uses its mechanisms to load and 54 | preprocess the Parkinsons dataset specifically. 55 | """ 56 | 57 | def __init__(self, data_dir: str = "/tmp/", n_splits: int = 5, random_seed: int = 0): 58 | super().__init__(name="parkinsons", data_dir=data_dir, n_splits=n_splits, random_seed=random_seed) 59 | 60 | 61 | class UCISeeds(UCIClassification): 62 | """ 63 | UCI Seeds dataset for classification tasks. 64 | 65 | Inherits from UCIClassification and uses its mechanisms to load and 66 | preprocess the Seeds dataset specifically. 
67 | """ 68 | 69 | def __init__(self, data_dir: str = "/tmp/", n_splits: int = 5, random_seed: int = 0): 70 | super().__init__(name="seeds", data_dir=data_dir, n_splits=n_splits, random_seed=random_seed) 71 | -------------------------------------------------------------------------------- /pyrelational/datasets/classification/utils.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | import numpy as np 4 | from numpy.typing import NDArray 5 | from sklearn.model_selection import StratifiedKFold 6 | from torch import Tensor 7 | 8 | 9 | def remap_to_int(torch_class_array: Tensor) -> Tensor: 10 | """Remap the elements in a torch tensor to contiguous integers starting from 0. 11 | 12 | This is useful for classification tasks where class labels should start from zero and be contiguous. 13 | :param torch_class_array: A torch.Tensor containing class labels, possibly non-integer or non-contiguous. 14 | :return: A torch.Tensor with class labels remapped to integers starting from 0. 15 | 16 | Example: 17 | >>> torch_class_array = torch.tensor([10, 10, 20, 20, 30]) 18 | >>> remap_to_int(torch_class_array) 19 | tensor([0, 0, 1, 1, 2]) 20 | """ 21 | remapped_labels: Tensor = torch_class_array.unique(return_inverse=True)[1] 22 | return remapped_labels 23 | 24 | 25 | def create_splits( 26 | x: Tensor, y: Tensor, n_splits: int, random_seed: int 27 | ) -> List[Tuple[NDArray[np.int_], NDArray[np.int_]]]: 28 | """Create stratified k-fold splits for the dataset using the dataset's features and labels.""" 29 | skf = StratifiedKFold(n_splits=n_splits, random_state=random_seed, shuffle=True) 30 | return list(skf.split(x.numpy(), y.numpy())) 31 | -------------------------------------------------------------------------------- /pyrelational/datasets/regression/__init__.py: -------------------------------------------------------------------------------- 1 | from pyrelational.datasets.regression.scikit_learn import DiabetesDataset 2 | from pyrelational.datasets.regression.synthetic import SynthReg1, SynthReg2 3 | from pyrelational.datasets.regression.uci import ( 4 | UCIAirfoil, 5 | UCIConcrete, 6 | UCIEnergy, 7 | UCIPower, 8 | UCIWine, 9 | UCIYacht, 10 | ) 11 | -------------------------------------------------------------------------------- /pyrelational/datasets/regression/scikit_learn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from sklearn.datasets import load_diabetes 3 | 4 | from pyrelational.datasets.base import BaseDataset 5 | 6 | from .utils import create_splits 7 | 8 | 9 | class DiabetesDataset(BaseDataset): 10 | """A small regression dataset for examples 11 | 12 | From Bradley Efron, Trevor Hastie, Iain Johnstone and 13 | Robert Tibshirani (2004) “Least Angle Regression,” 14 | Annals of Statistics (with discussion), 407-499. 
15 | 16 | :param n_splits: an int describing the number of k-fold splits to compute 17 | :param random_seed: int setting the random seed for reproducibility 18 | """ 19 | 20 | def __init__(self, n_splits: int = 5, random_seed: int = 0): 21 | super().__init__(n_splits=n_splits, random_seed=random_seed) 22 | # Load the diabetes dataset 23 | diabetes_X, diabetes_y = load_diabetes(return_X_y=True) 24 | self.x = torch.FloatTensor(diabetes_X) 25 | self.y = torch.FloatTensor(diabetes_y) 26 | 27 | self.data_splits = create_splits(self.x, self.y, self.n_splits, self.random_seed) 28 | -------------------------------------------------------------------------------- /pyrelational/datasets/regression/synthetic.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from sklearn.datasets import make_regression 4 | 5 | from pyrelational.datasets.base import BaseDataset 6 | 7 | from .utils import create_splits 8 | 9 | 10 | class SynthReg1(BaseDataset): 11 | """Synthetic dataset for active learning on a regression based task 12 | 13 | Simple regression problem with a single degree of freedom that suits 14 | both warm-start and cold-start active learning setups 15 | 16 | :param n_splits: an int describing the number of k-fold 17 | splits to compute 18 | :param size: an int describing the number of observations the dataset 19 | is to have 20 | :param random_seed: random seed for reproducibility on splits 21 | """ 22 | 23 | def __init__(self, n_splits: int = 5, size: int = 1000, random_seed: int = 1234): 24 | super().__init__(n_splits=n_splits, random_seed=random_seed) 25 | self._create_data(size, random_seed) 26 | 27 | def _create_data(self, size: int, random_seed: int) -> None: 28 | x, y = make_regression( 29 | n_samples=size, 30 | n_features=1, 31 | n_targets=1, 32 | random_state=random_seed, 33 | ) 34 | 35 | self.x = torch.FloatTensor(x) 36 | self.y = torch.FloatTensor(y) 37 | self.data_splits = create_splits(self.x, self.y, self.n_splits, self.random_seed) 38 | 39 | 40 | class SynthReg2(BaseDataset): 41 | """Synthetic dataset for active learning on a regression based task 42 | 43 | A more challenging dataset than SynthReg1 wherein we see a periodic 44 | pattern with 2 degrees of freedom.
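Example (illustrative sketch; the features stack the z and x coordinates, the targets are the y coordinate):
    >>> ds = SynthReg2(size=1000)
    >>> ds.x.shape, ds.y.shape
    (torch.Size([1000, 2]), torch.Size([1000]))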
45 | 46 | :param n_splits: an int describing the number of k-fold 47 | splits to compute 48 | :param size: an int describing the number of observations the dataset 49 | is to have 50 | :param random_seed: random seed for reproducibility of the splits (the noise itself is drawn from the global numpy RNG) 51 | """ 52 | 53 | def __init__(self, n_splits: int = 5, size: int = 1000, random_seed: int = 1234): 54 | super().__init__(n_splits=n_splits, random_seed=random_seed) 55 | self._create_data(size) 56 | 57 | def _create_data(self, size: int) -> None: 58 | zdata = 15 * np.random.random(size) 59 | xdata = np.sin(zdata) + 0.1 * np.random.randn(size) 60 | ydata = np.cos(zdata) + 0.1 * np.random.randn(size) 61 | 62 | zdata = torch.FloatTensor(zdata) 63 | xdata = torch.FloatTensor(xdata) 64 | ydata = torch.FloatTensor(ydata) 65 | 66 | self.x = torch.vstack([zdata, xdata]).T 67 | self.y = ydata 68 | self.data_splits = create_splits(self.x, self.y, self.n_splits, self.random_seed) 69 | -------------------------------------------------------------------------------- /pyrelational/datasets/regression/utils.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | import numpy as np 4 | from numpy.typing import NDArray 5 | from sklearn.model_selection import KFold 6 | from torch import Tensor 7 | 8 | 9 | def create_splits(x: Tensor, y: Tensor, n_splits: int, random_seed: int) -> List[Tuple[NDArray[np.int_], NDArray[np.int_]]]: 10 | """ 11 | Create k-fold splits for the dataset using the dataset's features and labels. 12 | """ 13 | kf = KFold(n_splits=n_splits, random_state=random_seed, shuffle=True) 14 | return list(kf.split(x.numpy(), y.numpy())) 15 | -------------------------------------------------------------------------------- /pyrelational/informativeness/__init__.py: -------------------------------------------------------------------------------- 1 | """Information acquisition strategies for active learning.""" 2 | 3 | from pyrelational.informativeness.classification_scorers import ( 4 | ClassificationBald, 5 | Entropy, 6 | LeastConfidence, 7 | MarginConfidence, 8 | RatioConfidence, 9 | ) 10 | from pyrelational.informativeness.regression_scorers import ( 11 | AverageScorer, 12 | ExpectedImprovement, 13 | RegressionBald, 14 | StandardDeviation, 15 | ThompsonSampling, 16 | UpperConfidenceBound, 17 | ) 18 | from pyrelational.informativeness.task_agnostic_scorers import RelativeDistanceScorer 19 | 20 | __all__ = [ 21 | "AverageScorer", 22 | "StandardDeviation", 23 | "ThompsonSampling", 24 | "RegressionBald", 25 | "ExpectedImprovement", 26 | "UpperConfidenceBound", 27 | "Entropy", 28 | "LeastConfidence", 29 | "MarginConfidence", 30 | "RatioConfidence", 31 | "ClassificationBald", 32 | "RelativeDistanceScorer", 33 | ] 34 | -------------------------------------------------------------------------------- /pyrelational/informativeness/decorators.py: -------------------------------------------------------------------------------- 1 | """Decorators for checking input shapes and types for scorers.""" 2 | 3 | from functools import wraps 4 | from typing import TYPE_CHECKING, Any, Callable, Optional, Union 5 | 6 | import torch 7 | from torch import Tensor 8 | from torch.distributions import Distribution 9 | 10 | if TYPE_CHECKING: 11 | from pyrelational.informativeness.abstract_scorers import ( 12 | AbstractClassificationScorer, 13 | AbstractRegressionScorer, 14 | ) 15 | 16 | 17 | def require_probabilities(func: Callable[..., Tensor]) -> Callable[..., Tensor]: 18 | """Ensure that the input tensor is a
probability distribution.""" 19 | 20 | @wraps(func) 21 | def wrapper(self: "AbstractClassificationScorer", prob_dist: Tensor) -> Tensor: 22 | """Check that the input tensor sums to 1 along the specified axis.""" 23 | assert torch.allclose( 24 | prob_dist.sum(self.axis), torch.tensor(1.0) 25 | ), "input should be probability distributions along specified axis" 26 | return func(self, prob_dist) 27 | 28 | return wrapper 29 | 30 | 31 | def check_regression_input(func: Callable[..., Tensor]) -> Callable[..., Tensor]: 32 | """Check inputs for regression scoring functions.""" 33 | 34 | @wraps(func) 35 | def wrapper( 36 | self: "AbstractRegressionScorer", x: Optional[Union[Tensor, Distribution]] = None, **kwargs: Any 37 | ) -> Tensor: 38 | """Check shapes of input tensors.""" 39 | mean = kwargs.get("mean", None) 40 | std = kwargs.get("std", None) 41 | if x is None and mean is None and std is None: 42 | raise ValueError("At least one of x, mean, or std must be provided.") 43 | 44 | if isinstance(x, Tensor): 45 | assert 2 <= x.ndim <= 3, "x input should be a 2D or 3D tensor" 46 | 47 | if isinstance(mean, Tensor): 48 | assert 1 <= mean.ndim <= 2, "mean input should be a 1D or 2D tensor" 49 | 50 | if isinstance(std, Tensor): 51 | assert 1 <= std.ndim <= 2, "std input should be a 1D or 2D tensor" 52 | 53 | return func(self, x, **kwargs) 54 | 55 | return wrapper 56 | -------------------------------------------------------------------------------- /pyrelational/model_managers/__init__.py: -------------------------------------------------------------------------------- 1 | from pyrelational.model_managers.abstract_model_manager import ModelManager 2 | from pyrelational.model_managers.ensemble_model_manager import ( 3 | EnsembleModelManager, 4 | LightningEnsembleModelManager, 5 | ) 6 | from pyrelational.model_managers.lightning_model_manager import LightningModelManager 7 | from pyrelational.model_managers.mcdropout_model_manager import ( 8 | LightningMCDropoutModelManager, 9 | MCDropoutModelManager, 10 | ) 11 | -------------------------------------------------------------------------------- /pyrelational/model_managers/model_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict 2 | 3 | import torch 4 | from lightning.pytorch.trainer.connectors.accelerator_connector import ( 5 | _AcceleratorConnector, 6 | ) 7 | 8 | 9 | def _determine_device(trainer_config: Dict[str, Any]) -> torch.device: 10 | """ 11 | Determine the torch device of the model from the arguments for the pytorch lightning trainer. 12 | 13 | :param trainer_config: configuration dictionary for a pytorch lightning Trainer 14 | :return: torch device object 15 | """ 16 | accelerator = _AcceleratorConnector( 17 | accelerator=trainer_config.get("accelerator", "cpu"), devices=trainer_config.get("devices", "auto") 18 | ) 19 | device: torch.device = accelerator.strategy.root_device 20 | return device 21 | -------------------------------------------------------------------------------- /pyrelational/oracles/__init__.py: -------------------------------------------------------------------------------- 1 | from pyrelational.oracles.abstract_oracle import Oracle 2 | from pyrelational.oracles.benchmark_oracle import BenchmarkOracle 3 | -------------------------------------------------------------------------------- /pyrelational/oracles/benchmark_oracle.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from
pyrelational.data_managers.data_manager import DataManager 4 | 5 | from .abstract_oracle import Oracle 6 | 7 | 8 | class BenchmarkOracle(Oracle): 9 | """ 10 | An oracle designed for evaluating strategies in R&D settings. 11 | It assumes that all the observations are sufficiently annotated and 12 | returns those annotations when queried. 13 | """ 14 | 15 | def __init__(self) -> None: 16 | super(BenchmarkOracle, self).__init__() 17 | 18 | def query_target_value(self, data_manager: DataManager, idx: int) -> Any: 19 | """Default method is to simply return the target already in the dataset. 20 | 21 | :param data_manager: reference to the data_manager which will load the observation if necessary 22 | :param idx: index of the observation for which we want to query an annotation 23 | 24 | :return: the output of the oracle (the target value already in the dataset) 25 | """ 26 | target_value = data_manager[idx][-1] 27 | return target_value 28 | -------------------------------------------------------------------------------- /pyrelational/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | from pyrelational.pipeline.pipeline import Pipeline 2 | -------------------------------------------------------------------------------- /pyrelational/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | import pyrelational.strategies.classification 2 | import pyrelational.strategies.regression 3 | import pyrelational.strategies.task_agnostic 4 | from pyrelational.strategies.abstract_strategy import Strategy 5 | -------------------------------------------------------------------------------- /pyrelational/strategies/classification/__init__.py: -------------------------------------------------------------------------------- 1 | from pyrelational.strategies.classification.classification_strategy import ( 2 | ClassificationStrategy, 3 | ) 4 | from pyrelational.strategies.classification.entropy_classification_strategy import ( 5 | EntropyClassificationStrategy, 6 | ) 7 | from pyrelational.strategies.classification.least_confidence_strategy import ( 8 | LeastConfidenceStrategy, 9 | ) 10 | from pyrelational.strategies.classification.marginal_confidence_strategy import ( 11 | MarginalConfidenceStrategy, 12 | ) 13 | from pyrelational.strategies.classification.ratio_confidence_strategy import ( 14 | RatioConfidenceStrategy, 15 | ) 16 | -------------------------------------------------------------------------------- /pyrelational/strategies/classification/classification_strategy.py: -------------------------------------------------------------------------------- 1 | """ClassificationStrategy class for active learning in classification tasks.""" 2 | 3 | import math 4 | from typing import Any, List 5 | 6 | import torch 7 | from torch import Tensor 8 | 9 | from pyrelational.data_managers import DataManager 10 | from pyrelational.model_managers import ModelManager 11 | from pyrelational.strategies.abstract_strategy import Strategy 12 | 13 | 14 | class ClassificationStrategy(Strategy): 15 | """A base active learning strategy class for classification.""" 16 | 17 | def __call__( 18 | self, num_annotate: int, data_manager: DataManager, model_manager: ModelManager[Any, Any] 19 | ) -> List[int]: 20 | """ 21 | Identify samples for labelling based on user-defined scoring and sampling function.
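A minimal usage sketch (``dm`` and ``mm`` below stand for assumed, pre-built data and model managers; the returned indices will vary with training):
    >>> strategy = EntropyClassificationStrategy()
    >>> strategy(num_annotate=5, data_manager=dm, model_manager=mm)  # doctest: +SKIP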
22 | 23 | :param num_annotate: number of samples to annotate 24 | :param data_manager: A pyrelational data manager 25 | which keeps track of what has been labelled and creates data loaders for 26 | active learning 27 | :param model_manager: A pyrelational model manager 28 | which wraps a user-defined ML model to handle instantiation, training, testing, 29 | as well as uncertainty quantification 30 | 31 | :return: list of indices to annotate 32 | """ 33 | output = self.train_and_infer(data_manager=data_manager, model_manager=model_manager).mean(0) 34 | if not torch.allclose(output.sum(1), torch.tensor(1.0)): 35 | output = softmax(output) 36 | uncertainty = self.scorer(output) 37 | return self.sampler(uncertainty, data_manager.u_indices, num_annotate) 38 | 39 | 40 | def softmax(scores: Tensor, base: float = math.e, axis: int = -1) -> Tensor: 41 | """Return the softmax of a tensor of scores. 42 | 43 | Converts a set of raw scores from a model (logits) into a 44 | probability distribution via softmax. 45 | 46 | The probability distribution will be a set of real numbers 47 | each in the range [0, 1] and summing to 1.0. 48 | 49 | Assumes the input is a pytorch tensor, e.g. tensor([1.0, 4.0, 2.0, 3.0]). 50 | 51 | :param scores: a pytorch tensor of any positive/negative real numbers. 52 | :param base: the base for the exponential (default e) 53 | :param axis: axis along which to apply the softmax 54 | 55 | :return: tensor of softmaxed scores 56 | """ 57 | exps = base ** scores.float() # exponential for each value in array 58 | sum_exps = torch.sum(exps, dim=axis, keepdim=True) # sum of all exponentials 59 | prob_dist: Tensor = exps / sum_exps # normalize exponentials 60 | return prob_dist 61 | -------------------------------------------------------------------------------- /pyrelational/strategies/classification/entropy_classification_strategy.py: -------------------------------------------------------------------------------- 1 | """Active learning using entropy-based confidence uncertainty measure. 2 | 3 | The score is computed over the classes of the posterior predictive distribution to 4 | choose which observations to propose to the oracle. 5 | """ 6 | 7 | from pyrelational.batch_mode_samplers import TopKSampler 8 | from pyrelational.informativeness import Entropy 9 | from pyrelational.strategies.classification.classification_strategy import ( 10 | ClassificationStrategy, 11 | ) 12 | 13 | 14 | class EntropyClassificationStrategy(ClassificationStrategy): 15 | """Implements Entropy Classification Strategy.""" 16 | 17 | def __init__(self, axis: int = -1): 18 | """Initialise the strategy with entropy scorer and deterministic sampler.""" 19 | super().__init__(Entropy(axis=axis), TopKSampler()) 20 | -------------------------------------------------------------------------------- /pyrelational/strategies/classification/least_confidence_strategy.py: -------------------------------------------------------------------------------- 1 | """Active learning using least confidence uncertainty measure.""" 2 | 3 | from pyrelational.batch_mode_samplers import TopKSampler 4 | from pyrelational.informativeness import LeastConfidence 5 | from pyrelational.strategies.classification.classification_strategy import ( 6 | ClassificationStrategy, 7 | ) 8 | 9 | 10 | class LeastConfidenceStrategy(ClassificationStrategy): 11 | """Implements Least Confidence Strategy. 12 | 13 | Unlabelled samples are scored and queried with the least-confidence classification scorer.
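Example (illustrative; ``dm`` and ``mm`` are assumed, pre-built data and model managers):
    >>> strategy = LeastConfidenceStrategy()
    >>> strategy.suggest(num_annotate=10, data_manager=dm, model_manager=mm)  # doctest: +SKIP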
14 | """ 15 | 16 | def __init__(self, axis: int = -1): 17 | """Initialize the strategy with the least confidence scorer and a deterministic scorer for classification.""" 18 | super().__init__(LeastConfidence(axis=axis), TopKSampler()) 19 | -------------------------------------------------------------------------------- /pyrelational/strategies/classification/marginal_confidence_strategy.py: -------------------------------------------------------------------------------- 1 | """Active learning using marginal confidence uncertainty measure.""" 2 | 3 | from pyrelational.batch_mode_samplers import TopKSampler 4 | from pyrelational.informativeness import MarginConfidence 5 | from pyrelational.strategies.classification.classification_strategy import ( 6 | ClassificationStrategy, 7 | ) 8 | 9 | 10 | class MarginalConfidenceStrategy(ClassificationStrategy): 11 | """Implements Marginal Confidence Strategy. 12 | 13 | Unlabelled samples are scored and queried based on the marginal confidence for classification scorer. 14 | """ 15 | 16 | def __init__(self, axis: int = -1): 17 | """Initialize the strategy with the marginal confidence scorer and a deterministic scorer for classification.""" 18 | super().__init__(MarginConfidence(axis=axis), TopKSampler()) 19 | -------------------------------------------------------------------------------- /pyrelational/strategies/classification/ratio_confidence_strategy.py: -------------------------------------------------------------------------------- 1 | """Active learning using ratio based confidence uncertainty measure.""" 2 | 3 | from pyrelational.batch_mode_samplers import TopKSampler 4 | from pyrelational.informativeness import RatioConfidence 5 | from pyrelational.strategies.classification.classification_strategy import ( 6 | ClassificationStrategy, 7 | ) 8 | 9 | 10 | class RatioConfidenceStrategy(ClassificationStrategy): 11 | """Implements Ratio Confidence Strategy. 12 | 13 | Unlabelled samples are scored and queried based on the ratio confidence for classification scorer. 
14 | """ 15 | 16 | def __init__(self, axis: int = -1): 17 | """Initialize the strategy with the ratio confidence scorer and a deterministic scorer for classification.""" 18 | super().__init__(RatioConfidence(axis=axis), TopKSampler()) 19 | -------------------------------------------------------------------------------- /pyrelational/strategies/regression/__init__.py: -------------------------------------------------------------------------------- 1 | """Regression strategies for active learning.""" 2 | 3 | from pyrelational.strategies.regression.bald_strategy import ( 4 | BALDStrategy, 5 | SoftBALDStrategy, 6 | ) 7 | from pyrelational.strategies.regression.expected_improvement_strategy import ( 8 | ExpectedImprovementStrategy, 9 | ) 10 | from pyrelational.strategies.regression.greedy_strategy import GreedyStrategy 11 | from pyrelational.strategies.regression.regression_strategy import RegressionStrategy 12 | from pyrelational.strategies.regression.thompson_sampling_strategy import ( 13 | ThompsonSamplingStrategy, 14 | ) 15 | from pyrelational.strategies.regression.upper_confidence_bound_strategy import ( 16 | UpperConfidenceBoundStrategy, 17 | ) 18 | from pyrelational.strategies.regression.variance_reduction_strategy import ( 19 | VarianceReductionStrategy, 20 | ) 21 | -------------------------------------------------------------------------------- /pyrelational/strategies/regression/bald_strategy.py: -------------------------------------------------------------------------------- 1 | """BALD Strategy for regression tasks.""" 2 | 3 | from pyrelational.batch_mode_samplers import ProbabilisticSampler, TopKSampler 4 | from pyrelational.informativeness import RegressionBald 5 | from pyrelational.strategies.regression.regression_strategy import RegressionStrategy 6 | 7 | 8 | class BALDStrategy(RegressionStrategy): 9 | """Implements BALD Strategy. 10 | 11 | Samples are queried based on mutual information score based on multiple estimator models. 12 | """ 13 | 14 | def __init__(self, axis: int = 0): 15 | """Initialise the strategy with bald scorer and deterministic sampler.""" 16 | super().__init__(RegressionBald(axis=axis), TopKSampler()) 17 | 18 | 19 | class SoftBALDStrategy(RegressionStrategy): 20 | """Implements soft BALD Strategy. 21 | 22 | Unlabelled samples are queried based on mutual information score based on 23 | multiple estimator models. In contrast to Bald the query is drawn from unlabelled pool based on probabilities 24 | derived from bald scores instead of using an argmax operation. 25 | """ 26 | 27 | def __init__(self, axis: int = 0): 28 | """Initialise the strategy with bald scorer and probabilistic sampler.""" 29 | super().__init__(RegressionBald(axis=axis), ProbabilisticSampler()) 30 | -------------------------------------------------------------------------------- /pyrelational/strategies/regression/expected_improvement_strategy.py: -------------------------------------------------------------------------------- 1 | """Implement Expected Improvement Strategy for regression tasks.""" 2 | 3 | from typing import Any, List 4 | 5 | import torch 6 | 7 | from pyrelational.batch_mode_samplers import TopKSampler 8 | from pyrelational.data_managers import DataManager 9 | from pyrelational.informativeness import ExpectedImprovement 10 | from pyrelational.model_managers import ModelManager 11 | from pyrelational.strategies.abstract_strategy import Strategy 12 | 13 | 14 | class ExpectedImprovementStrategy(Strategy): 15 | """Implement Expected Improvement Strategy. 
16 | 17 | Unlabelled samples are scored with the expected improvement scoring function. 18 | """ 19 | 20 | scorer: ExpectedImprovement 21 | 22 | def __init__(self, xi: float = 0.01, axis: int = 0) -> None: 23 | """Initialize the strategy with the expected improvement scorer and a deterministic sampler for regression.""" 24 | super().__init__(ExpectedImprovement(xi=xi, axis=axis), TopKSampler()) 25 | 26 | def __call__( 27 | self, num_annotate: int, data_manager: DataManager, model_manager: ModelManager[Any, Any] 28 | ) -> List[int]: 29 | """ 30 | Identify samples which need to be labelled. 31 | 32 | :param num_annotate: number of samples to annotate 33 | :param data_manager: A pyrelational data manager 34 | which keeps track of what has been labelled and creates data loaders for 35 | active learning 36 | :param model_manager: A pyrelational model manager 37 | which wraps a user-defined ML model to handle instantiation, training, testing, 38 | as well as uncertainty quantification 39 | 40 | :return: list of indices to annotate 41 | """ 42 | output = self.train_and_infer(data_manager=data_manager, model_manager=model_manager) 43 | max_label = torch.max(data_manager.get_sample_labels(data_manager.l_indices)) 44 | uncertainty = self.scorer(output, max_label=max_label) 45 | return self.sampler(uncertainty, data_manager.u_indices, num_annotate) 46 | -------------------------------------------------------------------------------- /pyrelational/strategies/regression/greedy_strategy.py: -------------------------------------------------------------------------------- 1 | """Greedy Strategy Module.""" 2 | 3 | from pyrelational.batch_mode_samplers import TopKSampler 4 | from pyrelational.informativeness import AverageScorer 5 | from pyrelational.strategies.regression.regression_strategy import RegressionStrategy 6 | 7 | 8 | class GreedyStrategy(RegressionStrategy): 9 | """Implements Greedy Strategy. 10 | 11 | Unlabelled samples are queried based on their predicted mean value by the model, 12 | i.e. samples with the highest predicted mean values are queried. 13 | """ 14 | 15 | def __init__(self, axis: int = 0): 16 | """Initialize the strategy with the mean prediction scorer and a deterministic sampler for regression.""" 17 | super().__init__(AverageScorer(axis=axis), TopKSampler()) 18 | -------------------------------------------------------------------------------- /pyrelational/strategies/regression/regression_strategy.py: -------------------------------------------------------------------------------- 1 | """Regression strategy class implementing __call__ logic.""" 2 | 3 | from typing import Any, List 4 | 5 | from pyrelational.data_managers import DataManager 6 | from pyrelational.model_managers import ModelManager 7 | from pyrelational.strategies.abstract_strategy import Strategy 8 | 9 | 10 | class RegressionStrategy(Strategy): 11 | """A base active learning strategy class for regression.""" 12 | 13 | def __call__( 14 | self, num_annotate: int, data_manager: DataManager, model_manager: ModelManager[Any, Any] 15 | ) -> List[int]: 16 | """ 17 | Identify samples for labelling based on user-defined scoring and sampling function.
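A minimal usage sketch (``dm`` and ``mm`` are assumed, pre-built data and model managers):
    >>> strategy = GreedyStrategy()
    >>> strategy(num_annotate=5, data_manager=dm, model_manager=mm)  # doctest: +SKIP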
18 | 19 | :param num_annotate: number of samples to annotate 20 | :param data_manager: A pyrelational data manager 21 | which keeps track of what has been labelled and creates data loaders for 22 | active learning 23 | :param model_manager: A pyrelational model manager 24 | which wraps a user-defined ML model to handle instantiation, training, testing, 25 | as well as uncertainty quantification 26 | 27 | :return: list of indices to annotate 28 | """ 29 | output = self.train_and_infer(data_manager=data_manager, model_manager=model_manager) 30 | scores = self.scorer(output) 31 | if scores.shape[0] != 1: 32 | scores = scores.squeeze(-1) 33 | return self.sampler(scores, data_manager.u_indices, num_annotate) 34 | -------------------------------------------------------------------------------- /pyrelational/strategies/regression/thompson_sampling_strategy.py: -------------------------------------------------------------------------------- 1 | """Thompson Sampling Strategy for Regression.""" 2 | 3 | from pyrelational.batch_mode_samplers import TopKSampler 4 | from pyrelational.informativeness import ThompsonSampling 5 | from pyrelational.strategies.regression.regression_strategy import RegressionStrategy 6 | 7 | 8 | class ThompsonSamplingStrategy(RegressionStrategy): 9 | """Implements Thompson Sampling Strategy. 10 | 11 | Unlabelled samples are scored and queried with the Thompson sampling scorer. 12 | """ 13 | 14 | def __init__(self, axis: int = 0): 15 | """Initialize the strategy with the Thompson sampling scorer and a deterministic sampler for regression.""" 16 | super().__init__(ThompsonSampling(axis=axis), TopKSampler()) 17 | -------------------------------------------------------------------------------- /pyrelational/strategies/regression/upper_confidence_bound_strategy.py: -------------------------------------------------------------------------------- 1 | """Upper Confidence Bound Strategy.""" 2 | 3 | from pyrelational.batch_mode_samplers import TopKSampler 4 | from pyrelational.informativeness import UpperConfidenceBound 5 | from pyrelational.strategies.regression.regression_strategy import RegressionStrategy 6 | 7 | 8 | class UpperConfidenceBoundStrategy(RegressionStrategy): 9 | """Implements Upper Confidence Bound Strategy. 10 | 11 | Unlabelled samples are scored and queried based on the UCB scorer. 12 | """ 13 | 14 | def __init__(self, kappa: float = 1.0, axis: int = 0): 15 | """Initialize the strategy with the UCB scorer and a deterministic sampler for regression. 16 | 17 | :param kappa: trade-off parameter between exploitation and exploration 18 | """ 19 | super().__init__(UpperConfidenceBound(kappa=kappa, axis=axis), TopKSampler()) 20 | -------------------------------------------------------------------------------- /pyrelational/strategies/regression/variance_reduction_strategy.py: -------------------------------------------------------------------------------- 1 | """Variance Reduction Strategy for regression tasks.""" 2 | 3 | from pyrelational.batch_mode_samplers import TopKSampler 4 | from pyrelational.informativeness import StandardDeviation 5 | from pyrelational.strategies.regression.regression_strategy import RegressionStrategy 6 | 7 | 8 | class VarianceReductionStrategy(RegressionStrategy): 9 | """Implements Variance Reduction Strategy. 10 | 11 | Unlabelled samples are queried based on their predicted variance by the model.
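Example (illustrative; the random tensor stands in for stacked stochastic predictions of shape (n_estimators, n_samples, 1)):
    >>> scorer = StandardDeviation(axis=0)
    >>> scores = scorer(torch.randn(10, 100, 1))  # per-sample spread over the 10 estimators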
12 | """ 13 | 14 | def __init__(self, axis: int = 0): 15 | """Initialize the strategy with the least confidence scorer and a deterministic scorer for regression.""" 16 | super().__init__(StandardDeviation(axis=axis), TopKSampler()) 17 | -------------------------------------------------------------------------------- /pyrelational/strategies/task_agnostic/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from pyrelational.strategies.task_agnostic.random_acquisition_strategy import ( 3 | RandomAcquisitionStrategy, 4 | ) 5 | from pyrelational.strategies.task_agnostic.relative_distance_strategy import ( 6 | RelativeDistanceStrategy, 7 | ) 8 | from pyrelational.strategies.task_agnostic.representative_sampling_strategy import ( 9 | RepresentativeSamplingStrategy, 10 | ) 11 | -------------------------------------------------------------------------------- /pyrelational/strategies/task_agnostic/random_acquisition_strategy.py: -------------------------------------------------------------------------------- 1 | """Defines and implements a random acquisition active learning strategy.""" 2 | 3 | from typing import List 4 | 5 | import numpy as np 6 | 7 | from pyrelational.data_managers import DataManager 8 | from pyrelational.strategies.abstract_strategy import Strategy 9 | 10 | 11 | class RandomAcquisitionStrategy(Strategy): 12 | """Implements RandomAcquisition whereby random samples from unlabelled set are chosen at each step.""" 13 | 14 | def __init__(self) -> None: 15 | """Override init method to do nothing. This strategy does not require any initialization.""" 16 | pass 17 | 18 | def __call__(self, num_annotate: int, data_manager: DataManager) -> List[int]: 19 | """ 20 | Identify samples for labelling based on random sampling. 21 | 22 | :param num_annotate: number of samples to annotate 23 | :param data_manager: A pyrelational data manager 24 | which keeps track of what has been labelled and creates data loaders for 25 | active learning 26 | 27 | :return: list of indices to annotate 28 | """ 29 | num_annotate = min(num_annotate, len(data_manager.u_indices)) 30 | ret: List[int] = np.random.choice(data_manager.u_indices, size=num_annotate, replace=False).tolist() 31 | return ret 32 | -------------------------------------------------------------------------------- /pyrelational/strategies/task_agnostic/relative_distance_strategy.py: -------------------------------------------------------------------------------- 1 | """Relative distance based active learning strategy.""" 2 | 3 | from typing import List 4 | 5 | from pyrelational.batch_mode_samplers import TopKSampler 6 | from pyrelational.data_managers import DataManager 7 | from pyrelational.informativeness import RelativeDistanceScorer 8 | from pyrelational.strategies.abstract_strategy import Strategy 9 | 10 | 11 | class RelativeDistanceStrategy(Strategy): 12 | """Diversity sampling based active learning strategy.""" 13 | 14 | scorer: RelativeDistanceScorer 15 | 16 | def __init__(self, metric: str = "euclidean"): 17 | """Initialise the strategy with a distance metric. 18 | 19 | :param metric: Name of distance metric to use. This should be supported by scikit-learn 20 | pairwise_distances function. 21 | """ 22 | self.metric = metric 23 | super().__init__(RelativeDistanceScorer(metric=metric), TopKSampler()) 24 | 25 | def __call__(self, num_annotate: int, data_manager: DataManager) -> List[int]: 26 | """Identify samples which need to be labelled. 
27 | 28 | :param num_annotate: number of samples to annotate 29 | :param data_manager: A pyrelational data manager 30 | which keeps track of what has been labelled and creates data loaders for 31 | active learning 32 | 33 | :return: list of indices to annotate 34 | """ 35 | scores = self.scorer(data_manager.get_unlabelled_loader(), data_manager.get_labelled_loader()) 36 | return self.sampler(scores, data_manager.u_indices, num_annotate) 37 | -------------------------------------------------------------------------------- /pyrelational/types.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from typing import Sized, TypeVar 3 | 4 | from torch.utils.data import Dataset 5 | 6 | T = TypeVar("T") 7 | 8 | 9 | class SizedDataset(Dataset[T], Sized, ABC): ... 10 | -------------------------------------------------------------------------------- /pyrelational/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.0.2" 2 | -------------------------------------------------------------------------------- /requirements/base_requirements.txt: -------------------------------------------------------------------------------- 1 | lightning>=1.8.6, <2.5.0 2 | numpy<2.0.0 3 | openpyxl>=3.0.9 4 | pandas>=1.3.0, <2.2.3 5 | pyreadr>=0.4.4 6 | requests==2.32.3 7 | scikit-learn>=1.5.1, <1.5.2 8 | tabulate>=0.7.0, <=0.9.0 9 | torch>=1.9.0, <2.3.0 10 | torchvision>=0.10.0, <0.18.0 11 | xlrd>=2.0.1 12 | rdkit==2023.9.5 13 | -------------------------------------------------------------------------------- /requirements/dev_requirements.txt: -------------------------------------------------------------------------------- 1 | black==24.4.2 2 | flake8==7.1.1 3 | flake8-bugbear==24.4.26 4 | isort==5.13.2 5 | mypy==1.11.2 6 | parameterized==0.9.0 7 | pre-commit==3.8.0 8 | protobuf>=3.19.0 9 | pytest==8.2.2 10 | pytest-cov 11 | setuptools>=59.5.0 12 | -r base_requirements.txt 13 | -------------------------------------------------------------------------------- /requirements/doc_requirements.txt: -------------------------------------------------------------------------------- 1 | -r dev_requirements.txt 2 | sphinx 3 | sphinx_rtd_theme -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | from setuptools import find_packages, setup 4 | 5 | """ 6 | pip install -e . 
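For a development install (illustrative; the pinned tool versions live in requirements/dev_requirements.txt):

    pip install -r requirements/dev_requirements.txt
    pip install -e .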
7 | """ 8 | 9 | setup_requires = ["pytest-runner"] 10 | tests_require = ["pytest", "pytest-cov", "mock"] 11 | 12 | with open("requirements/base_requirements.txt", "r") as req: 13 | install_requires = [line.strip() for line in req if line.strip()] 14 | 15 | with open("README.md", "r", encoding="utf-8") as fh: 16 | long_description = fh.read() 17 | 18 | version: Dict[str, str] = {} 19 | with open("pyrelational/version.py") as fp: 20 | exec(fp.read(), version) 21 | 22 | setup( 23 | name="pyrelational", 24 | description="Python tool box for quickly implementing active learning strategies", 25 | author="Relation Therapeutics", 26 | author_email="software@relationrx.com", 27 | long_description=long_description, 28 | long_description_content_type="text/markdown", 29 | url="https://github.com/RelationRx/pyrelational", 30 | packages=find_packages(), 31 | version=version["__version__"], 32 | setup_requires=setup_requires, 33 | tests_require=tests_require, 34 | install_requires=install_requires, 35 | python_requires=">=3.9", 36 | ) 37 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RelationRx/pyrelational/fb83cff64241344396617d3b7712f69c852133b4/tests/__init__.py -------------------------------------------------------------------------------- /tests/data_managers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RelationRx/pyrelational/fb83cff64241344396617d3b7712f69c852133b4/tests/data_managers/__init__.py -------------------------------------------------------------------------------- /tests/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RelationRx/pyrelational/fb83cff64241344396617d3b7712f69c852133b4/tests/datasets/__init__.py -------------------------------------------------------------------------------- /tests/datasets/test_benchmark_datamanager.py: -------------------------------------------------------------------------------- 1 | """Unit tests for benchmark datamanager functions 2 | """ 3 | 4 | from unittest import TestCase 5 | 6 | from pyrelational.datasets import BreastCancerDataset, DiabetesDataset 7 | from pyrelational.datasets.benchmark_datamanager import ( 8 | create_classification_cold_start, 9 | create_regression_cold_start, 10 | create_warm_start, 11 | ) 12 | 13 | 14 | class TestBenchmarkDataManager(TestCase): 15 | """Class containing unit tests for benchmark datamanager creation.""" 16 | 17 | def test_create_warm_start_classification(self) -> None: 18 | """Check shape correctness of dataset.""" 19 | dataset = BreastCancerDataset() 20 | train_indices = list(dataset.data_splits[0][0]) 21 | test_indices = list(dataset.data_splits[0][1]) 22 | dm = create_warm_start(dataset, train_indices=train_indices, test_indices=test_indices) 23 | self.assertEqual(len(dm), 569) 24 | 25 | def test_create_warm_start_regression(self) -> None: 26 | """Check shape correctness of dataset.""" 27 | dataset = DiabetesDataset() 28 | train_indices = list(dataset.data_splits[0][0]) 29 | test_indices = list(dataset.data_splits[0][1]) 30 | dm = create_warm_start(dataset, train_indices=train_indices, test_indices=test_indices) 31 | self.assertEqual(len(dm), 442) 32 | 33 | def test_create_classification_cold_start(self) -> None: 34 | """Check shape correctness of dataset.""" 35 | 
dataset = BreastCancerDataset() 36 | train_indices = list(dataset.data_splits[0][0]) 37 | test_indices = list(dataset.data_splits[0][1]) 38 | dm = create_classification_cold_start(dataset, train_indices=train_indices, test_indices=test_indices) 39 | self.assertEqual(len(dm), 569) 40 | self.assertEqual(len(dm.l_indices), 2) 41 | 42 | def test_create_regression_cold_start(self) -> None: 43 | """Check shape correctness of dataset.""" 44 | dataset = DiabetesDataset() 45 | train_indices = list(dataset.data_splits[0][0]) 46 | test_indices = list(dataset.data_splits[0][1]) 47 | dm = create_regression_cold_start(dataset, train_indices=train_indices, test_indices=test_indices) 48 | self.assertEqual(len(dm), 442) 49 | self.assertEqual(len(dm.l_indices), 2) 50 | -------------------------------------------------------------------------------- /tests/datasets/test_uci_datasets.py: -------------------------------------------------------------------------------- 1 | """Unit tests for uci dataset downloader 2 | """ 3 | 4 | import os 5 | import shutil 6 | from unittest import TestCase 7 | 8 | from parameterized import parameterized_class 9 | 10 | from pyrelational.datasets import UCIDatasets 11 | 12 | 13 | @parameterized_class([{"data_name": k} for k in UCIDatasets.datasets.keys()]) 14 | class TestUCIBenchmarkDatasets(TestCase): 15 | """Class containing unit tests on UCI benchmark datasets.""" 16 | 17 | def setUp(self) -> None: 18 | """Set up class.""" 19 | self.dataset = UCIDatasets(self.data_name, data_dir="test_data/", n_splits=10) 20 | 21 | def test_number_splits(self) -> None: 22 | """Check number of splits.""" 23 | dataset = UCIDatasets("glass", data_dir="test_data/", n_splits=10) 24 | self.assertEqual(dataset.n_splits, 10) 25 | self.assertEqual(len(dataset.data_splits), 10) 26 | 27 | def test_split_size(self): 28 | """Check size of train and test splits.""" 29 | split = self.dataset.get_split(train=True) 30 | self.assertEqual(len(split), len(self.dataset.data_splits[0][0])) 31 | 32 | split = self.dataset.get_split(train=False) 33 | self.assertEqual(len(split), len(self.dataset.data_splits[0][1])) 34 | 35 | def test_full_split_length(self) -> None: 36 | """Check full split length.""" 37 | split = self.dataset.get_full_split() 38 | self.assertEqual(len(split), len(self.dataset.data)) 39 | 40 | def test_get_simple_data(self) -> None: 41 | """Check size of returned simple dataset.""" 42 | x, y = self.dataset.get_data() 43 | self.assertEqual(len(x), len(self.dataset.data)) 44 | self.assertEqual(len(y), len(self.dataset.data)) 45 | 46 | def tearDown(self) -> None: 47 | """Tear down class.""" 48 | if os.path.exists("test_data/"): 49 | shutil.rmtree("test_data") 50 | -------------------------------------------------------------------------------- /tests/informativeness/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RelationRx/pyrelational/fb83cff64241344396617d3b7712f69c852133b4/tests/informativeness/__init__.py -------------------------------------------------------------------------------- /tests/model_managers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RelationRx/pyrelational/fb83cff64241344396617d3b7712f69c852133b4/tests/model_managers/__init__.py -------------------------------------------------------------------------------- /tests/model_managers/test_ensemble_model_manager.py: 
-------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | import pytest 4 | import torch 5 | 6 | from pyrelational.model_managers.ensemble_model_manager import ( 7 | LightningEnsembleModelManager, 8 | ) 9 | from tests.test_utils import BreastCancerClassifier, get_classification_dataset 10 | 11 | 12 | class TestEnsembleEstimator(TestCase): 13 | """Class containing unit tests for ensemble pyrelational model.""" 14 | 15 | def setUp(self) -> None: 16 | """Set up shared attributes""" 17 | self.num_estimators = 4 18 | self.model = LightningEnsembleModelManager( 19 | BreastCancerClassifier, {}, {"epochs": 1}, n_estimators=self.num_estimators 20 | ) 21 | self.dataset = get_classification_dataset() 22 | self.train_loader = self.dataset.get_train_loader() 23 | self.val_loader = self.dataset.get_validation_loader() 24 | 25 | def test_instantiation(self) -> None: 26 | """Check attributes at instantiation.""" 27 | self.assertEqual(self.model.__class__.__name__, "LightningEnsembleModelManager") 28 | self.assertIsNone(self.model._current_model) 29 | self.assertIsInstance(self.model.trainer_config, dict) 30 | self.assertIsInstance(self.model.model_config, dict) 31 | 32 | def test_fail_on_test_without_train(self) -> None: 33 | """Check error is raised when testing without training first.""" 34 | with pytest.raises(ValueError) as err: 35 | self.model.test(self.val_loader) 36 | self.assertEqual( 37 | str(err.value), "No current model, call 'train(train_loader, valid_loader)' to train the model first" 38 | ) 39 | 40 | def test_prediction(self) -> None: 41 | """Check dimension match with number of estimators or dataset size.""" 42 | self.model.train(self.train_loader) 43 | self.assertEqual(len(self.model._current_model), self.num_estimators) 44 | 45 | prediction = self.model(self.val_loader) 46 | self.assertEqual(prediction.size(0), self.num_estimators) 47 | self.assertEqual(prediction.size(1), len(self.dataset.validation_indices)) 48 | self.assertIsInstance(prediction, torch.Tensor) 49 | self.assertIsInstance(self.model.test(self.val_loader), dict) 50 | -------------------------------------------------------------------------------- /tests/model_managers/test_model_managers.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | from unittest import TestCase 3 | 4 | import torch 5 | from torch.utils.data import DataLoader 6 | 7 | from pyrelational.model_managers import ( 8 | LightningEnsembleModelManager, 9 | LightningMCDropoutModelManager, 10 | LightningModelManager, 11 | ) 12 | from tests.test_utils import DiabetesDataset, DiabetesRegressionModel 13 | 14 | 15 | class TestModelManager(TestCase): 16 | """Class containing unit tests for pyrelational models.""" 17 | 18 | def test_lightning_model(self) -> None: 19 | """ 20 | Check that 21 | 1) model is stored after training 22 | 2) output of test loop is a dictionary 23 | 3) shape of tensor output of __call__ 24 | """ 25 | train_loader, valid_loader, test_loader = get_loaders() 26 | model = LightningModelManager(DiabetesRegressionModel, {}, {"epochs": 3}) 27 | model.train(train_loader, valid_loader) 28 | self.assertIsNotNone(model._current_model) 29 | self.assertIsInstance(model.test(test_loader), dict) 30 | self.assertEqual(model(test_loader).size(0), len(test_loader.dataset)) 31 | 32 | def test_early_stopping_in_trainer_callbacks(self) -> None: 33 | """Check that EarlyStopping is one of the callbacks in a pyrelational 
LightningModelManager.""" 34 | model = LightningModelManager( 35 | DiabetesRegressionModel, {}, {"epochs": 3, "use_early_stopping": True, "patience": 10} 36 | ) 37 | trainer, _ = model.init_trainer() 38 | self.assertTrue(any(["EarlyStopping" in str(cb) for cb in trainer.callbacks])) 39 | 40 | 41 | def get_loaders() -> Tuple[DataLoader, DataLoader, DataLoader]: 42 | """Create train/val/test dataloaders from sklearn diabetes dataset.""" 43 | ds = DiabetesDataset() 44 | train_ds, valid_ds, test_ds = torch.utils.data.random_split(ds, [350, 50, 42]) 45 | 46 | train_loader = DataLoader(train_ds, batch_size=10) 47 | valid_loader = DataLoader(valid_ds, batch_size=10) 48 | test_loader = DataLoader(test_ds, batch_size=10) 49 | return train_loader, valid_loader, test_loader 50 | -------------------------------------------------------------------------------- /tests/oracles/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RelationRx/pyrelational/fb83cff64241344396617d3b7712f69c852133b4/tests/oracles/__init__.py -------------------------------------------------------------------------------- /tests/oracles/test_oracles.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from pyrelational.oracles.benchmark_oracle import BenchmarkOracle 4 | from tests.test_utils import get_classification_dataset 5 | 6 | 7 | class TestOracle(TestCase): 8 | """Class containing unit tests for oracles.""" 9 | 10 | def setUp(self) -> None: 11 | """Set up datamanager.""" 12 | self.datamanager = get_classification_dataset() 13 | 14 | def test_update_annotations(self) -> None: 15 | """Check update_annotations method updates unlabelled and labelled sets.""" 16 | random_u_sindex = self.datamanager.u_indices[0] 17 | len_dm_l = len(self.datamanager.l_indices) 18 | len_dm_u = len(self.datamanager.u_indices) 19 | 20 | BenchmarkOracle.update_annotations(self.datamanager, [random_u_sindex]) 21 | self.assertIn(random_u_sindex, self.datamanager.l_indices) 22 | self.assertGreater(len(self.datamanager.l_indices), len_dm_l) 23 | self.assertGreater(len_dm_u, len(self.datamanager.u_indices)) 24 | 25 | def test_query_target_value(self) -> None: 26 | """Check that query_target_value of the benchmark oracle returns correct values.""" 27 | oracle = BenchmarkOracle() 28 | value = oracle.query_target_value(self.datamanager, 0) 29 | self.assertEqual(value, self.datamanager[0][-1]) 30 | 31 | def test_update_target_value(self) -> None: 32 | """Check update_target_value method updates dataset correctly.""" 33 | BenchmarkOracle.update_target_value(self.datamanager, 0, 42) 34 | self.assertEqual(self.datamanager[0][-1], 42) 35 | 36 | def test_update_target_values(self) -> None: 37 | """Test that the update_target_values method changes all values in the dataset.""" 38 | ixs, vals = [0, 1, 2], [42, 42, 42] 39 | BenchmarkOracle.update_target_values(self.datamanager, ixs, vals) 40 | self.assertEqual([self.datamanager[i][-1] for i in ixs], vals) 41 | -------------------------------------------------------------------------------- /tests/pipeline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RelationRx/pyrelational/fb83cff64241344396617d3b7712f69c852133b4/tests/pipeline/__init__.py -------------------------------------------------------------------------------- /tests/samplers/test_samplers.py:
-------------------------------------------------------------------------------- 1 | """Tests for the samplers module.""" 2 | from unittest import TestCase 3 | 4 | import torch 5 | 6 | from pyrelational.batch_mode_samplers import ProbabilisticSampler, TopKSampler 7 | 8 | 9 | class TestSamplers(TestCase): 10 | """Collection of tests for samplers.""" 11 | 12 | def test_deterministic_sampler(self) -> None: 13 | """Test deterministic sampler.""" 14 | sampler = TopKSampler() 15 | query = sampler(torch.tensor([0.1, 3.0, 2.1]), [1, 2, 3], 1) 16 | self.assertEqual(len(query), 1) 17 | self.assertEqual(query, [2]) 18 | 19 | def test_probabilistic_sampler(self) -> None: 20 | """Test probabilistic sampler.""" 21 | sampler = ProbabilisticSampler() 22 | query = sampler(torch.tensor([0.1, 0.2, 0.7]), [1, 2, 3], 1) 23 | self.assertEqual(len(query), 1) 24 | -------------------------------------------------------------------------------- /tests/strategies/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RelationRx/pyrelational/fb83cff64241344396617d3b7712f69c852133b4/tests/strategies/__init__.py -------------------------------------------------------------------------------- /tests/strategies/_agnostic_strategy_test_cases.py: -------------------------------------------------------------------------------- 1 | from sklearn.cluster import AgglomerativeClustering 2 | 3 | from pyrelational.strategies.task_agnostic import ( 4 | RandomAcquisitionStrategy, 5 | RelativeDistanceStrategy, 6 | RepresentativeSamplingStrategy, 7 | ) 8 | 9 | TASK_AGNOSTIC_TEST_CASES = [ 10 | {"task_type": "regression", "strategy_class": RandomAcquisitionStrategy, "strategy_kwargs": {}}, 11 | {"task_type": "regression", "strategy_class": RelativeDistanceStrategy, "strategy_kwargs": {}}, 12 | { 13 | "task_type": "regression", 14 | "strategy_class": RelativeDistanceStrategy, 15 | "strategy_kwargs": {"metric": "cosine"}, 16 | }, 17 | { 18 | "task_type": "regression", 19 | "strategy_class": RepresentativeSamplingStrategy, 20 | "strategy_kwargs": {"clustering_method": "AffinityPropagation"}, 21 | }, 22 | { 23 | "task_type": "regression", 24 | "strategy_class": RepresentativeSamplingStrategy, 25 | "strategy_kwargs": {"clustering_method": AgglomerativeClustering(n_clusters=10)}, 26 | }, 27 | ] 28 | -------------------------------------------------------------------------------- /tests/strategies/_classification_strategy_test_cases.py: -------------------------------------------------------------------------------- 1 | from pyrelational.strategies.classification import ( 2 | EntropyClassificationStrategy, 3 | LeastConfidenceStrategy, 4 | MarginalConfidenceStrategy, 5 | RatioConfidenceStrategy, 6 | ) 7 | 8 | CLASSIFICATION_TEST_CASES = [ 9 | { 10 | "task_type": "classification", 11 | "strategy_class": EntropyClassificationStrategy, 12 | "strategy_kwargs": {}, 13 | }, 14 | { 15 | "task_type": "classification", 16 | "strategy_class": LeastConfidenceStrategy, 17 | "strategy_kwargs": {}, 18 | }, 19 | { 20 | "task_type": "classification", 21 | "strategy_class": MarginalConfidenceStrategy, 22 | "strategy_kwargs": {}, 23 | }, 24 | { 25 | "task_type": "classification", 26 | "strategy_class": RatioConfidenceStrategy, 27 | "strategy_kwargs": {}, 28 | }, 29 | ] 30 | -------------------------------------------------------------------------------- /tests/strategies/_regression_strategy_test_cases.py: --------------------------------------------------------------------------------
1 | from pyrelational.strategies.regression import ( 2 | BALDStrategy, 3 | ExpectedImprovementStrategy, 4 | GreedyStrategy, 5 | SoftBALDStrategy, 6 | ThompsonSamplingStrategy, 7 | UpperConfidenceBoundStrategy, 8 | VarianceReductionStrategy, 9 | ) 10 | 11 | REGRESSION_TEST_CASES = [ 12 | {"task_type": "regression", "strategy_class": BALDStrategy, "strategy_kwargs": {}}, 13 | {"task_type": "regression", "strategy_class": VarianceReductionStrategy, "strategy_kwargs": {}}, 14 | { 15 | "task_type": "regression", 16 | "strategy_class": ExpectedImprovementStrategy, 17 | "strategy_kwargs": {}, 18 | }, 19 | { 20 | "task_type": "regression", 21 | "strategy_class": GreedyStrategy, 22 | "strategy_kwargs": {}, 23 | }, 24 | { 25 | "task_type": "regression", 26 | "strategy_class": SoftBALDStrategy, 27 | "strategy_kwargs": {}, 28 | }, 29 | { 30 | "task_type": "regression", 31 | "strategy_class": ThompsonSamplingStrategy, 32 | "strategy_kwargs": {}, 33 | }, 34 | { 35 | "task_type": "regression", 36 | "strategy_class": UpperConfidenceBoundStrategy, 37 | "strategy_kwargs": {"kappa": 0.42}, 38 | }, 39 | ] 40 | -------------------------------------------------------------------------------- /tests/strategies/test_strategies.py: -------------------------------------------------------------------------------- 1 | """Unit tests for strategies.""" 2 | from typing import Any, Dict, Type 3 | from unittest import TestCase 4 | 5 | from parameterized import parameterized_class 6 | 7 | from pyrelational.model_managers.mcdropout_model_manager import ( 8 | LightningMCDropoutModelManager, 9 | ) 10 | from pyrelational.strategies import Strategy 11 | from tests.strategies._agnostic_strategy_test_cases import TASK_AGNOSTIC_TEST_CASES 12 | from tests.strategies._classification_strategy_test_cases import ( 13 | CLASSIFICATION_TEST_CASES, 14 | ) 15 | from tests.strategies._regression_strategy_test_cases import REGRESSION_TEST_CASES 16 | from tests.test_utils import ( 17 | BreastCancerClassifier, 18 | DiabetesRegressionModel, 19 | get_classification_dataset, 20 | get_regression_dataset, 21 | ) 22 | 23 | 24 | @parameterized_class(TASK_AGNOSTIC_TEST_CASES + CLASSIFICATION_TEST_CASES + REGRESSION_TEST_CASES) 25 | class TestStrategies(TestCase): 26 | """Class containing unit tests of strategies.""" 27 | 28 | task_type: str 29 | strategy_class: Type[Strategy] 30 | strategy_kwargs: Dict[str, Any] 31 | 32 | def setUp(self) -> None: 33 | """Define model and datamanager.""" 34 | if self.task_type == "regression": 35 | model_class = DiabetesRegressionModel 36 | self.datamanager = get_regression_dataset() 37 | else: 38 | model_class = BreastCancerClassifier 39 | self.datamanager = get_classification_dataset() 40 | self.model_manager = LightningMCDropoutModelManager( 41 | model_class, 42 | {"ensemble_size": 3}, 43 | {"epochs": 5}, 44 | ) 45 | self.strategy = self.strategy_class(**self.strategy_kwargs) 46 | 47 | def test_suggest(self) -> None: 48 | """Test that suggest returns the required number of sample indices.""" 49 | out = self.strategy.suggest(num_annotate=5, model_manager=self.model_manager, data_manager=self.datamanager) 50 | self.assertEqual(len(out), 5) 51 | 52 | def test_str_print(self) -> None: 53 | """Check str returns expected string.""" 54 | self.assertEqual(str(self.strategy), f"Strategy: {self.strategy.__class__.__name__}") 55 | --------------------------------------------------------------------------------