├── .github └── workflows │ ├── build_docs.yml │ ├── build_wheels.yml │ ├── cuda │ ├── Linux-env.sh │ └── Linux.sh │ ├── cuda_test_and_codecov.sh │ ├── linting.yml │ └── testing.yml ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── codecov.yml ├── doc ├── .gitignore ├── Makefile ├── api_reference │ ├── center_selector.rst │ ├── gsc_losses.rst │ ├── hopt.rst │ ├── index.rst │ ├── kernels.rst │ ├── mmv_ops.rst │ ├── models.rst │ ├── optimization.rst │ ├── options.rst │ ├── outofcore.rst │ ├── preconditioner.rst │ └── sparse.rst ├── conf.py ├── doc-requirements.txt ├── examples │ ├── custom_kernels.ipynb │ ├── examples.rst │ ├── falkon_cv.ipynb │ ├── falkon_mnist.ipynb │ ├── falkon_regression_tutorial.ipynb │ ├── hyperopt.ipynb │ └── logistic_falkon.ipynb ├── get_started.rst ├── index.rst └── install.rst ├── falkon ├── VERSION ├── __init__.py ├── benchmarks │ ├── .gitignore │ ├── README.md │ ├── __init__.py │ ├── cmdline-args.md │ ├── common │ │ ├── __init__.py │ │ ├── benchmark_utils.py │ │ ├── create_weather_dataset.py │ │ ├── datasets.py │ │ ├── error_metrics.py │ │ └── summary.py │ ├── falkon_benchmarks │ │ ├── benchmark_flights.sh │ │ ├── benchmark_flights_cls.sh │ │ ├── benchmark_flk_1gpu.sh │ │ ├── benchmark_higgs.sh │ │ ├── benchmark_millionsongs.sh │ │ ├── benchmark_mnist.sh │ │ ├── benchmark_runner.py │ │ ├── benchmark_susy.sh │ │ ├── benchmark_taxi.sh │ │ ├── benchmark_timit.sh │ │ └── benchmark_yelp.sh │ ├── models │ │ ├── __init__.py │ │ ├── gpflow_model.py │ │ ├── gpytorch_sgpr.py │ │ └── gpytorch_variational_models.py │ ├── run_hgrad_benchmark.sh │ └── timing_benchmarks │ │ ├── lauum_timings.py │ │ ├── lauum_timings.sh │ │ ├── mm_timings.py │ │ ├── mmv_timings.py │ │ ├── mmv_timings.sh │ │ ├── potrf_timings.py │ │ ├── potrf_timings.sh │ │ ├── time_improvements.py │ │ └── time_improvements.sh ├── c_ext │ ├── __init__.py │ ├── _backend.py │ ├── falkon.cpp │ ├── falkon.h │ └── ops │ │ ├── autograd │ │ └── ag_square_norm.cpp │ │ ├── 
copy_transpose.cpp │ │ ├── copy_transpose.h │ │ ├── copy_triang.cpp │ │ ├── copy_triang.h │ │ ├── cpu │ │ ├── cpu_copy_triang.cpp │ │ ├── cpu_mul_triang.cpp │ │ ├── cpu_potrf.cpp │ │ ├── cpu_sparse_vector_ops.cpp │ │ ├── cpu_square_norm.cpp │ │ └── cpu_vec_mul_triang.cpp │ │ ├── csr2dense.cpp │ │ ├── csr2dense.h │ │ ├── cublas_bindings.cpp │ │ ├── cublas_bindings.h │ │ ├── cuda │ │ ├── cublas_bindings.cu │ │ ├── cublas_bindings.h │ │ ├── cuda_bindings.cu │ │ ├── cuda_copy_transpose.cu │ │ ├── cuda_copy_triang.cu │ │ ├── cuda_csr2dense.cu │ │ ├── cuda_helpers.cuh │ │ ├── cuda_lauum.cu │ │ ├── cuda_mul_triang.cu │ │ ├── cuda_spspmm.cu │ │ ├── cuda_square_norm.cu │ │ ├── cuda_vec_mul_triang.cu │ │ ├── cusolver_bindings.cu │ │ ├── cusolver_bindings.h │ │ └── parallel_potrf.cu │ │ ├── cuda_bindings.cpp │ │ ├── cuda_bindings.h │ │ ├── cusolver_bindings.cpp │ │ ├── cusolver_bindings.h │ │ ├── helpers.h │ │ ├── lauum.cpp │ │ ├── lauum.h │ │ ├── mul_triang.cpp │ │ ├── mul_triang.h │ │ ├── ops.h │ │ ├── potrf.cpp │ │ ├── potrf.h │ │ ├── sparse_vector_ops.cpp │ │ ├── sparse_vector_ops.h │ │ ├── spspmm.cpp │ │ ├── spspmm.h │ │ ├── square_norm.cpp │ │ ├── square_norm.h │ │ ├── vec_mul_triang.cpp │ │ └── vec_mul_triang.h ├── center_selection.py ├── gsc_losses.py ├── hopt │ ├── README.md │ ├── __init__.py │ ├── benchmarking │ │ ├── __init__.py │ │ ├── benchmark_cli.py │ │ ├── runner_gd.py │ │ └── runner_gridsearch.py │ ├── objectives │ │ ├── __init__.py │ │ ├── exact_objectives │ │ │ ├── __init__.py │ │ │ ├── compreg.py │ │ │ ├── gcv.py │ │ │ ├── holdout.py │ │ │ ├── loocv.py │ │ │ ├── new_compreg.py │ │ │ ├── sgpr.py │ │ │ └── utils.py │ │ ├── objectives.py │ │ ├── stoch_objectives │ │ │ ├── __init__.py │ │ │ ├── stoch_new_compreg.py │ │ │ └── utils.py │ │ └── transforms.py │ ├── optimization │ │ ├── __init__.py │ │ ├── gd_train.py │ │ ├── grid_search.py │ │ ├── models.py │ │ └── reporting.py │ └── utils.py ├── kernels │ ├── __init__.py │ ├── diff_kernel.py │ ├── 
distance_kernel.py │ ├── dot_prod_kernel.py │ ├── keops_helpers.py │ ├── kernel.py │ └── precomputed_kernel.py ├── la_helpers │ ├── __init__.py │ ├── cpu_trsm.py │ ├── cuda_trsm.py │ └── wrapper.py ├── mkl_bindings │ ├── __init__.py │ └── mkl_bind.py ├── mmv_ops │ ├── __init__.py │ ├── fmm.py │ ├── fmmv.py │ ├── fmmv_incore.py │ ├── keops.py │ └── utils.py ├── models │ ├── __init__.py │ ├── falkon.py │ ├── incore_falkon.py │ ├── logistic_falkon.py │ └── model_utils.py ├── ooc_ops │ ├── __init__.py │ ├── ooc_lauum.py │ ├── ooc_potrf.py │ ├── ooc_utils.py │ └── parallel_lauum.py ├── optim │ ├── __init__.py │ └── conjgrad.py ├── options.py ├── preconditioner │ ├── __init__.py │ ├── flk_preconditioner.py │ ├── logistic_preconditioner.py │ ├── pc_utils.py │ └── preconditioner.py ├── sparse │ ├── __init__.py │ ├── sparse_ops.py │ └── sparse_tensor.py ├── tests │ ├── __init__.py │ ├── conftest.py │ ├── gen_random.py │ ├── helpers.py │ ├── naive_kernels.py │ ├── test_chol_prec.py │ ├── test_conjgrad.py │ ├── test_custom_kernel.py │ ├── test_device_copy.py │ ├── test_dim_selectors.py │ ├── test_falkon.py │ ├── test_gsc_losses.py │ ├── test_hopt.py │ ├── test_kernels.py │ ├── test_kernels_sparse.py │ ├── test_logistic_falkon.py │ ├── test_matrix_ops.py │ ├── test_mkl.py │ ├── test_nysel.py │ ├── test_ooc_lauum.py │ ├── test_ooc_potrf.py │ ├── test_sparse.py │ ├── test_stress_multi_core.py │ ├── test_trsm_wrapper.py │ └── test_util.py └── utils │ ├── .gitignore │ ├── __init__.py │ ├── device_copy.py │ ├── devices.py │ ├── fake_queue.py │ ├── helpers.py │ ├── stream_utils.py │ ├── switches.py │ ├── tensor_helpers.py │ ├── threading.py │ └── tictoc.py ├── notebooks ├── Airlines.ipynb ├── BatchKernels.ipynb ├── CreateSmallHiggs.ipynb ├── FalkonRegression.ipynb ├── NycTaxiDataset.ipynb ├── SVM Comparison.ipynb ├── UciDatasets.ipynb └── uci_datasets_download.py ├── pyproject.toml └── setup.py /.github/workflows/build_docs.yml: 
-------------------------------------------------------------------------------- 1 | name: Build Sphinx Docs 2 | 3 | on: [workflow_dispatch] 4 | 5 | jobs: 6 | 7 | docs: 8 | runs-on: ${{ matrix.os }} 9 | 10 | strategy: 11 | matrix: 12 | os: [ubuntu-latest] 13 | python-version: ['3.8'] 14 | torch-version: ['2.2.0'] 15 | 16 | steps: 17 | - uses: actions/checkout@v4 18 | 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v5 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | 24 | - name: Install dependencies 25 | run: | 26 | sudo apt install pandoc xvfb 27 | pip install --upgrade setuptools 28 | pip install wheel 29 | 30 | - name: Install PyTorch ${{ matrix.torch-version }} 31 | run: | 32 | pip install torch==${{ matrix.torch-version}}+cpu --extra-index-url https://download.pytorch.org/whl/cpu 33 | 34 | - name: Install main package 35 | run: | 36 | FORCE_ONLY_CPU=1 pip install -e .[doc] 37 | 38 | - name: Build with sphinx 39 | run: | 40 | /sbin/start-stop-daemon --start --quiet --pidfile /tmp/custom_xvfb_99.pid --make-pidfile --background --exec /usr/bin/Xvfb -- :99 -screen 0 1400x900x24 -ac +extension GLX +render -noreset 41 | cd doc/ 42 | make clean && make html 43 | shell: 44 | bash 45 | 46 | - name: Upload documentation 47 | uses: peaceiris/actions-gh-pages@v3 48 | with: 49 | github_token: ${{ secrets.GITHUB_TOKEN }} 50 | publish_dir: ./doc/_build/html/ 51 | -------------------------------------------------------------------------------- /.github/workflows/build_wheels.yml: -------------------------------------------------------------------------------- 1 | name: Building Wheels 2 | 3 | on: 4 | workflow_dispatch: 5 | release: 6 | types: [created] 7 | branches: [master] 8 | 9 | jobs: 10 | wheel: 11 | runs-on: ${{ matrix.os }} 12 | 13 | strategy: 14 | fail-fast: false 15 | matrix: 16 | os: [ubuntu-20.04] 17 | # support version based on: https://download.pytorch.org/whl/torch/ 18 | python-version: ['3.8', '3.9', 
'3.10', '3.11', '3.12'] 19 | torch-version: [2.0.0, 2.1.0, 2.2.0] 20 | cuda-version: ['cpu', 'cu117', 'cu118', 'cu121'] 21 | exclude: 22 | - torch-version: 2.0.0 23 | cuda-version: 'cu121' 24 | - torch-version: 2.0.0 25 | python-version: '3.12' 26 | 27 | - torch-version: 2.1.0 28 | cuda-version: 'cu117' 29 | - torch-version: 2.1.0 30 | python-version: '3.12' 31 | 32 | - torch-version: 2.2.0 33 | cuda-version: 'cu117' 34 | 35 | steps: 36 | - uses: actions/checkout@v4 37 | 38 | - name: Set up Python ${{ matrix.python-version }} 39 | uses: actions/setup-python@v5 40 | with: 41 | python-version: ${{ matrix.python-version }} 42 | 43 | - name: Upgrade pip 44 | run: | 45 | pip install --upgrade setuptools 46 | pip install ninja 47 | 48 | - name: Free up disk space 49 | if: ${{ runner.os == 'Linux' }} 50 | run: | 51 | sudo rm -rf /usr/share/dotnet 52 | sudo rm -rf /usr/local/lib/android 53 | sudo rm -rf /opt/ghc 54 | sudo rm -rf /opt/hostedtoolcache/CodeQL 55 | 56 | - name: Install CUDA ${{ matrix.cuda-version }} 57 | if: ${{ matrix.cuda-version != 'cpu' }} 58 | run: | 59 | bash .github/workflows/cuda/${{ runner.os }}.sh ${{ matrix.cuda-version }} 60 | 61 | - name: Install PyTorch ${{ matrix.torch-version }}+${{ matrix.cuda-version }} 62 | run: | 63 | pip install numpy scipy 64 | pip install torch==${{ matrix.torch-version }} --extra-index-url https://download.pytorch.org/whl/${{ matrix.cuda-version }} 65 | python -c "import torch; print('PyTorch:', torch.__version__)" 66 | python -c "import torch; print('CUDA:', torch.version.cuda)" 67 | python -c "import torch; print('CUDA Available:', torch.cuda.is_available())" 68 | 69 | - name: Install main package for CPU 70 | if: ${{ matrix.cuda-version == 'cpu' }} 71 | run: | 72 | pip install wheel 73 | FORCE_ONLY_CPU=1 pip install --no-build-isolation --editable . 
74 | 75 | - name: Install main package for GPU 76 | if: ${{ matrix.cuda-version != 'cpu' }} 77 | run: | 78 | source .github/workflows/cuda/${{ runner.os }}-env.sh ${{ matrix.cuda-version }} 79 | pip install wheel 80 | pip install --no-build-isolation --editable . 81 | shell: 82 | bash 83 | 84 | - name: Test installation 85 | run: | 86 | python -c "import falkon; print('falkon:', falkon.__version__)" 87 | 88 | - name: Build wheel 89 | run: | 90 | source .github/workflows/cuda/${{ runner.os }}-env.sh ${{ matrix.cuda-version }} 91 | pip install build 92 | python -m build --no-isolation 93 | shell: 94 | bash 95 | 96 | - name: Upload wheel 97 | uses: actions/upload-artifact@v4 98 | with: 99 | name: torch-${{ matrix.torch-version }}_${{ matrix.cuda-version }} 100 | path: dist/*.whl 101 | -------------------------------------------------------------------------------- /.github/workflows/cuda/Linux-env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Took from https://github.com/pyg-team/pyg-lib/ 4 | 5 | case ${1} in 6 | cu121) 7 | export CUDA_HOME=/usr/local/cuda-12.1 8 | export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} 9 | export PATH=${CUDA_HOME}/bin:${PATH} 10 | export TORCH_CUDA_ARCH_LIST="5.0+PTX;6.0;7.0;7.5;8.0;8.6;9.0" 11 | ;; 12 | cu118) 13 | export CUDA_HOME=/usr/local/cuda-11.8 14 | export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} 15 | export PATH=${CUDA_HOME}/bin:${PATH} 16 | export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" 17 | ;; 18 | cu117) 19 | export CUDA_HOME=/usr/local/cuda-11.7 20 | export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} 21 | export PATH=${CUDA_HOME}/bin:${PATH} 22 | export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" 23 | ;; 24 | cu116) 25 | export CUDA_HOME=/usr/local/cuda-11.6 26 | export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} 27 | export PATH=${CUDA_HOME}/bin:${PATH} 28 | export 
TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" 29 | ;; 30 | cu115) 31 | export CUDA_HOME=/usr/local/cuda-11.5 32 | export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} 33 | export PATH=${CUDA_HOME}/bin:${PATH} 34 | export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" 35 | ;; 36 | cu113) 37 | export CUDA_HOME=/usr/local/cuda-11.3 38 | export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} 39 | export PATH=${CUDA_HOME}/bin:${PATH} 40 | export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" 41 | ;; 42 | cu102) 43 | export CUDA_HOME=/usr/local/cuda-10.2 44 | export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} 45 | export PATH=${CUDA_HOME}/bin:${PATH} 46 | export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5" 47 | ;; 48 | *) 49 | ;; 50 | esac 51 | -------------------------------------------------------------------------------- /.github/workflows/cuda/Linux.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Took from https://github.com/pyg-team/pyg-lib/ 4 | 5 | OS=ubuntu2004 6 | 7 | case ${1} in 8 | cu121) 9 | CUDA=12.1 10 | APT_KEY=${OS}-${CUDA/./-}-local 11 | URL=https://developer.download.nvidia.com/compute/cuda/${CUDA}.1/local_installers 12 | FILENAME=cuda-repo-${APT_KEY}_${CUDA}.1-530.30.02-1_amd64.deb 13 | ;; 14 | cu118) 15 | CUDA=11.8 16 | APT_KEY=${OS}-${CUDA/./-}-local 17 | FILENAME=cuda-repo-${APT_KEY}_${CUDA}.0-520.61.05-1_amd64.deb 18 | URL=https://developer.download.nvidia.com/compute/cuda/${CUDA}.0/local_installers 19 | ;; 20 | cu117) 21 | CUDA=11.7 22 | APT_KEY=${OS}-${CUDA/./-}-local 23 | FILENAME=cuda-repo-${APT_KEY}_${CUDA}.1-515.65.01-1_amd64.deb 24 | URL=https://developer.download.nvidia.com/compute/cuda/${CUDA}.1/local_installers 25 | ;; 26 | cu116) 27 | CUDA=11.6 28 | APT_KEY=${OS}-${CUDA/./-}-local 29 | FILENAME=cuda-repo-${APT_KEY}_${CUDA}.2-510.47.03-1_amd64.deb 30 | 
URL=https://developer.download.nvidia.com/compute/cuda/${CUDA}.2/local_installers 31 | ;; 32 | cu115) 33 | CUDA=11.5 34 | APT_KEY=${OS}-${CUDA/./-}-local 35 | FILENAME=cuda-repo-${APT_KEY}_${CUDA}.2-495.29.05-1_amd64.deb 36 | URL=https://developer.download.nvidia.com/compute/cuda/${CUDA}.2/local_installers 37 | ;; 38 | cu113) 39 | CUDA=11.3 40 | APT_KEY=${OS}-${CUDA/./-}-local 41 | FILENAME=cuda-repo-${APT_KEY}_${CUDA}.0-465.19.01-1_amd64.deb 42 | URL=https://developer.download.nvidia.com/compute/cuda/${CUDA}.0/local_installers 43 | ;; 44 | cu102) 45 | CUDA=10.2 46 | APT_KEY=${CUDA/./-}-local-${CUDA}.89-440.33.01 47 | FILENAME=cuda-repo-${OS}-${APT_KEY}_1.0-1_amd64.deb 48 | URL=https://developer.download.nvidia.com/compute/cuda/${CUDA}/Prod/local_installers 49 | ;; 50 | *) 51 | echo "Unrecognized CUDA_VERSION=${1}" 52 | exit 1 53 | ;; 54 | esac 55 | 56 | wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-${OS}.pin 57 | sudo mv cuda-${OS}.pin /etc/apt/preferences.d/cuda-repository-pin-600 58 | wget -nv ${URL}/${FILENAME} 59 | sudo dpkg -i ${FILENAME} 60 | 61 | if [ "${1}" = "cu117" ] || [ "${1}" = "cu118" ] || [ "${1}" = "cu121" ]; then 62 | sudo cp /var/cuda-repo-${APT_KEY}/cuda-*-keyring.gpg /usr/share/keyrings/ 63 | else 64 | sudo apt-key add /var/cuda-repo-${APT_KEY}/7fa2af80.pub 65 | fi 66 | 67 | sudo apt-get update 68 | sudo apt-get -y install cuda 69 | 70 | rm -f ${FILENAME} 71 | -------------------------------------------------------------------------------- /.github/workflows/cuda_test_and_codecov.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Run the test-suite with coverage from the repository root, then upload coverage.xml to codecov. 3 | set -euo pipefail # fail fast on errors, unset variables and broken pipes; -x is left out so xtrace cannot echo $CODECOV_TOKEN into the log 4 | 5 | GIT_ROOT=$(git rev-parse --show-toplevel) 6 | cd "$GIT_ROOT" || exit 3 7 | # Coverage settings are read from pyproject.toml (same file the testing.yml workflow passes to --cov-config). 8 | pytest --cov-report=term-missing --cov-report=xml:coverage.xml --cov=falkon --cov-config pyproject.toml 9 | 10 | echo "$(date) || Uploading test-data to codecov..." 
11 | curl -s https://codecov.io/bash | bash -s -- -c -f coverage.xml -t "$CODECOV_TOKEN" 12 | echo "$(date) || Data uploaded." 13 | -------------------------------------------------------------------------------- /.github/workflows/linting.yml: -------------------------------------------------------------------------------- 1 | name: Linting 2 | 3 | on: [push, pull_request] 4 | 5 | permissions: 6 | contents: read 7 | 8 | jobs: 9 | linting: 10 | runs-on: ubuntu-20.04 11 | 12 | steps: 13 | - uses: actions/checkout@v4 14 | - name: Set up Python 15 | uses: actions/setup-python@v5 16 | with: 17 | python-version: "3.8" 18 | 19 | - name: Install dependencies 20 | run: pip install ruff==0.0.284 isort==5.12.0 black==23.1.0 21 | 22 | - name: Run isort 23 | run: isort falkon/ --line-length 120 --check --profile black 24 | 25 | - name: Run ruff linter 26 | run: ruff check falkon/ 27 | 28 | - name: Run black linter 29 | run: black falkon/ --line-length 120 --check 30 | -------------------------------------------------------------------------------- /.github/workflows/testing.yml: -------------------------------------------------------------------------------- 1 | name: Testing 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | pytest: 7 | runs-on: ${{ matrix.os }} 8 | 9 | strategy: 10 | matrix: 11 | os: [ubuntu-20.04] 12 | python-version: ['3.8'] 13 | torch-version: [2.0.0, 2.2.0] 14 | 15 | steps: 16 | - name: Checkout with submodules 17 | uses: actions/checkout@v4 18 | 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v5 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | 24 | - name: Free up disk space 25 | if: ${{ runner.os == 'Linux' }} 26 | run: | 27 | sudo rm -rf /usr/share/dotnet 28 | 29 | - name: Setup venv 30 | run: | 31 | python -m venv flk-env 32 | source flk-env/bin/activate 33 | 34 | - name: Install setup dependencies (PyTorch ${{ matrix.torch-version }}+cpu) 35 | run: | 36 | source flk-env/bin/activate 37 | echo 
"Python is located at $(which python)" 38 | pip install --upgrade pip setuptools wheel ninja 39 | pip install numpy 40 | pip install torch==${{ matrix.torch-version }} --extra-index-url https://download.pytorch.org/whl/cpu 41 | python -c "import torch; print('PyTorch:', torch.__version__)" 42 | python -c "import torch; print('CUDA:', torch.version.cuda)" 43 | python -c "import torch; print('CUDA Available:', torch.cuda.is_available())" 44 | 45 | - name: Install main package 46 | run: | 47 | source flk-env/bin/activate 48 | pip install --no-build-isolation --editable .[test] 49 | 50 | - name: Run test-suite 51 | run: | 52 | source flk-env/bin/activate 53 | pytest --cov-report=term-missing --cov-report=xml:coverage.xml --cov=falkon --cov-config pyproject.toml 54 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Alessandro Rudi 4 | Copyright (c) 2024 Giacomo Meanti 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.md 3 | include falkon/VERSION 4 | 5 | recursive-include falkon/c_ext *.h *.cpp *.cu *.cuh 6 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | precision: 2 3 | round: down 4 | range: "70...100" 5 | status: 6 | project: 7 | default: 8 | target: auto 9 | threshold: 0.01 10 | patch: false 11 | changes: false 12 | ignore: 13 | - "*/tests/*" 14 | comment: 15 | layout: "header, diff, sunburst, uncovered" 16 | behavior: default 17 | -------------------------------------------------------------------------------- /doc/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | _build/ 3 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | GITHUB_PAGES_BRANCH = gh-pages 12 | OUTPUTDIR = _build/html 13 | 14 | # Put it first so that "make" without argument is like "make help". 
15 | help: 16 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 17 | 18 | .PHONY: help Makefile 19 | 20 | # Catch-all target: route all unknown targets to Sphinx using the new 21 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 22 | %: Makefile 23 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 24 | 25 | install: 26 | touch $(OUTPUTDIR)/.nojekyll 27 | ghp-import -m "[skip ci] Update documentation" -b $(GITHUB_PAGES_BRANCH) $(OUTPUTDIR) 28 | git push --force origin $(GITHUB_PAGES_BRANCH) 29 | 30 | -------------------------------------------------------------------------------- /doc/api_reference/center_selector.rst: -------------------------------------------------------------------------------- 1 | falkon.center_selection 2 | ======================= 3 | 4 | .. automodule:: falkon.center_selection 5 | 6 | .. py:currentmodule:: falkon.center_selection 7 | 8 | 9 | CenterSelector 10 | -------------- 11 | 12 | .. autoclass:: falkon.center_selection.CenterSelector 13 | :members: 14 | 15 | 16 | UniformSelector 17 | --------------- 18 | 19 | .. autoclass:: falkon.center_selection.UniformSelector 20 | :members: 21 | :inherited-members: 22 | :show-inheritance: 23 | 24 | 25 | FixedSelector 26 | ------------- 27 | 28 | .. autoclass:: falkon.center_selection.FixedSelector 29 | :members: 30 | :inherited-members: ABC 31 | :show-inheritance: 32 | -------------------------------------------------------------------------------- /doc/api_reference/gsc_losses.rst: -------------------------------------------------------------------------------- 1 | falkon.gsc_losses 2 | ================= 3 | 4 | .. automodule:: falkon 5 | 6 | .. py:currentmodule:: falkon 7 | 8 | 9 | Loss 10 | ---- 11 | 12 | .. autoclass:: falkon.gsc_losses.Loss 13 | :members: 14 | :special-members: __call__ 15 | 16 | 17 | Logistic loss 18 | ------------- 19 | 20 | .. 
autoclass:: falkon.gsc_losses.LogisticLoss 21 | :members: 22 | :special-members: __call__ 23 | 24 | 25 | Weighted binary cross entropy loss 26 | ---------------------------------- 27 | 28 | .. autoclass:: falkon.gsc_losses.WeightedCrossEntropyLoss 29 | :members: 30 | :special-members: __call__ 31 | -------------------------------------------------------------------------------- /doc/api_reference/hopt.rst: -------------------------------------------------------------------------------- 1 | 2 | 3 | falkon.hopt 4 | ============== 5 | 6 | .. automodule:: falkon.hopt 7 | .. py:currentmodule:: falkon.hopt 8 | 9 | 10 | Objectives 11 | ---------- 12 | 13 | .. autoclass:: falkon.hopt.objectives.objectives.HyperoptObjective 14 | 15 | 16 | Nystrom Complexity Regularization 17 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 18 | 19 | .. autoclass:: falkon.hopt.objectives.NystromCompReg 20 | 21 | 22 | Stochastic Nystrom Computational Regularization 23 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 24 | 25 | .. autoclass:: falkon.hopt.objectives.StochasticNystromCompReg 26 | 27 | 28 | Complexity Regularization 29 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 30 | 31 | .. autoclass:: falkon.hopt.objectives.CompReg 32 | 33 | Generalized Cross Validation 34 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 35 | 36 | .. autoclass:: falkon.hopt.objectives.GCV 37 | 38 | Hold Out Cross Validation 39 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 40 | 41 | .. autoclass:: falkon.hopt.objectives.HoldOut 42 | 43 | 44 | Leave One Out Cross Validation 45 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 46 | 47 | .. autoclass:: falkon.hopt.objectives.LOOCV 48 | 49 | SGPR 50 | ~~~~ 51 | 52 | .. autoclass:: falkon.hopt.objectives.SGPR 53 | -------------------------------------------------------------------------------- /doc/api_reference/index.rst: -------------------------------------------------------------------------------- 1 | .. _api_reference: 2 | 3 | API Reference 4 | ============= 5 | 6 | .. 
toctree:: 7 | :maxdepth: 3 8 | 9 | models 10 | kernels 11 | options 12 | gsc_losses 13 | preconditioner 14 | optimization 15 | outofcore 16 | mmv_ops 17 | sparse 18 | center_selector 19 | hopt 20 | -------------------------------------------------------------------------------- /doc/api_reference/kernels.rst: -------------------------------------------------------------------------------- 1 | 2 | 3 | falkon.kernels 4 | ============== 5 | 6 | .. automodule:: falkon.kernels 7 | .. py:currentmodule:: falkon.kernels 8 | 9 | 10 | Kernel 11 | ------ 12 | 13 | .. autoclass:: falkon.kernels.kernel.Kernel 14 | :members: 15 | :private-members: _decide_mm_impl, _decide_mmv_impl, _decide_dmmv_impl 16 | :special-members: __call__ 17 | 18 | DiffKernel 19 | ---------- 20 | 21 | .. autoclass:: falkon.kernels.diff_kernel.DiffKernel 22 | :members: compute_diff, detach, diff_params 23 | 24 | KeopsKernelMixin 25 | ---------------- 26 | 27 | .. autoclass:: falkon.kernels.keops_helpers.KeopsKernelMixin 28 | :members: keops_mmv, keops_mmv_impl 29 | 30 | 31 | Radial kernels 32 | -------------- 33 | 34 | Gaussian kernel 35 | ~~~~~~~~~~~~~~~ 36 | 37 | .. autoclass:: GaussianKernel 38 | :members: mmv, dmmv 39 | :special-members: __call__ 40 | 41 | Laplacian kernel 42 | ~~~~~~~~~~~~~~~~ 43 | 44 | .. autoclass:: LaplacianKernel 45 | :members: mmv, dmmv 46 | :special-members: __call__ 47 | 48 | Matern kernel 49 | ~~~~~~~~~~~~~ 50 | 51 | .. autoclass:: MaternKernel 52 | :members: mmv, dmmv 53 | :special-members: __call__ 54 | 55 | 56 | Dot-Product kernels 57 | ------------------- 58 | 59 | Polynomial kernel 60 | ~~~~~~~~~~~~~~~~~ 61 | 62 | .. autoclass:: PolynomialKernel 63 | :members: mmv, dmmv 64 | :special-members: __call__ 65 | 66 | Linear kernel 67 | ~~~~~~~~~~~~~ 68 | 69 | .. autoclass:: LinearKernel 70 | :members: mmv, dmmv 71 | :special-members: __call__ 72 | 73 | Sigmoid kernel 74 | ~~~~~~~~~~~~~~~~~~ 75 | 76 | .. 
autoclass:: SigmoidKernel 77 | :members: mmv, dmmv 78 | :special-members: __call__ 79 | -------------------------------------------------------------------------------- /doc/api_reference/mmv_ops.rst: -------------------------------------------------------------------------------- 1 | falkon.mmv_ops 2 | ============== 3 | 4 | The algorithms to compute kernels and kernel-vector products blockwise on GPUs and CPU. The algorithms in this module 5 | are kernel agnostic. Refer to :mod:`falkon.kernels` for the actual kernel implementations. 6 | 7 | The KeOps wrapper only supports the `mmv` operation (kernel-vector products). The matrix-multiplication implementations 8 | instead support three different operations: 9 | 10 | - `mm` which calculates the full kernel 11 | - `mmv` which calculates kernel-vector products 12 | - `dmmv` which calculates double kernel-vector products (which are operations like :math:`K^\top (K v)` where 13 | :math:`K` is a kernel matrix and :math:`v` is a vector). 14 | 15 | .. automodule:: falkon.mmv_ops 16 | .. py:currentmodule:: falkon.mmv_ops 17 | 18 | run_keops_mmv 19 | ------------- 20 | 21 | A thin wrapper to KeOps is provided to allow for block-splitting and multiple GPU usage. This only supports 22 | kernel-vector products. 23 | 24 | .. autofunction:: falkon.mmv_ops.keops.run_keops_mmv 25 | 26 | 27 | fmm 28 | --- 29 | 30 | Block-wise kernel calculation. If the inputs require gradient, this function uses a differentiable implementation. 31 | 32 | .. autofunction:: falkon.mmv_ops.fmm.fmm 33 | 34 | 35 | 36 | fmmv 37 | ---- 38 | 39 | Block-wise kernel-vector products. 40 | 41 | .. autofunction:: falkon.mmv_ops.fmmv.fmmv 42 | 43 | 44 | 45 | fdmmv 46 | ----- 47 | 48 | Block-wise double kernel-vector products. 49 | 50 | .. autofunction:: falkon.mmv_ops.fmmv.fdmmv 51 | 52 | 53 | 54 | incore_fmmv 55 | ----------- 56 | 57 | .. autofunction:: falkon.mmv_ops.fmmv_incore.incore_fmmv 58 | 59 | 60 | incore_fdmmv 61 | ------------ 62 | 63 | .. 
autofunction:: falkon.mmv_ops.fmmv_incore.incore_fdmmv 64 | 65 | 66 | Low-level functions 67 | ------------------- 68 | 69 | The following are some of the low-level functions which help compute kernels and kernel-vector products block-wise. 70 | They are specialized for different input types. 71 | 72 | .. autofunction:: falkon.mmv_ops.fmm.sparse_mm_run_thread 73 | 74 | .. autofunction:: falkon.mmv_ops.fmmv.sparse_mmv_run_thread 75 | -------------------------------------------------------------------------------- /doc/api_reference/models.rst: -------------------------------------------------------------------------------- 1 | falkon.models 2 | ============= 3 | 4 | .. automodule:: falkon.models 5 | .. currentmodule:: falkon.models 6 | 7 | 8 | .. _falkon_model: 9 | 10 | Falkon 11 | ------ 12 | 13 | .. autoclass:: Falkon 14 | :members: 15 | :inherited-members: 16 | 17 | .. _log_falkon_model: 18 | 19 | LogisticFalkon 20 | -------------- 21 | 22 | .. autoclass:: LogisticFalkon 23 | :members: 24 | :inherited-members: 25 | 26 | .. _ic_falkon_model: 27 | 28 | InCoreFalkon 29 | ------------ 30 | 31 | .. autoclass:: InCoreFalkon 32 | :members: 33 | :inherited-members: 34 | -------------------------------------------------------------------------------- /doc/api_reference/optimization.rst: -------------------------------------------------------------------------------- 1 | falkon.optim 2 | ============ 3 | 4 | .. automodule:: falkon.optim 5 | .. py:currentmodule:: falkon.optim 6 | 7 | Optimizer 8 | --------- 9 | 10 | .. autoclass:: Optimizer 11 | :members: 12 | 13 | 14 | Conjugate gradient methods 15 | -------------------------- 16 | 17 | ConjugateGradient 18 | ~~~~~~~~~~~~~~~~~ 19 | 20 | .. autoclass:: ConjugateGradient 21 | :members: 22 | 23 | FalkonConjugateGradient 24 | ~~~~~~~~~~~~~~~~~~~~~~~ 25 | 26 | .. 
autoclass:: FalkonConjugateGradient 27 | :members: 28 | -------------------------------------------------------------------------------- /doc/api_reference/options.rst: -------------------------------------------------------------------------------- 1 | .. _api_options: 2 | 3 | falkon.options 4 | ============== 5 | 6 | .. automodule:: falkon.options 7 | .. py:currentmodule:: falkon.options 8 | 9 | 10 | FalkonOptions 11 | ------------- 12 | 13 | .. autoclass:: FalkonOptions 14 | :members: 15 | -------------------------------------------------------------------------------- /doc/api_reference/outofcore.rst: -------------------------------------------------------------------------------- 1 | falkon.ooc_ops 2 | ============== 3 | 4 | The out-of-core algorithms for the Cholesky decomposition and the LAUUM operation are crucial for speeding up our 5 | library. To find out more about how they work, check the source code: 6 | 7 | - `Out of core Cholesky `_ (CUDA code) 8 | - `Out of core LAUUM `_ (Python code) 9 | 10 | The following functions provide a higher-level interface to the two operations. 11 | 12 | .. automodule:: falkon.ooc_ops 13 | .. py:currentmodule:: falkon.ooc_ops 14 | 15 | 16 | gpu_cholesky 17 | ------------ 18 | 19 | .. autofunction:: gpu_cholesky 20 | 21 | 22 | gpu_lauum 23 | --------- 24 | 25 | .. autofunction:: gpu_lauum 26 | -------------------------------------------------------------------------------- /doc/api_reference/preconditioner.rst: -------------------------------------------------------------------------------- 1 | falkon.preconditioner 2 | ===================== 3 | 4 | .. automodule:: falkon.preconditioner 5 | .. py:currentmodule:: falkon.preconditioner 6 | 7 | Preconditioner 8 | -------------- 9 | 10 | .. autoclass:: falkon.preconditioner.preconditioner.Preconditioner 11 | :members: 12 | 13 | 14 | Cholesky preconditioners 15 | ------------------------ 16 | 17 | FalkonPreconditioner 18 | ~~~~~~~~~~~~~~~~~~~~ 19 | 20 | .. 
autoclass:: FalkonPreconditioner 21 | :members: 22 | :inherited-members: ABC 23 | 24 | LogisticPreconditioner 25 | ~~~~~~~~~~~~~~~~~~~~~~ 26 | 27 | .. autoclass:: LogisticPreconditioner 28 | :members: 29 | :inherited-members: ABC 30 | -------------------------------------------------------------------------------- /doc/api_reference/sparse.rst: -------------------------------------------------------------------------------- 1 | 2 | falkon.sparse 3 | ============= 4 | 5 | .. automodule:: falkon.sparse 6 | .. currentmodule:: falkon.sparse 7 | 8 | SparseTensor 9 | ------------ 10 | 11 | .. autoclass:: falkon.sparse.sparse_tensor.SparseTensor 12 | :members: 13 | 14 | .. autoclass:: falkon.sparse.sparse_tensor.SparseType 15 | :members: 16 | 17 | 18 | Sparse operations 19 | ----------------- 20 | 21 | .. autofunction:: sparse_matmul 22 | 23 | .. autofunction:: sparse_square_norm 24 | 25 | .. autofunction:: sparse_norm 26 | -------------------------------------------------------------------------------- /doc/doc-requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | nbsphinx 3 | numpydoc 4 | sphinx-rtd-theme 5 | pandas 6 | matplotlib 7 | jupyter 8 | -------------------------------------------------------------------------------- /doc/examples/examples.rst: -------------------------------------------------------------------------------- 1 | .. _examples: 2 | 3 | Examples 4 | ======== 5 | 6 | .. toctree:: 7 | :maxdepth: 1 8 | :hidden: 9 | 10 | ./falkon_regression_tutorial.ipynb 11 | ./logistic_falkon.ipynb 12 | ./falkon_cv.ipynb 13 | ./custom_kernels.ipynb 14 | ./hyperopt.ipynb 15 | ./falkon_mnist.ipynb 16 | 17 | .. _Kernel ridge regression: 18 | ./falkon_regression_tutorial.ipynb 19 | 20 | .. _Logistic Falkon tutorial: 21 | ./logistic_falkon.ipynb 22 | 23 | .. _Hyperparameter tuning: 24 | ./falkon_cv.ipynb 25 | 26 | .. _custom kernels: 27 | ./custom_kernels.ipynb 28 | 29 | .. 
_Gradient hyperopt: 30 | ./hyperopt.ipynb 31 | 32 | .. _MNIST example: 33 | ./falkon_mnist.ipynb 34 | 35 | 36 | Starting with simple kernel ridge regression, via classification, hyperparameter tuning, to large-scale GPU experiments, 37 | these notebooks cover all there is to know about Falkon. 38 | 39 | - `Kernel ridge regression`_ goes through the basic notions of the library with a simple example; 40 | - `Logistic Falkon tutorial`_ shows how to use the Logistic Falkon estimator, comparing the results with normal Falkon; 41 | - `Hyperparameter tuning`_ is a fully worked out example of optimizing hyperparameters with cross-validation for a real-world multi-class problem; 42 | - `custom kernels`_ will walk you through the implementation of a custom kernel; 43 | - `Gradient hyperopt`_: a tutorial on using the :mod:`~falkon.hopt` module for gradient-based hyperparameter optimization in Falkon; 44 | - `MNIST example`_: A simple tutorial on using Falkon for MNIST digit classification. 45 | 46 | -------------------------------------------------------------------------------- /doc/get_started.rst: -------------------------------------------------------------------------------- 1 | .. _get_started: 2 | 3 | Getting Started 4 | =============== 5 | 6 | Once Falkon is installed, getting started is easy. The basic setup to use the `Falkon` estimator only requires 7 | a few lines of code: 8 | 9 | ..
code-block:: python 10 | 11 | import torch 12 | from sklearn.datasets import load_boston 13 | from falkon import Falkon, kernels 14 | 15 | X, Y = load_boston(return_X_y=True) 16 | X = torch.from_numpy(X) 17 | Y = torch.from_numpy(Y).reshape(-1, 1) 18 | 19 | kernel = kernels.GaussianKernel(sigma=1.0) 20 | model = Falkon( 21 | kernel=kernel, 22 | penalty=1e-6, 23 | M=100, 24 | ) 25 | model.fit(X, Y) 26 | predictions = model.predict(X) 27 | 28 | 29 | Passing Options 30 | ~~~~~~~~~~~~~~~ 31 | 32 | A number of different options exist for both the :ref:`Falkon <falkon_model>` and :ref:`LogisticFalkon <log_falkon_model>` 33 | estimators (see :ref:`falkon.FalkonOptions <api_options>`). 34 | All options can be passed to the estimator through the :class:`~falkon.options.FalkonOptions` class, like so: 35 | 36 | .. code-block:: python 37 | 38 | from falkon import FalkonOptions, Falkon, kernels 39 | 40 | # Options to: increase the amount of output information; avoid using the KeOps library 41 | options = FalkonOptions(debug=True, keops_active="no") 42 | kernel = kernels.GaussianKernel(sigma=1.0) 43 | 44 | model = Falkon(kernel=kernel, 45 | penalty=1e-6, 46 | M=100, 47 | maxiter=10, # Set the maximum number of conjugate gradient iterations to 10 48 | options=options) 49 | 50 | 51 | More Examples 52 | ~~~~~~~~~~~~~ 53 | 54 | For more detailed examples, have a look at the :ref:`example notebooks <examples>`. 55 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. falkon documentation master file, created by 2 | sphinx-quickstart on Thu Jun 18 10:38:58 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | ================================== 7 | Falkon 8 | ================================== 9 | A Python library for large-scale kernel methods, with optional (multi-)GPU acceleration.
10 | 11 | The library currently includes two solvers: 12 | one for approximate kernel ridge regression :ref:`[2] <flk_2>` which is extremely fast, and one for kernel logistic 13 | regression :ref:`[3] <log_flk>` which trades off lower speed for better accuracy on binary classification problems. 14 | 15 | The main features of Falkon are: 16 | 17 | * *Full multi-GPU support* - All compute-intensive parts of the algorithms are multi-GPU capable. 18 | * *Extreme scalability* - Unlike other kernel solvers, we keep memory usage in check. We have tested the library with 19 | datasets of billions of points. 20 | * *Sparse data support* 21 | * *Scikit-learn integration* - Our estimators follow the scikit-learn API 22 | 23 | For more details about the algorithms used, you can read :ref:`our paper <flk_1>`, or look at the source code 24 | `github.com/FalkonML/falkon <https://github.com/FalkonML/falkon>`_ and at the :ref:`documentation `. 25 | Also, make sure to follow :ref:`the example notebooks <examples>` to find out about all of Falkon's features. 26 | 27 | Falkon is built on top of `PyTorch `__ which is used to support both CPU and GPU tensor 28 | calculations, and `KeOps `__ for fast kernel evaluations on the GPU. 29 | 30 | If you find this library useful for your research, please cite our paper :ref:`[1] <flk_1>`! 31 | 32 | Contents 33 | ======== 34 | 35 | .. toctree:: 36 | :maxdepth: 2 37 | 38 | install 39 | get_started 40 | examples/examples 41 | api_reference/index 42 | 43 | 44 | References 45 | ========== 46 | 47 | .. _flk_1: 48 | 49 | Giacomo Meanti, Luigi Carratino, Lorenzo Rosasco, Alessandro Rudi, 50 | "Kernel methods through the roof: handling billions of points efficiently," 51 | Advances in Neural Information Processing Systems, 2020. 52 | 53 | .. _flk_2: 54 | 55 | Alessandro Rudi, Luigi Carratino, Lorenzo Rosasco, "FALKON: An optimal large scale kernel method," 56 | Advances in Neural Information Processing Systems, 2017. 57 | 58 | ..
_log_flk: 59 | 60 | Ulysse Marteau-Ferey, Francis Bach, Alessandro Rudi, "Globally Convergent Newton Methods for Ill-conditioned 61 | Generalized Self-concordant Losses," Advances in Neural Information Processing Systems, 2019. 62 | 63 | 64 | Indices and tables 65 | ================== 66 | 67 | * :ref:`genindex` 68 | * :ref:`modindex` 69 | * :ref:`search` 70 | -------------------------------------------------------------------------------- /doc/install.rst: -------------------------------------------------------------------------------- 1 | .. _install: 2 | 3 | Install 4 | ======= 5 | 6 | Supported Platforms 7 | ------------------- 8 | 9 | Falkon is only tested on Linux. 10 | 11 | GPU support is achieved via CUDA, so using Falkon from Windows will be tricky. 12 | 13 | Using the library from the CPU only is much more portable since there is no CUDA requirement, and it will likely work 14 | on Windows and Mac OS more easily. 15 | 16 | Prerequisites 17 | ------------- 18 | 19 | PyTorch and CUDA 20 | ~~~~~~~~~~~~~~~~ 21 | Falkon depends on PyTorch, and on the NVIDIA Toolkit (for NVIDIA GPU support) which is usually installed 22 | alongside PyTorch. 23 | PyTorch can be installed in several ways by following the instructions at `Install PyTorch `__. 24 | If GPU support is desired, make sure that PyTorch CUDA bindings are working: 25 | 26 | .. code-block:: python 27 | 28 | import torch 29 | assert torch.cuda.is_available() 30 | 31 | Intel MKL 32 | ~~~~~~~~~ 33 | If Falkon is not installed with GPU support, it will try to link to `Intel MKL `__ 34 | to speed-up certain sparse operations. MKL shared libraries are usually distributed with numpy, so this should not be a problem. 35 | In case sparse matrix support is not needed, the MKL library will not be loaded. 36 | 37 | 38 | 39 | Installing 40 | ---------- 41 | 42 | There are three ways of installing Falkon: 43 | 44 | 1. **From source** by running 45 | 46 | .. 
code-block:: bash 47 | 48 | $ pip install --no-build-isolation git+https://github.com/falkonml/falkon.git 49 | 50 | 2. **From pypi with JIT compilation** (the C++ extension will be compiled when the library is first used) **BROKEN!!**: 51 | 52 | .. code-block:: bash 53 | 54 | $ pip install falkon 55 | 56 | 3. **From pre-built wheels** which are available for the following versions of PyTorch and CUDA: 57 | 58 | ============== ========= ========= ========= ========= 59 | Linux `cu116` `cu117` `cu118` `cu121` 60 | ============== ========= ========= ========= ========= 61 | torch 1.13.0 ✅ ✅ 62 | torch 2.0.0 ✅ ✅ 63 | torch 2.1.0 ✅ ✅ 64 | ============== ========= ========= ========= ========= 65 | 66 | As an example, if you **already have installed** PyTorch 1.13 and CUDA 11.7 on your system, you should run 67 | 68 | .. code-block:: bash 69 | 70 | $ pip install falkon -f https://falkon.dibris.unige.it/torch-1.13.0_cu117.html 71 | 72 | Similarly for **CPU-only packages** 73 | 74 | .. code-block:: bash 75 | 76 | $ pip install falkon -f https://falkon.dibris.unige.it/torch-2.0.0_cpu.html 77 | 78 | Please check `here `__ for a list of supported wheels. 79 | 80 | For options 1 and 2, you will need the CUDA toolkit to be set up properly on your system in order to compile the sources. 81 | Compilation may take a few minutes. To speed it up you can try to install ``ninja`` (``pip install ninja``) which 82 | parallelizes the build process. 83 | 84 | 85 | Testing the installation 86 | ------------------------ 87 | 88 | To check that everything works correctly you can follow the `Kernel ridge regression `_ notebook. 89 | 90 | 91 | 92 | Development 93 | ----------- 94 | 95 | For development purposes the library should be installed in editable mode (i.e. `pip install -e .` from the 96 | falkon directory). 97 | 98 | To build the documentation go into the `doc` directory and run `make html`.
99 | -------------------------------------------------------------------------------- /falkon/VERSION: -------------------------------------------------------------------------------- 1 | 0.8.5 2 | -------------------------------------------------------------------------------- /falkon/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from .options import FalkonOptions # isort:skip 4 | from . import ( # isort:skip 5 | center_selection, 6 | sparse, 7 | kernels, 8 | preconditioner, 9 | optim, 10 | gsc_losses, 11 | hopt, 12 | ) 13 | from .models import Falkon, InCoreFalkon, LogisticFalkon # isort:skip 14 | 15 | # Set __version__ attribute on the package 16 | init_dir = os.path.dirname(os.path.abspath(__file__)) 17 | with open(os.path.join(init_dir, "VERSION")) as version_file: 18 | __version__ = version_file.read().strip() 19 | 20 | __all__ = ( 21 | "Falkon", 22 | "LogisticFalkon", 23 | "InCoreFalkon", 24 | "FalkonOptions", 25 | "kernels", 26 | "optim", 27 | "preconditioner", 28 | "center_selection", 29 | "sparse", 30 | "gsc_losses", 31 | "hopt", 32 | "__version__", 33 | ) 34 | -------------------------------------------------------------------------------- /falkon/benchmarks/.gitignore: -------------------------------------------------------------------------------- 1 | *.json 2 | *.txt 3 | *.png 4 | *.npy 5 | logs/* -------------------------------------------------------------------------------- /falkon/benchmarks/README.md: -------------------------------------------------------------------------------- 1 | ## Benchmark Scripts 2 | 3 | This folder contains the code necessary to reproduce the benchmark results of the paper: [Kernel methods through the roof: handling billions of points efficiently](https://arxiv.org/abs/2006.10350). 
class DataType(Enum):
    """Floating-point precision selector used by the benchmark scripts.

    Members whose value is below 10 (``single``, ``float32``) map to 32-bit
    floats; the remaining members (``double``, ``float64``) map to 64-bit
    floats. All four members have distinct values, so none is an alias.
    """

    single = 1
    float32 = 2

    double = 11
    float64 = 12

    def to_torch_dtype(self):
        """Return ``torch.float32`` or ``torch.float64`` for this member."""
        # torch is imported lazily so the enum can be used without it.
        import torch

        return torch.float32 if self.value < 10 else torch.float64

    def to_numpy_dtype(self):
        """Return ``np.float32`` or ``np.float64`` for this member."""
        # numpy is imported lazily, mirroring ``to_torch_dtype``.
        import numpy as np

        return np.float32 if self.value < 10 else np.float64

    def __str__(self):
        """Render the member as its bare name (e.g. ``"float32"``)."""
        return self.name

    def __repr__(self):
        """Same text as ``str(self)``."""
        return self.__str__()

    @staticmethod
    def argparse(s):
        """Look up a member by name, returning ``s`` unchanged if there is none.

        The unknown value is handed back instead of raising, so the caller
        decides how to report it.
        """
        return DataType.__members__.get(s, s)
class VariationalDistribution(Enum):
    """Names of the variational distributions selectable in the benchmarks.

    Each member's value is the string used to refer to it; ``str()`` and
    ``repr()`` both render that string.
    """

    FULL = "full"
    DIAG = "diag"
    DELTA = "delta"
    NATGRAD = "natgrad"
    TRIL_NATGRAD = "tril_natgrad"

    def __str__(self):
        """Render the member as its string value (e.g. ``"diag"``)."""
        return self.value

    def __repr__(self):
        """Same text as ``str(self)``."""
        return self.__str__()
def get_writer(name=None):
    """Return the module-wide ``SummaryWriter``, creating it on first use.

    The writer is cached in the module-level ``_writer`` variable. ``name``
    is only consulted when the writer is created: if given, logs go to
    ``LOG_DIR/name``, otherwise to ``LOG_DIR``. Later calls return the cached
    writer whatever ``name`` they pass.
    """
    global _writer
    if _writer is None:
        target_dir = LOG_DIR if name is None else os.path.join(LOG_DIR, name)
        _writer = SummaryWriter(log_dir=target_dir, max_queue=5, flush_secs=30)
    return _writer
#!/bin/bash
# Driver for the "taxi" dataset benchmarks. Each section below is switched on
# or off by the literal comparison in its `if [ ... ]` line.

# Log files are appended under ./logs
[ -d logs ] || mkdir logs

# Make `conda activate` available in this non-interactive shell
CONDA_BASE=$(conda info --base)
source $CONDA_BASE/etc/profile.d/conda.sh

DSET="taxi"
PY_LAUNCHER="benchmark_runner.py"
TRAINING_POINTS=1000000000

export CUDA_VISIBLE_DEVICES="0,1"

# Falkon, float64 (disabled)
if [ true = false ]; then
    ALGO="falkon"
    M=80000
    TYPE="float64"
    OUTFILE="logs/${DSET}_${ALGO}_${M}_${TYPE}.txt"
    conda activate torch
    PYTHONPATH='..' python $PY_LAUNCHER -a $ALGO -M $M -e 20 -d $DSET -t $TYPE --sigma 1 --penalty 1e-7 --kernel laplacian 2>&1 | tee -a $OUTFILE
    conda deactivate
fi

# Falkon, float32 (disabled): same configuration repeated for seeds 12..16
if [ false = true ]; then
    ALGO="falkon"
    M=100000
    TYPE="float32"
    OUTFILE="logs/${DSET}_${ALGO}_${M}_${TYPE}.txt"
    conda activate torch
    for SEED in 12 13 14 15 16; do
        PYTHONPATH='..' python $PY_LAUNCHER -a $ALGO -M $M -e 10 -d $DSET -t $TYPE \
            --sigma 1.0 --penalty 2e-7 --kernel gaussian --seed $SEED 2>&1 | tee -a $OUTFILE
    done
    conda deactivate
fi

# GPyTorch regression (enabled)
if [ true = true ]; then
    ALGO="gpytorch-reg"
    M=1000
    VAR="natgrad"
    OUTFILE="logs/${DSET}_${ALGO}_${M}_${VAR}.txt"
    BATCH_SIZE=32000
    LR=0.002
    NATGRAD_LR=0.002
    EPOCHS=5
    conda activate torch
    # Seed 12 is not run; only seeds 13 and 14 are.
    for SEED in 13 14; do
        PYTHONPATH='..' python $PY_LAUNCHER -a $ALGO -d $DSET -M $M \
            --batch-size $BATCH_SIZE --var-dist $VAR --lr $LR --natgrad-lr $NATGRAD_LR --sigma 1 \
            --epochs $EPOCHS --learn-hyperparams --seed $SEED 2>&1 | tee -a $OUTFILE
    done
    conda deactivate
fi

# GPFlow regression (disabled)
if [ false = true ]; then
    ALGO="gpflow-reg"
    M=1000
    VAR="diag"
    OUTFILE="logs/${DSET}_${ALGO}_${M}_${VAR}.txt"
    BATCH_SIZE=32000
    # Iteration count: (training points / batch size) * 10
    EPOCHS=$(( TRAINING_POINTS / BATCH_SIZE * 10 ))
    ERROR_EVERY=30000 # This is one epoch
    conda activate gpflow
    PYTHONPATH='..' python $PY_LAUNCHER -a $ALGO -d $DSET -M $M \
        --var-dist ${VAR} --sigma 1 --batch-size $BATCH_SIZE \
        --lr 0.003 --natgrad-lr 0.0000 --epochs $EPOCHS --error-every $ERROR_EVERY \
        --learn-hyperparams --seed 16 2>&1 | tee -a $OUTFILE
    conda deactivate
fi
python $PY_LAUNCHER -a $ALGO -M $M -e 20 -d $DSET \ 35 | -t $TYPE --sigma 20.0 --kernel gaussian \ 36 | --penalty 1e-6 --seed 12 2>&1 | tee -a $OUTFILE 37 | PYTHONPATH='..' python $PY_LAUNCHER -a $ALGO -M $M -e 20 -d $DSET \ 38 | -t $TYPE --sigma 20.0 --kernel gaussian \ 39 | --penalty 1e-6 --seed 13 2>&1 | tee -a $OUTFILE 40 | PYTHONPATH='..' python $PY_LAUNCHER -a $ALGO -M $M -e 20 -d $DSET \ 41 | -t $TYPE --sigma 20.0 --kernel gaussian \ 42 | --penalty 1e-6 --seed 14 2>&1 | tee -a $OUTFILE 43 | PYTHONPATH='..' python $PY_LAUNCHER -a $ALGO -M $M -e 20 -d $DSET \ 44 | -t $TYPE --sigma 20.0 --kernel gaussian \ 45 | --penalty 1e-6 --seed 15 2>&1 | tee -a $OUTFILE 46 | PYTHONPATH='..' python $PY_LAUNCHER -a $ALGO -M $M -e 20 -d $DSET \ 47 | -t $TYPE --sigma 20.0 --kernel gaussian \ 48 | --penalty 1e-6 --seed 16 2>&1 | tee -a $OUTFILE 49 | conda deactivate 50 | fi 51 | 52 | # GPytorch 53 | if [ true = false ]; then 54 | ALGO="gpytorch-reg" 55 | M=1000 56 | VAR="diag" 57 | OUTFILE="logs/${DSET}_${ALGO}_${M}_${VAR}.txt" 58 | conda activate torch 59 | PYTHONPATH='..' python $PY_LAUNCHER -a $ALGO -d $DSET -M $M --batch-size 4096 \ 60 | --var-dist $VAR --lr 0.01 --sigma 6 -e 100 2>&1 | tee -a $OUTFILE 61 | conda deactivate 62 | fi 63 | 64 | # GPFlow 65 | if [ true = false ]; then 66 | ALGO="gpflow-reg" 67 | M=100 68 | VAR="diag" 69 | OUTFILE="logs/${DSET}_${ALGO}_${M}_${VAR}.txt" 70 | conda activate gpflow 71 | echo "Running ${ALGO} on ${DSET} data, log will be saved in ${OUTFILE}" 72 | PYTHONPATH='..' python $PY_LAUNCHER -a $ALGO -d $DSET -M $M \ 73 | --var-dist ${VAR} --sigma 5 --batch-size 1024 --learn-hyperparams \ 74 | --lr 0.005 --natgrad-lr 0.000 --epochs 2000 --error-every 10 \ 75 | --seed 12 2>&1 | tee -a $OUTFILE 76 | conda deactivate 77 | echo "${ALGO} on ${DSET} data complete..." 
# Run one hyperparameter-optimization experiment with simple_hopt.py.
#
# Positional arguments:
#   $1 initial sigma          $2 initial penalty     $3 learning rate
#   $4 number of centers      $5 dataset name        $6 experiment name suffix
#   $7 validation percentage  $8 model name
#
# The number of epochs is read from the global NUM_EPOCHS, which must be set
# before this function is called.
function run_exp () {
    local SIG_INIT=$1
    local PEN_INIT=$2
    local LR=$3
    local M=$4
    local DATASET=$5
    local ENAME=$6
    local VAL_PCT=$7
    local MODEL=$8
    # Every expansion is quoted so that a value containing whitespace or glob
    # characters is passed to python as a single, unmodified argument.
    PYTHONPATH=.. python hgrad_benchmarks/simple_hopt.py \
        --seed 12319 \
        --cg-tol 1e-3 \
        --val-pct "$VAL_PCT" \
        --sigma-type single \
        --sigma-init "$SIG_INIT" \
        --penalty-init "$PEN_INIT" \
        --lr "$LR" \
        --epochs "$NUM_EPOCHS" \
        --op \
        --os \
        --num-centers "$M" \
        --dataset "$DATASET" \
        --model "$MODEL" \
        --cuda \
        --name "${DATASET}_hopt_${MODEL}_${ENAME}"
}

# Run the same experiment (arguments $1..$7 as for run_exp) once per model.
# The "hgrad-ift" model is not part of the list.
function run_exp_all_models () {
    local model
    for model in loocv sgpr gcv hgrad-closed creg-nopenfit creg-penfit; do
        run_exp "$1" "$2" "$3" "$4" "$5" "$6" "$7" "$model"
    done
}
def run_lauum_exp(exp_name, fn, exp_sizes, num_reps, is_torch, dtype):
    """Time ``fn`` on random square matrices of several sizes.

    For each size in ``exp_sizes`` one random ``num_pts x num_pts`` matrix is
    generated with a fixed seed, and ``fn`` is called ``num_reps`` times, each
    time on a fresh copy of that matrix. Only the call to ``fn`` is timed.

    Parameters
    ----------
    exp_name
        Label printed in the per-repetition progress message.
    fn
        Callable under test; it receives the matrix copy as its only argument.
    exp_sizes
        Iterable of matrix side lengths.
    num_reps
        Number of timed repetitions per size.
    is_torch
        If true, ``fn`` receives a torch tensor built from a C-ordered copy and
        the CUDA cache is emptied after each repetition; otherwise it receives
        a Fortran-ordered numpy copy.
    dtype
        Data type forwarded to ``gen_random``.

    Returns
    -------
    list
        The minimum elapsed time (seconds) over the repetitions, one entry per
        size, in the order of ``exp_sizes``.
    """
    timings = []
    for num_pts in exp_sizes:
        A = gen_random(num_pts, num_pts, dtype=dtype, F=True, seed=123)

        rep_times = []
        for j in range(num_reps):
            # Copy on every repetition so each call sees the same input data.
            if is_torch:
                Ac = torch.from_numpy(A.copy(order="C"))
            else:
                Ac = A.copy(order="F")
            # perf_counter is monotonic and high-resolution, unlike time.time()
            # which can jump if the system clock is adjusted mid-measurement.
            t_s = time.perf_counter()
            fn(Ac)
            t_e = time.perf_counter()
            rep_times.append(t_e - t_s)
            print(f"Exp {exp_name} - N {num_pts} - Rep {j} - {rep_times[-1]:.2f}s", flush=True)
            # Drop the copy before the next repetition allocates a new one.
            del Ac
            if is_torch:
                torch.cuda.empty_cache()
        timings.append(min(rep_times))
    return timings
def gen_random(a, b, dtype, F=False, seed=0):
    """Return a reproducible ``(a, b)`` matrix of uniform samples in [0, 1).

    Parameters
    ----------
    a, b
        Number of rows and columns of the result.
    dtype
        Floating-point dtype forwarded to ``Generator.random``.
    F
        If true the result is Fortran-ordered (column-major), otherwise
        C-ordered.
    seed
        Seed for ``numpy.random.default_rng``.
    """
    rng = np.random.default_rng(seed)
    if F:
        # Sample the transposed shape and return its transpose: the view is
        # column-major AND has the requested (a, b) shape. Transposing an
        # (a, b) sample instead would silently hand back a (b, a) matrix.
        # For square matrices the values are the same either way.
        return rng.random(size=(b, a), dtype=dtype).T
    return rng.random(size=(a, b), dtype=dtype)
def gen_random(a, b, dtype, F=False, seed=0):
    """Return a reproducible ``(a, b)`` matrix of uniform samples in [0, 1).

    Parameters
    ----------
    a, b
        Number of rows and columns of the result.
    dtype
        Floating-point dtype forwarded to ``Generator.random``.
    F
        If true the result is Fortran-ordered (column-major), otherwise
        C-ordered.
    seed
        Seed for ``numpy.random.default_rng``.
    """
    rng = np.random.default_rng(seed)
    if F:
        # Sample the transposed shape and return its transpose: the view is
        # column-major AND has the requested (a, b) shape. Transposing an
        # (a, b) sample instead would silently hand back a (b, a) matrix.
        # For square matrices the values are the same either way.
        return rng.random(size=(b, a), dtype=dtype).T
    return rng.random(size=(a, b), dtype=dtype)


def gen_random_pd(t, dtype, F=False, seed=0):
    """Return a symmetric ``(t, t)`` matrix with a boosted diagonal.

    The matrix is ``R + R.T`` for a uniform random ``R``, with ``t`` added to
    every diagonal entry. The shift is meant to make the matrix positive
    definite so it can be Cholesky-factorised; this is not checked here.
    """
    A = gen_random(t, t, dtype, F, seed)
    A = A + A.T
    # A stride of t + 1 over the flattened matrix walks the main diagonal.
    A.flat[:: t + 1] += t
    return A


def run_potrf_exp(exp_name, fn, exp_sizes, num_reps, is_torch, dtype):
    """Time ``fn`` on random symmetric matrices and keep the best run per size.

    Parameters
    ----------
    exp_name
        Label used in the progress messages.
    fn
        Callable under test; it is called with the matrix as only argument.
    exp_sizes
        Iterable of matrix side lengths.
    num_reps
        Number of timed repetitions per size (at least 1).
    is_torch
        If true ``fn`` receives a ``torch.Tensor`` wrapping a Fortran-ordered
        copy and the CUDA cache is emptied after every repetition; otherwise
        it receives the Fortran-ordered numpy copy itself.
    dtype
        Numpy floating-point dtype of the generated matrices.

    Returns
    -------
    list of float
        The minimum elapsed time, in seconds, for each entry of ``exp_sizes``.

    Raises
    ------
    ValueError
        If no repetition was run for a size (``num_reps`` < 1).
    """
    timings = []
    for num_pts in exp_sizes:
        A = gen_random_pd(num_pts, dtype, F=True, seed=192)

        rep_times = []
        for j in range(num_reps):
            # Fresh copy for every repetition: `fn` may overwrite its input.
            if is_torch:
                Ac = torch.from_numpy(A.copy(order="F"))
            else:
                Ac = A.copy(order="F")
            # perf_counter is monotonic and high resolution; time.time() can
            # jump when the system clock is adjusted mid-benchmark.
            t_s = time.perf_counter()
            fn(Ac)
            t_e = time.perf_counter()
            rep_times.append(t_e - t_s)
            print("Exp %s - N %d - Rep %d - %.2fs" % (exp_name, num_pts, j, rep_times[-1]), flush=True)
            del Ac
            if is_torch:
                torch.cuda.empty_cache()
        if not rep_times:
            raise ValueError("num_reps must be at least 1, got %r" % (num_reps,))
        timings.append(min(rep_times))
    return timings
chol_par_blk_multiplier=2, compute_arch_speed=False), 90 | ), 91 | }, 92 | { 93 | "exp_name": "CPU 32", 94 | "exp_sizes": defaultN32, 95 | "dtype": np.float32, 96 | "num_reps": 3, 97 | "is_torch": False, 98 | "fn": functools.partial(spotrf, lower=True, clean=False, overwrite_a=True), 99 | }, 100 | { 101 | "exp_name": "CPU 64", 102 | "exp_sizes": defaultN64, 103 | "dtype": np.float64, 104 | "num_reps": 2, 105 | "is_torch": False, 106 | "fn": functools.partial(dpotrf, lower=True, clean=False, overwrite_a=True), 107 | }, 108 | ] 109 | for exp in experiments: 110 | exp_times = run_potrf_exp(**exp) 111 | exp["timings"] = exp_times 112 | with open("logs/potrf_timings_%dGPU.json" % (num_gpu), "w") as fh: 113 | json.dump(experiments, fh) 114 | -------------------------------------------------------------------------------- /falkon/benchmarks/timing_benchmarks/potrf_timings.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ ! -d logs ]; then 3 | mkdir logs 4 | fi 5 | 6 | 7 | echo "Running with 1 GPU" 8 | export CUDA_VISIBLE_DEVICES="0" 9 | python potrf_timings.py 2>&1 | tee -a "logs/potrf_timings_1GPU.txt" 10 | 11 | echo "Running with 2 GPUs" 12 | export CUDA_VISIBLE_DEVICES="0,1" 13 | python potrf_timings.py 2>&1 | tee -a "logs/potrf_timings_2GPU.txt" 14 | 15 | exit 0; 16 | 17 | echo "Running with 3 GPUs" 18 | export CUDA_VISIBLE_DEVICES="0,1,2" 19 | python potrf_timings.py 2>&1 | tee -a "logs/potrf_timings_3GPU.txt" 20 | 21 | echo "Running with 4 GPUs" 22 | export CUDA_VISIBLE_DEVICES="0,1,2,3" 23 | python potrf_timings.py 2>&1 | tee -a "logs/potrf_timings_4GPU.txt" 24 | 25 | echo "Running with 5 GPUs" 26 | export CUDA_VISIBLE_DEVICES="0,1,2,3,4" 27 | python potrf_timings.py 2>&1 | tee -a "logs/potrf_timings_5GPU.txt" 28 | -------------------------------------------------------------------------------- /falkon/benchmarks/timing_benchmarks/time_improvements.py: 
def run(exp_num, dset, show_intermediate_errors: bool = False):
    """Fit Falkon on ``dset`` under one of the five benchmark configurations.

    ``exp_num`` (1 to 5) selects the preconditioner location, the KeOps
    setting and the floating-point precision; any other value raises
    ``ValueError``. The total wall-clock time and the model's ``fit_times_``
    are printed at the end.
    """
    opt = falkon.FalkonOptions(
        debug=True,
        pc_epsilon_32=1e-6,
        pc_epsilon_64=1e-13,
        compute_arch_speed=False,
        num_fmm_streams=2,
        no_single_kernel=False,
    )
    params = {
        "seed": 12,
        "kernel": kernels.GaussianKernel(3.8),
        "penalty": 1e-7,
        "M": 100_000,
        "maxiter": 10,
    }
    # (experiment number, cpu_preconditioner, keops_active, data type).
    # Experiments 3 and 4 use the same options here.
    configurations = (
        (1, True, "no", DataType.float64),
        (2, True, "no", DataType.float32),
        (3, False, "no", DataType.float32),
        (4, False, "no", DataType.float32),
        (5, False, "force", DataType.float32),
    )
    selected = None
    for number, cpu_pc, keops, precision in configurations:
        if exp_num == number:
            selected = (cpu_pc, keops, precision)
            break
    if selected is None:
        raise ValueError("exp num %d not valid" % (exp_num))
    cpu_pc, keops, dtype = selected
    opt = dataclasses.replace(opt, cpu_preconditioner=cpu_pc, keops_active=keops)

    data = load_data(dset, data_type=dtype)
    torch.cuda.init()
    print("\n\n --- Running Experiment %d -- %s" % (exp_num, opt))
    data = list(data)
    # Pin the first four entries (passed below as Xtr, Ytr, Xts, Yts);
    # the fifth entry holds keyword arguments for the error function.
    for position in range(4):
        data[position] = data[position].pin_memory()
    started = time.time()
    flk = run_single(dset, data[0], data[1], data[2], data[3], data[4], show_intermediate_errors, opt, params)
    finished = time.time()
    print("Timing for Experiment %d: %s -- fit times %s" % (exp_num, finished - started, flk.fit_times_))


def load_data(dset, data_type):
    """Load ``dset`` as torch tensors in the precision given by ``data_type``."""
    loader = get_load_fn(dset)
    numpy_dtype = data_type.to_numpy_dtype()
    return loader(dtype=numpy_dtype, as_torch=True)


def run_single(dset, Xtr, Ytr, Xts, Yts, kwargs, intermediate_errors, opt, params):
    """Build a Falkon estimator, fit it on the training data and return it.

    The first error function registered for ``dset`` (with ``kwargs`` bound)
    is used as ``error_fn``; it is evaluated at every iteration when
    ``intermediate_errors`` is true. The test split is passed to ``fit``.
    """
    bound_error_fns = [functools.partial(metric, **kwargs) for metric in get_err_fns(dset)]
    if intermediate_errors:
        error_every = 1
    else:
        error_every = None

    estimator = falkon.Falkon(error_fn=bound_error_fns[0], error_every=error_every, options=opt, **params)
    estimator.fit(Xtr, Ytr, Xts, Yts)
    return estimator
-d logs ]; then 5 | mkdir logs 6 | fi 7 | 8 | # Prepare conda 9 | CONDA_BASE=$(conda info --base) 10 | source $CONDA_BASE/etc/profile.d/conda.sh 11 | 12 | # Common variables 13 | DSET="higgs" 14 | PY_LAUNCHER="time_improvements.py" 15 | LOG_FILE="logs/time_improvements_${DSET}.log" 16 | 17 | conda activate torch 18 | 19 | export CUDA_VISIBLE_DEVICES="0" 20 | PYTHONPATH='..' python $PY_LAUNCHER --exp-num 1 --dataset $DSET 2>&1 | tee -a $LOG_FILE 21 | export CUDA_VISIBLE_DEVICES="0" 22 | PYTHONPATH='..' python $PY_LAUNCHER --exp-num 2 --dataset $DSET 2>&1 | tee -a $LOG_FILE 23 | export CUDA_VISIBLE_DEVICES="0" 24 | PYTHONPATH='..' python $PY_LAUNCHER --exp-num 3 --dataset $DSET 2>&1 | tee -a $LOG_FILE 25 | export CUDA_VISIBLE_DEVICES="0,1" 26 | PYTHONPATH='..' python $PY_LAUNCHER --exp-num 4 --dataset $DSET 2>&1 | tee -a $LOG_FILE 27 | export CUDA_VISIBLE_DEVICES="0,1" 28 | PYTHONPATH='..' python $PY_LAUNCHER --exp-num 5 --dataset $DSET 2>&1 | tee -a $LOG_FILE 29 | 30 | conda deactivate -------------------------------------------------------------------------------- /falkon/c_ext/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Taken from nerfacc (https://github.com/KAIR-BAIR/nerfacc) (MIT Licence) 3 | 4 | Copyright (c) 2022 Ruilong Li, UC Berkeley. 
5 | Copyright (c) 2023 Giacomo Meanti 6 | """ 7 | 8 | from typing import Callable 9 | 10 | import torch 11 | 12 | 13 | def _make_lazy_cuda_func(name: str) -> Callable: 14 | def call_cuda(*args, **kwargs): 15 | from ._backend import _assert_has_ext 16 | 17 | _assert_has_ext() 18 | return getattr(torch.ops.falkon, name)(*args, **kwargs) 19 | 20 | return call_cuda 21 | 22 | 23 | # Custom la functions 24 | parallel_potrf = _make_lazy_cuda_func("parallel_potrf") 25 | lauum_cuda = _make_lazy_cuda_func("lauum") 26 | 27 | # Triangular helpers 28 | copy_triang = _make_lazy_cuda_func("copy_triang") 29 | mul_triang = _make_lazy_cuda_func("mul_triang") 30 | copy_transpose = _make_lazy_cuda_func("copy_transpose") 31 | vec_mul_triang = _make_lazy_cuda_func("vec_mul_triang") 32 | 33 | # Sparse matrices 34 | spspmm = _make_lazy_cuda_func("spspmm") 35 | csr2dense = _make_lazy_cuda_func("csr2dense") 36 | sparse_row_norm_sq = _make_lazy_cuda_func("sparse_square_norm") 37 | sparse_row_norm = _make_lazy_cuda_func("sparse_norm") 38 | sparse_bdot = _make_lazy_cuda_func("sparse_bdot") 39 | 40 | # Square norm with autograd 41 | square_norm = _make_lazy_cuda_func("square_norm") 42 | 43 | # Wrappers 44 | cublas_2d_copy_to_dev_async = _make_lazy_cuda_func("cublas_2d_copy_to_dev_async") 45 | cublas_2d_copy_to_dev = _make_lazy_cuda_func("cublas_2d_copy_to_dev") 46 | cublas_2d_copy_to_host_async = _make_lazy_cuda_func("cublas_2d_copy_to_host_async") 47 | cublas_2d_copy_to_host = _make_lazy_cuda_func("cublas_2d_copy_to_host") 48 | cuda_2d_copy_async = _make_lazy_cuda_func("cuda_2d_copy_async") 49 | cuda_2d_copy = _make_lazy_cuda_func("cuda_2d_copy") 50 | cuda_1d_copy_async = _make_lazy_cuda_func("cuda_1d_copy_async") 51 | cuda_1d_copy = _make_lazy_cuda_func("cuda_1d_copy") 52 | mem_get_info = _make_lazy_cuda_func("mem_get_info") 53 | cusolver_potrf_buffer_size = _make_lazy_cuda_func("cusolver_potrf_buffer_size") 54 | cusolver_potrf = _make_lazy_cuda_func("cusolver_potrf") 55 | potrf = 
_make_lazy_cuda_func("potrf") 56 | cublas_trsm = _make_lazy_cuda_func("cublas_trsm") 57 | cublas_trmm = _make_lazy_cuda_func("cublas_trmm") 58 | cublas_gemm = _make_lazy_cuda_func("cublas_gemm") 59 | cublas_syrk = _make_lazy_cuda_func("cublas_syrk") 60 | cuda_version = _make_lazy_cuda_func("_cuda_version") 61 | -------------------------------------------------------------------------------- /falkon/c_ext/falkon.cpp: -------------------------------------------------------------------------------- 1 | #include "falkon.h" 2 | 3 | #include 4 | 5 | #ifdef WITH_CUDA 6 | #include 7 | #endif 8 | 9 | namespace falkon { 10 | int64_t cuda_version() { 11 | #ifdef WITH_CUDA 12 | return CUDA_VERSION; 13 | #else 14 | return -1; 15 | #endif 16 | } 17 | 18 | TORCH_LIBRARY_FRAGMENT(falkon, m) { 19 | m.def("_cuda_version", &cuda_version); 20 | } 21 | } // namespace falkon 22 | -------------------------------------------------------------------------------- /falkon/c_ext/falkon.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace falkon { 6 | int64_t cuda_version(); 7 | 8 | } // namespace falkon 9 | 10 | 11 | /* 12 | * schema help: https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/README.md 13 | */ 14 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/autograd/ag_square_norm.cpp: -------------------------------------------------------------------------------- 1 | #include "../square_norm.h" 2 | 3 | #include 4 | #include 5 | 6 | namespace falkon { 7 | namespace ops { 8 | namespace { 9 | 10 | class SquareNormFunction 11 | : public torch::autograd::Function { 12 | public: 13 | static torch::autograd::variable_list forward( 14 | torch::autograd::AutogradContext *ctx, 15 | const torch::autograd::Variable& input, 16 | int64_t dim, 17 | bool keepdim) { 18 | at::AutoDispatchBelowADInplaceOrView g; 19 | auto output = square_norm(input, dim, 
keepdim); 20 | 21 | ctx->save_for_backward({input}); 22 | ctx->saved_data["dim"] = dim; 23 | ctx->saved_data["keepdim"] = keepdim; 24 | 25 | return { 26 | output, 27 | }; 28 | } 29 | static torch::autograd::variable_list backward( 30 | torch::autograd::AutogradContext* ctx, 31 | const torch::autograd::variable_list& grad_output) { 32 | auto input = ctx->get_saved_variables()[0]; 33 | 34 | auto dim = ctx->saved_data["dim"].toInt(); 35 | auto keepdim = ctx->saved_data["keepdim"].toBool(); 36 | 37 | auto grad_out = grad_output[0]; 38 | 39 | if (!keepdim) { 40 | grad_out = grad_out.unsqueeze(dim); 41 | } 42 | auto grad_input = input * 2; 43 | grad_input.mul_(grad_out); 44 | 45 | return { 46 | grad_input, 47 | torch::autograd::Variable(), 48 | torch::autograd::Variable() 49 | }; 50 | } 51 | }; 52 | 53 | at::Tensor square_norm_autograd( 54 | const at::Tensor& input, 55 | int64_t dim, 56 | bool keepdim) { 57 | return SquareNormFunction::apply(input, dim, keepdim)[0]; 58 | } 59 | 60 | } // namespace 61 | 62 | TORCH_LIBRARY_IMPL(falkon, Autograd, m) { 63 | m.impl( 64 | TORCH_SELECTIVE_NAME("falkon::square_norm"), 65 | TORCH_FN(square_norm_autograd)); 66 | } 67 | 68 | } // namespace ops 69 | } // namespace falkon 70 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/copy_transpose.cpp: -------------------------------------------------------------------------------- 1 | #include "copy_transpose.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace falkon { 8 | namespace ops { 9 | 10 | at::Tensor copy_transpose( 11 | const at::Tensor &self, 12 | at::Tensor &out) { 13 | static auto op = c10::Dispatcher::singleton() 14 | .findSchemaOrThrow("falkon::copy_transpose", "") 15 | .typed(); 16 | at::AutoDispatchBelowAutograd guard; 17 | at::tracer::impl::NoTracerDispatchMode tracer_guard; 18 | return op.call( 19 | self, 20 | out 21 | ); 22 | } 23 | 24 | TORCH_LIBRARY_FRAGMENT(falkon, m) { 25 | m.def(TORCH_SELECTIVE_SCHEMA( 26 
| "falkon::copy_transpose(Tensor self, Tensor(a!) out) -> Tensor(a!)")); 27 | } 28 | 29 | } // namespace ops 30 | } // namespace falkon 31 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/copy_transpose.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace falkon { 6 | namespace ops { 7 | 8 | at::Tensor copy_transpose( 9 | const at::Tensor &self, 10 | at::Tensor &out); 11 | 12 | } // namespace ops 13 | } // namespace falkon 14 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/copy_triang.cpp: -------------------------------------------------------------------------------- 1 | #include "copy_triang.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace falkon { 8 | namespace ops { 9 | 10 | at::Tensor copy_triang( 11 | at::Tensor &self, 12 | const bool upper) { 13 | static auto op = c10::Dispatcher::singleton() 14 | .findSchemaOrThrow("falkon::copy_triang", "") 15 | .typed(); 16 | at::AutoDispatchBelowAutograd guard; 17 | at::tracer::impl::NoTracerDispatchMode tracer_guard; 18 | return op.call( 19 | self, 20 | upper 21 | ); 22 | } 23 | 24 | TORCH_LIBRARY_FRAGMENT(falkon, m) { 25 | m.def(TORCH_SELECTIVE_SCHEMA( 26 | "falkon::copy_triang(Tensor(a!) 
self, bool upper) -> Tensor(a!)")); 27 | } 28 | 29 | } // namespace ops 30 | } // namespace falkon 31 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/copy_triang.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace falkon { 6 | namespace ops { 7 | 8 | at::Tensor copy_triang( 9 | at::Tensor &self, 10 | const bool upper); 11 | 12 | } // namespace ops 13 | } // namespace falkon 14 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/cpu/cpu_copy_triang.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "../helpers.h" 6 | 7 | namespace falkon { 8 | namespace ops { 9 | namespace { 10 | 11 | template 12 | void copy_triang_impl(scalar_t *mat, const int n, const int stride1, const int stride2, const bool upper) { 13 | // assume input is f-contiguous (contiguous columns, stride1 == 1) 14 | if (upper) { 15 | at::parallel_for(0, n, 0, [&](int64_t start, int64_t end) { 16 | for (int64_t i : c10::irange(start, end)) { 17 | for (int64_t j = 0; j < i; j++) { 18 | // mat[i, j] = mat[j, i] 19 | mat[i * stride1 + j * stride2] = mat[j * stride1 + i * stride2]; 20 | } 21 | } 22 | }); 23 | } else { 24 | at::parallel_for(0, n, 0, [&](int64_t start, int64_t end) { 25 | for (int64_t i : c10::irange(start, end)) { 26 | for (int64_t j = i + 1; j < n; j++) { 27 | // mat[i, j] = mat[j, i] 28 | mat[i * stride1 + j * stride2] = mat[j * stride1 + i * stride2]; 29 | } 30 | } 31 | }); 32 | } 33 | } 34 | 35 | at::Tensor copy_triang_kernel( 36 | at::Tensor &mat, 37 | const bool upper) { 38 | AT_ASSERTM(mat.dim() == 2, "Input matrix must be 2D"); 39 | const int64_t n = mat.size(0); 40 | const int64_t m = mat.size(1); 41 | TORCH_CHECK( 42 | (n == m), 43 | "Input matrix must be square. 
Found shape: (", 44 | n, 45 | ", ", 46 | m, 47 | ")"); 48 | int64_t row_stride = mat.stride(0); 49 | int64_t col_stride = mat.stride(1); 50 | TORCH_CHECK( 51 | (row_stride == 1 || col_stride == 1), 52 | "Input must be contiguous in one dimension. Found strides: (", 53 | row_stride, 54 | ", ", 55 | col_stride, 56 | ")"); 57 | 58 | bool bupper = upper; 59 | if (!is_fortran_contig(mat)) { 60 | bupper = !upper; 61 | int64_t tmp = row_stride; 62 | row_stride = col_stride; 63 | col_stride = tmp; 64 | } 65 | AT_DISPATCH_FLOATING_TYPES(mat.scalar_type(), "copy_triang", [&] { 66 | copy_triang_impl( 67 | mat.data_ptr(), 68 | n, 69 | row_stride, 70 | col_stride, 71 | bupper 72 | ); 73 | }); 74 | return mat; 75 | } 76 | 77 | } // namespace 78 | 79 | TORCH_LIBRARY_IMPL(falkon, CPU, m) { 80 | m.impl( 81 | TORCH_SELECTIVE_NAME("falkon::copy_triang"), 82 | TORCH_FN(copy_triang_kernel)); 83 | } 84 | 85 | } // namespace ops 86 | } // namespace falkon 87 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/cpu/cpu_mul_triang.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | namespace falkon { 6 | namespace ops { 7 | namespace { 8 | 9 | // TODO: Parallelize 10 | template 11 | void mul_upper_diag( 12 | scalar_t *data, 13 | const int64_t size, 14 | const scalar_t mul, 15 | const int64_t row_stride, 16 | const int64_t col_stride, 17 | const bool preserve_diag) { 18 | const int diagonal_offset = preserve_diag ? 1 : 0; 19 | for (int64_t i = 0; i < size; i++) { 20 | for (int64_t j = i + diagonal_offset; j < size; j++) { 21 | data[i * row_stride + j * col_stride] *= mul; 22 | } 23 | } 24 | } 25 | 26 | template 27 | void mul_lower_diag( 28 | scalar_t *data, 29 | const int64_t size, 30 | const scalar_t mul, 31 | const int64_t row_stride, 32 | const int64_t col_stride, 33 | const bool preserve_diag) { 34 | const int diagonal_offset = preserve_diag ? 
-1 : 0; 35 | for (int64_t i = 0; i < size; i++) { 36 | for (int64_t j = 0; j <= (i + diagonal_offset); j++) { 37 | data[i * row_stride + j * col_stride] *= mul; 38 | } 39 | } 40 | } 41 | 42 | 43 | at::Tensor mul_triang_kernel( 44 | at::Tensor &mat, 45 | const double multiplier, 46 | const bool upper, 47 | const bool preserve_diag) { 48 | AT_ASSERTM(mat.dim() == 2, "mat must be 2D"); 49 | const int64_t n = mat.size(0); 50 | const int64_t m = mat.size(1); 51 | TORCH_CHECK( 52 | (n == m), 53 | "Input matrix must be square. Found shape: (", 54 | n, 55 | ", ", 56 | m, 57 | ")"); 58 | int64_t row_stride = mat.stride(0); 59 | int64_t col_stride = mat.stride(1); 60 | TORCH_CHECK( 61 | (row_stride == 1 || col_stride == 1), 62 | "Input must be contiguous in one dimension. Found strides: (", 63 | row_stride, 64 | ", ", 65 | col_stride, 66 | ")"); 67 | 68 | bool bupper = upper; 69 | if (row_stride == 1) { 70 | bupper = !upper; 71 | int64_t tmp_stride = row_stride; 72 | row_stride = col_stride; 73 | col_stride = tmp_stride; 74 | } 75 | 76 | AT_DISPATCH_FLOATING_TYPES(mat.scalar_type(), "mul_triang", [&] { 77 | const scalar_t mul = (scalar_t)multiplier; 78 | if (bupper) { 79 | mul_upper_diag( 80 | mat.data_ptr(), 81 | n, 82 | mul, 83 | row_stride, 84 | col_stride, 85 | preserve_diag); 86 | } else { 87 | mul_lower_diag( 88 | mat.data_ptr(), 89 | n, 90 | mul, 91 | row_stride, 92 | col_stride, 93 | preserve_diag); 94 | } 95 | }); 96 | return mat; 97 | } 98 | 99 | 100 | } // namespace 101 | 102 | TORCH_LIBRARY_IMPL(falkon, CPU, m) { 103 | m.impl( 104 | TORCH_SELECTIVE_NAME("falkon::mul_triang"), 105 | TORCH_FN(mul_triang_kernel)); 106 | } 107 | 108 | } // namespace ops 109 | } // namespace falkon 110 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/cpu/cpu_potrf.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "../helpers.h" 6 | #include 
"../mul_triang.h" 7 | 8 | namespace falkon { 9 | namespace ops { 10 | namespace { 11 | 12 | at::Tensor potrf_kernel( 13 | at::Tensor &mat, 14 | bool upper, 15 | bool clean, 16 | bool overwrite) { 17 | AT_ASSERTM(mat.dim() == 2, "Input matrix must be 2D"); 18 | const int64_t n = mat.size(0); 19 | const int64_t m = mat.size(1); 20 | TORCH_CHECK( 21 | (n == m), 22 | "Input matrix must be square. Found shape: (", 23 | n, 24 | ", ", 25 | m, 26 | ")"); 27 | int64_t row_stride = mat.stride(0); 28 | int64_t col_stride = mat.stride(1); 29 | TORCH_CHECK( 30 | (row_stride == 1 || col_stride == 1), 31 | "Input must be contiguous in one dimension. Found strides: (", 32 | row_stride, 33 | ", ", 34 | col_stride, 35 | ")"); 36 | 37 | char uplo; 38 | if (is_fortran_contig(mat)) { 39 | uplo = upper ? 'U' : 'L'; 40 | } else { 41 | uplo = upper ? 'L' : 'U'; 42 | } 43 | 44 | // Copy array if necessary 45 | if (!overwrite) { 46 | mat = mat.clone(); 47 | } 48 | 49 | int info = 0; 50 | 51 | AT_DISPATCH_FLOATING_TYPES(mat.scalar_type(), "copy_triang", [&] { 52 | at::native::lapackCholesky( 53 | uplo, 54 | n, 55 | mat.data_ptr(), 56 | row_stride == 1 ? col_stride : row_stride, 57 | &info); 58 | TORCH_CHECK( 59 | (info == 0), 60 | "LAPACK potrf failed with status ", 61 | info, 62 | ". 
Params: uplo ", 63 | uplo, 64 | ", rows ", 65 | n); 66 | // Clean non-factorized part of the matrix 67 | }); 68 | if (clean) { 69 | falkon::ops::mul_triang(mat, 0.0, !upper, true); 70 | } 71 | return mat; 72 | } 73 | 74 | 75 | } // namespace 76 | 77 | TORCH_LIBRARY_IMPL(falkon, CPU, m) { 78 | m.impl( 79 | TORCH_SELECTIVE_NAME("falkon::potrf"), 80 | TORCH_FN(potrf_kernel)); 81 | } 82 | 83 | } // namespace ops 84 | } // namespace falkon 85 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/cpu/cpu_square_norm.cpp: -------------------------------------------------------------------------------- 1 | #include "../helpers.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace falkon { 10 | namespace ops { 11 | namespace { 12 | 13 | template 14 | struct NormTwoSquareOpsCPU { 15 | inline C10_DEVICE acc_t reduce(acc_t acc, acc_t data, int64_t /*idx*/) const { 16 | return acc + data * data; 17 | } 18 | 19 | inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { 20 | return a + b; 21 | } 22 | 23 | inline C10_DEVICE acc_t project(acc_t a) const { 24 | return a; 25 | } 26 | 27 | static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { 28 | return acc; 29 | } 30 | }; 31 | 32 | template 33 | void square_vector_norm_impl(at::TensorIterator iter) { 34 | if (iter.numel() == 0) { 35 | iter.output().fill_(0); 36 | return; 37 | } 38 | at::native::binary_kernel_reduce(iter, NormTwoSquareOpsCPU(), (scalar_t)0.0); 39 | } 40 | 41 | at::Tensor square_norm_kernel(const at::Tensor &input, int64_t dim, bool keepdim) { 42 | at::IntArrayRef dimArr = at::IntArrayRef(dim); 43 | at::ScalarType in_dtype = input.scalar_type(); 44 | 45 | // Create the output tensor 46 | auto result_shape = shape_from_dim(input, dim, keepdim); 47 | at::Tensor result = at::empty(result_shape, input.options()); 48 | at::TensorIterator iter = at::native::make_reduction("vector_sqnorm", result, input, dimArr, keepdim, 
in_dtype); 49 | AT_DISPATCH_FLOATING_TYPES(iter.input_dtype(), "square_vector_norm_impl", [&] { 50 | square_vector_norm_impl(iter); 51 | }); 52 | return result; 53 | } 54 | 55 | } // namespace 56 | 57 | TORCH_LIBRARY_IMPL(falkon, CPU, m) { 58 | m.impl( 59 | TORCH_SELECTIVE_NAME("falkon::square_norm"), 60 | TORCH_FN(square_norm_kernel)); 61 | } 62 | 63 | } // namespace ops 64 | } // namespace falkon 65 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/csr2dense.cpp: -------------------------------------------------------------------------------- 1 | #include "csr2dense.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace falkon { 8 | namespace ops { 9 | 10 | at::Tensor csr2dense( 11 | const at::Tensor &rowptr, 12 | const at::Tensor &col, 13 | const at::Tensor &val, 14 | at::Tensor &out) { 15 | static auto op = c10::Dispatcher::singleton() 16 | .findSchemaOrThrow("falkon::csr2dense", "") 17 | .typed(); 18 | at::AutoDispatchBelowAutograd guard; 19 | at::tracer::impl::NoTracerDispatchMode tracer_guard; 20 | return op.call( 21 | rowptr, 22 | col, 23 | val, 24 | out 25 | ); 26 | } 27 | 28 | TORCH_LIBRARY_FRAGMENT(falkon, m) { 29 | m.def(TORCH_SELECTIVE_SCHEMA( 30 | "falkon::csr2dense(Tensor rowptr, Tensor col, Tensor val, Tensor(a!) 
out) -> Tensor(a!)")); 31 | } 32 | 33 | } // namespace ops 34 | } // namespace falkon 35 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/csr2dense.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace falkon { 6 | namespace ops { 7 | 8 | at::Tensor csr2dense( 9 | const at::Tensor &rowptr, 10 | const at::Tensor &col, 11 | const at::Tensor &val, 12 | at::Tensor &out); 13 | 14 | } // namespace ops 15 | } // namespace falkon 16 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/cublas_bindings.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace falkon { 6 | namespace ops { 7 | void cublas_2d_copy_to_dev_async ( 8 | const int64_t rows, 9 | const int64_t cols, 10 | const int64_t elemSize, 11 | const at::Tensor& host_tensor, 12 | const int64_t lda, 13 | at::Tensor& dev_tensor, 14 | const int64_t ldb); 15 | void cublas_2d_copy_to_dev ( 16 | const int64_t rows, 17 | const int64_t cols, 18 | const int64_t elemSize, 19 | const at::Tensor& host_tensor, 20 | const int64_t lda, 21 | at::Tensor& dev_tensor, 22 | const int64_t ldb); 23 | void cublas_2d_copy_to_host_async( 24 | const int64_t rows, 25 | const int64_t cols, 26 | const int64_t elemSize, 27 | const at::Tensor& dev_tensor, 28 | const int64_t lda, 29 | at::Tensor& host_tensor, 30 | const int64_t ldb); 31 | void cublas_2d_copy_to_host( 32 | const int64_t rows, 33 | const int64_t cols, 34 | const int64_t elemSize, 35 | const at::Tensor& dev_tensor, 36 | const int64_t lda, 37 | at::Tensor& host_tensor, 38 | const int64_t ldb); 39 | void cublas_trsm( 40 | const at::Tensor& A, 41 | at::Tensor& B, 42 | const at::Scalar& alpha, 43 | bool left, 44 | bool upper, 45 | bool transpose, 46 | bool unitriangular, 47 | int64_t m, 48 | int64_t n, 49 | int64_t lda, 50 | 
int64_t ldb); 51 | void cublas_trmm( 52 | const at::Tensor& A, 53 | const at::Tensor& B, 54 | at::Tensor& C, 55 | bool left, 56 | bool upper, 57 | bool transpose, 58 | bool unitriangular, 59 | const at::Scalar& alpha, 60 | int64_t m, 61 | int64_t n, 62 | int64_t lda, 63 | int64_t ldb, 64 | int64_t ldc); 65 | void cublas_gemm( 66 | const at::Tensor& A, 67 | int64_t lda, 68 | bool transa, 69 | const at::Tensor& B, 70 | int64_t ldb, 71 | bool transb, 72 | at::Tensor& C, 73 | int64_t ldc, 74 | int64_t m, 75 | int64_t n, 76 | int64_t k, 77 | const at::Scalar& alpha, 78 | const at::Scalar& beta); 79 | void cublas_syrk( 80 | const at::Tensor& A, 81 | int64_t lda, 82 | at::Tensor& C, 83 | int64_t ldc, 84 | const at::Scalar& alpha, 85 | const at::Scalar& beta, 86 | bool upper, 87 | bool transpose, 88 | int64_t n, 89 | int64_t k); 90 | } // namespace ops 91 | } // namespace falkon 92 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/cuda/cublas_bindings.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace falkon { 6 | namespace ops { 7 | 8 | template 9 | void trsm( 10 | cublasHandle_t cublas_handle, 11 | cublasSideMode_t side, 12 | cublasFillMode_t uplo, 13 | cublasOperation_t trans, 14 | cublasDiagType_t diag, 15 | int m, 16 | int n, 17 | const scalar_t *alpha, 18 | const scalar_t *A, 19 | int lda, 20 | scalar_t *B, 21 | int ldb); 22 | 23 | template 24 | void trmm( 25 | cublasHandle_t cublas_handle, 26 | cublasSideMode_t side, 27 | cublasFillMode_t uplo, 28 | cublasOperation_t trans, 29 | cublasDiagType_t diag, 30 | int m, 31 | int n, 32 | const scalar_t *alpha, 33 | const scalar_t *A, 34 | int lda, 35 | const scalar_t *B, 36 | int ldb, 37 | scalar_t *C, 38 | int ldc); 39 | 40 | template 41 | void gemm( 42 | cublasHandle_t cublas_handle, 43 | cublasOperation_t transa, 44 | cublasOperation_t transb, 45 | int m, 46 | int n, 47 | int k, 48 | 
const scalar_t *alpha, 49 | const scalar_t *A, 50 | int lda, 51 | const scalar_t *B, 52 | int ldb, 53 | const scalar_t *beta, 54 | scalar_t *C, 55 | int ldc); 56 | 57 | template 58 | void syrk( 59 | cublasHandle_t cublas_handle, 60 | cublasFillMode_t uplo, 61 | cublasOperation_t trans, 62 | int n, 63 | int k, 64 | const scalar_t *alpha, 65 | const scalar_t *A, 66 | int lda, 67 | const scalar_t *beta, 68 | scalar_t *C, 69 | int ldc); 70 | 71 | } // namespace ops 72 | } // namespace falkon 73 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/cuda/cuda_bindings.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "../helpers.h" 9 | 10 | namespace falkon { 11 | namespace ops { 12 | 13 | namespace { 14 | 15 | std::tuple mem_get_info(int64_t device_id) { 16 | const at::cuda::CUDAGuard device_guard(device_id); 17 | size_t free; 18 | size_t total; 19 | C10_CUDA_CHECK(cudaMemGetInfo(&free, &total)); 20 | return std::tuple{ 21 | (int64_t)free, 22 | (int64_t)total 23 | }; 24 | } 25 | 26 | void cuda_2d_copy_async( 27 | at::Tensor& dest_tensor, 28 | const int64_t dest_pitch, 29 | const at::Tensor& src_tensor, 30 | const int64_t src_pitch, 31 | const int64_t width, 32 | const int64_t height) { 33 | at::cuda::CUDAStream torch_stream = at::cuda::getCurrentCUDAStream(at::cuda::current_device()); 34 | C10_CUDA_CHECK(cudaMemcpy2DAsync( 35 | dest_tensor.data_ptr(), 36 | dest_pitch, 37 | src_tensor.data_ptr(), 38 | src_pitch, 39 | width, 40 | height, 41 | cudaMemcpyDefault, 42 | torch_stream.stream() 43 | )); 44 | } 45 | 46 | void cuda_2d_copy( 47 | at::Tensor& dest_tensor, 48 | const int64_t dest_pitch, 49 | const at::Tensor& src_tensor, 50 | const int64_t src_pitch, 51 | const int64_t width, 52 | const int64_t height) { 53 | C10_CUDA_CHECK(cudaMemcpy2D( 54 | dest_tensor.data_ptr(), 55 | dest_pitch, 56 | 
src_tensor.data_ptr(), 57 | src_pitch, 58 | width, 59 | height, 60 | cudaMemcpyDefault 61 | )); 62 | } 63 | 64 | void cuda_1d_copy_async( 65 | at::Tensor& dest_tensor, 66 | const at::Tensor &src_tensor, 67 | const int64_t count) { 68 | at::cuda::CUDAStream torch_stream = at::cuda::getCurrentCUDAStream(at::cuda::current_device()); 69 | C10_CUDA_CHECK(cudaMemcpyAsync( 70 | dest_tensor.data_ptr(), 71 | src_tensor.data_ptr(), 72 | count, 73 | cudaMemcpyDefault, 74 | torch_stream.stream() 75 | )); 76 | } 77 | 78 | void cuda_1d_copy( 79 | at::Tensor& dest_tensor, 80 | const at::Tensor &src_tensor, 81 | const int64_t count) { 82 | C10_CUDA_CHECK(cudaMemcpy( 83 | dest_tensor.data_ptr(), 84 | src_tensor.data_ptr(), 85 | count, 86 | cudaMemcpyDefault 87 | )); 88 | } 89 | 90 | } // namespace 91 | 92 | // registered as catch-all function since it has no tensor inputs, and dispatcher doesn't know what to do 93 | TORCH_LIBRARY_FRAGMENT(falkon, m) { 94 | m.def("falkon::mem_get_info", &mem_get_info); 95 | } 96 | 97 | TORCH_LIBRARY_IMPL(falkon, CUDA, m) { 98 | // m.impl( 99 | // TORCH_SELECTIVE_NAME("falkon::mem_get_info"), 100 | // TORCH_FN(mem_get_info)); 101 | m.impl( 102 | TORCH_SELECTIVE_NAME("falkon::cuda_2d_copy_async"), 103 | TORCH_FN(cuda_2d_copy_async)); 104 | m.impl( 105 | TORCH_SELECTIVE_NAME("falkon::cuda_2d_copy"), 106 | TORCH_FN(cuda_2d_copy)); 107 | m.impl( 108 | TORCH_SELECTIVE_NAME("falkon::cuda_1d_copy_async"), 109 | TORCH_FN(cuda_1d_copy_async)); 110 | m.impl( 111 | TORCH_SELECTIVE_NAME("falkon::cuda_1d_copy"), 112 | TORCH_FN(cuda_1d_copy)); 113 | } 114 | 115 | } // ops 116 | } // falkon 117 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/cuda/cuda_copy_triang.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "../helpers.h" 8 | 9 | 10 | namespace falkon { 11 | namespace ops { 12 | namespace { 13 
| 14 | #define NB 64 15 | 16 | /* 17 | Matrix is size * size (no support for different size than stride). 18 | Columns are contiguous. 19 | The size * size grid is subdivided into NB * size blocks (of rows). 20 | Each block has NB threads, so each thread copies one row into one 21 | column (transpose). 22 | Not a particularly efficient implementation! 23 | */ 24 | template 25 | __global__ void copy_simple_kernel_lower( 26 | scalar_t* __restrict__ data, 27 | const size_t size) { 28 | const int i = blockIdx.x * blockDim.x + threadIdx.x; 29 | if (i < size) { 30 | int col_pos = i * size; 31 | for (int row_pos = i; row_pos < i + i * size; row_pos += size) { 32 | data[col_pos] = data[row_pos]; 33 | col_pos++; 34 | } 35 | } 36 | } 37 | 38 | // Same as the _lower version, but we copy dataT to data instead! 39 | template 40 | __global__ void copy_simple_kernel_upper( 41 | scalar_t* __restrict__ data, 42 | const size_t size) { 43 | const int i = blockIdx.x * blockDim.x + threadIdx.x; 44 | if (i < size) { 45 | int col_pos = i * size; 46 | for (int row_pos = i; row_pos < i + i * size; row_pos += size) { 47 | data[row_pos] = data[col_pos]; 48 | col_pos++; 49 | } 50 | } 51 | } 52 | 53 | at::Tensor copy_triang_kernel( 54 | at::Tensor &A, 55 | const bool upper) { 56 | CHECK_CUDA(A); 57 | TORCH_CHECK(A.size(0) == A.size(1), "A must be a square 2D matrix."); 58 | 59 | // Transpose matrix, and flip upper if matrix is C-contiguous. 60 | const bool fContig = is_fortran_contig(A); 61 | if (!fContig) 62 | A = at::transpose(A, 0, 1); 63 | const bool bupper = fContig ? 
upper : !upper; 64 | const int64_t nx = A.size(0); 65 | const dim3 dimGrid(ceildiv(nx, NB)); 66 | const dim3 dimBlock(NB); 67 | 68 | AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "dispatch_copy_triang", [&] { 69 | at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream(); 70 | at::DeviceGuard g(A.device()); 71 | if (bupper) { 72 | copy_simple_kernel_upper<<>>(A.data_ptr(), nx); 73 | } else { 74 | copy_simple_kernel_lower<<>>(A.data_ptr(), nx); 75 | } 76 | }); 77 | 78 | if (!fContig) 79 | A = at::transpose(A, 0, 1); 80 | return A; 81 | } 82 | 83 | } // namespace 84 | 85 | TORCH_LIBRARY_IMPL(falkon, CUDA, m) { 86 | m.impl( 87 | TORCH_SELECTIVE_NAME("falkon::copy_triang"), 88 | TORCH_FN(copy_triang_kernel)); 89 | } 90 | 91 | } // namespace ops 92 | } // namespace falkon 93 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/cuda/cuda_helpers.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace falkon { 8 | namespace ops { 9 | 10 | #define FLK_CUSOLVER_CHECK(EXPR) \ 11 | do { \ 12 | cusolverStatus_t __err = EXPR; \ 13 | TORCH_CHECK(__err == CUSOLVER_STATUS_SUCCESS, \ 14 | "CUDA error: ", \ 15 | cusolverGetErrorString(__err), \ 16 | " when calling `" #EXPR "`"); \ 17 | } while (0) 18 | 19 | 20 | static const char* cusolverGetErrorString(cusolverStatus_t error) { 21 | if (error == CUSOLVER_STATUS_SUCCESS) { 22 | return "CUBLAS_STATUS_SUCCESS"; 23 | } 24 | if (error == CUSOLVER_STATUS_NOT_INITIALIZED) { 25 | return "CUSOLVER_STATUS_NOT_INITIALIZED"; 26 | } 27 | if (error == CUSOLVER_STATUS_ALLOC_FAILED) { 28 | return "CUSOLVER_STATUS_ALLOC_FAILED"; 29 | } 30 | if (error == CUSOLVER_STATUS_INVALID_VALUE) { 31 | return "CUSOLVER_STATUS_INVALID_VALUE"; 32 | } 33 | if (error == CUSOLVER_STATUS_ARCH_MISMATCH) { 34 | return "CUSOLVER_STATUS_ARCH_MISMATCH"; 35 | } 36 | if (error == CUSOLVER_STATUS_EXECUTION_FAILED) { 
37 | return "CUSOLVER_STATUS_EXECUTION_FAILED"; 38 | } 39 | if (error == CUSOLVER_STATUS_INTERNAL_ERROR) { 40 | return "CUSOLVER_STATUS_INTERNAL_ERROR"; 41 | } 42 | if (error == CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED) { 43 | return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; 44 | } 45 | return ""; 46 | } 47 | 48 | 49 | #define FLK_CUDABLAS_CHECK(EXPR) \ 50 | do { \ 51 | cublasStatus_t __err = EXPR; \ 52 | TORCH_CHECK(__err == CUBLAS_STATUS_SUCCESS, \ 53 | "CuBLAS error: ", \ 54 | cublasGetErrorString(__err), \ 55 | " when calling `" #EXPR "`"); \ 56 | } while (0) 57 | 58 | 59 | static const char* cublasGetErrorString(cublasStatus_t error) { 60 | if (error == CUBLAS_STATUS_SUCCESS) { 61 | return "CUBLAS_STATUS_SUCCESS"; 62 | } 63 | if (error == CUBLAS_STATUS_NOT_INITIALIZED) { 64 | return "CUBLAS_STATUS_NOT_INITIALIZED"; 65 | } 66 | if (error == CUBLAS_STATUS_ALLOC_FAILED) { 67 | return "CUBLAS_STATUS_ALLOC_FAILED"; 68 | } 69 | if (error == CUBLAS_STATUS_INVALID_VALUE) { 70 | return "CUBLAS_STATUS_INVALID_VALUE"; 71 | } 72 | if (error == CUBLAS_STATUS_ARCH_MISMATCH) { 73 | return "CUBLAS_STATUS_ARCH_MISMATCH"; 74 | } 75 | if (error == CUBLAS_STATUS_MAPPING_ERROR) { 76 | return "CUBLAS_STATUS_MAPPING_ERROR"; 77 | } 78 | if (error == CUBLAS_STATUS_EXECUTION_FAILED) { 79 | return "CUBLAS_STATUS_EXECUTION_FAILED"; 80 | } 81 | if (error == CUBLAS_STATUS_INTERNAL_ERROR) { 82 | return "CUBLAS_STATUS_INTERNAL_ERROR"; 83 | } 84 | if (error == CUBLAS_STATUS_NOT_SUPPORTED) { 85 | return "CUBLAS_STATUS_NOT_SUPPORTED"; 86 | } 87 | #ifdef CUBLAS_STATUS_LICENSE_ERROR 88 | if (error == CUBLAS_STATUS_LICENSE_ERROR) { 89 | return "CUBLAS_STATUS_LICENSE_ERROR"; 90 | } 91 | #endif 92 | return ""; 93 | } 94 | 95 | 96 | inline __device__ int2 tri_index_lower(const int linear_index) { 97 | const int row = (int)((-1 + sqrt((double)(8*linear_index + 1))) / 2.0); 98 | return make_int2( 99 | linear_index - row * (row + 1) / 2, 100 | row 101 | ); 102 | } 103 | 104 | 105 | inline __device__ 
int2 tri_index_upper(const int linear_index) { 106 | const int row = (int)((-1 + sqrt((double)(8*linear_index + 1))) / 2.0); 107 | return make_int2( 108 | row, 109 | linear_index - row * (row + 1) / 2 110 | ); 111 | } 112 | 113 | } // namespace ops 114 | } // namespace falkon 115 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/cuda/cuda_mul_triang.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "../helpers.h" 7 | 8 | namespace falkon { 9 | namespace ops { 10 | namespace { 11 | 12 | #define NB 64 13 | 14 | template 15 | __global__ void mul_upper_diag(scalar_t *data, const size_t size, const scalar_t mul) 16 | { 17 | const int i = blockIdx.x * blockDim.x + threadIdx.x; 18 | 19 | if (i < size) { 20 | data += i * size; 21 | const scalar_t *diag_stop = data + i; 22 | while (data <= diag_stop) { 23 | *data *= mul; 24 | data++; 25 | } 26 | } 27 | } 28 | 29 | template 30 | __global__ void mul_upper(scalar_t *data, const size_t size, const scalar_t mul) 31 | { 32 | const int i = blockIdx.x * blockDim.x + threadIdx.x; 33 | 34 | if (i < size) { 35 | data += i * size; 36 | const scalar_t *diag_stop = data + i; 37 | while (data < diag_stop) { 38 | *data *= mul; 39 | data++; 40 | } 41 | } 42 | } 43 | 44 | template 45 | __global__ void mul_lower_diag(scalar_t *data, const size_t size, const scalar_t mul) 46 | { 47 | const int i = blockIdx.x * blockDim.x + threadIdx.x; 48 | 49 | if (i < size) { 50 | data += i * size + i; 51 | const scalar_t *diag_stop = data + size - i; 52 | while (data < diag_stop) { 53 | *data *= mul; 54 | data++; 55 | } 56 | } 57 | } 58 | 59 | template 60 | __global__ void mul_lower(scalar_t* __restrict__ data, const size_t size, const scalar_t mul) 61 | { 62 | const int i = blockIdx.x * blockDim.x + threadIdx.x; 63 | 64 | if (i < size) { 65 | data += i * size + i; 66 | const scalar_t* diag_stop = data + 
size - i; 67 | data++; // Avoid touching the diagonal 68 | while (data < diag_stop) { 69 | *data *= mul; 70 | data++; 71 | } 72 | } 73 | } 74 | 75 | 76 | at::Tensor mul_triang_kernel(at::Tensor &mat, const double multiplier, const bool upper, const bool preserve_diag) { 77 | CHECK_CUDA(mat); 78 | TORCH_CHECK(mat.size(0) == mat.size(1), "Input matrix must be square."); 79 | 80 | const bool bupper = is_fortran_contig(mat) ? upper : !upper; 81 | const int64_t nx = mat.size(0); 82 | const dim3 dimGrid(ceildiv(nx, NB)); 83 | const dim3 dimBlock(NB); 84 | 85 | AT_DISPATCH_FLOATING_TYPES(mat.scalar_type(), "dispatch_mul_triang", [&] { 86 | const scalar_t mul = (scalar_t)multiplier; 87 | at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream(); 88 | at::DeviceGuard g(mat.device()); 89 | if (bupper && preserve_diag) { // U, preserve 90 | mul_upper<<>>(mat.data_ptr(), nx, mul); 91 | } else if (bupper) { // U, no-preserve 92 | mul_upper_diag<<>>(mat.data_ptr(), nx, mul); 93 | } else if (preserve_diag) { // L, preserve 94 | mul_lower<<>>(mat.data_ptr(), nx, mul); 95 | } else { // L, no-preserve 96 | mul_lower_diag<<>>(mat.data_ptr(), nx, mul); 97 | } 98 | }); 99 | return mat; 100 | } 101 | 102 | } // namespace 103 | 104 | TORCH_LIBRARY_IMPL(falkon, CUDA, m) { 105 | m.impl( 106 | TORCH_SELECTIVE_NAME("falkon::mul_triang"), 107 | TORCH_FN(mul_triang_kernel)); 108 | } 109 | 110 | } // namespace ops 111 | } // namespace falkon 112 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/cuda/cuda_square_norm.cu: -------------------------------------------------------------------------------- 1 | #include "../helpers.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | namespace falkon { 11 | namespace ops { 12 | namespace { 13 | 14 | template 15 | struct NormTwoSquareOps { 16 | inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data, int64_t /*idx*/) const { 17 | return acc + data * data; 18 
| } 19 | 20 | inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { 21 | return a + b; 22 | } 23 | 24 | inline C10_DEVICE acc_t project(acc_t a) const { 25 | return a; 26 | } 27 | 28 | static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { 29 | return acc; 30 | } 31 | 32 | #if defined(__CUDACC__) 33 | inline C10_DEVICE acc_t warp_shfl_down(acc_t acc, int offset) const { 34 | return WARP_SHFL_DOWN(acc, offset); 35 | } 36 | #endif 37 | }; 38 | 39 | template 40 | void square_vector_norm_impl(at::TensorIterator& iter) { 41 | if (iter.numel() == 0) { 42 | iter.output().fill_(0); 43 | return; 44 | } 45 | at::native::gpu_reduce_kernel(iter, NormTwoSquareOps(), 0); 46 | } 47 | 48 | at::Tensor square_norm_kernel(const at::Tensor& input, int64_t dim, bool keepdim) { 49 | at::IntArrayRef dimArr = at::IntArrayRef(dim); 50 | at::ScalarType in_dtype = input.scalar_type(); 51 | 52 | // Create the output tensor 53 | auto result_shape = shape_from_dim(input, dim, keepdim); 54 | at::Tensor result = at::empty(result_shape, input.options()); 55 | 56 | at::TensorIterator iter = at::native::make_reduction("vector_sqnorm", result, input, dimArr, keepdim, in_dtype); 57 | AT_DISPATCH_FLOATING_TYPES(iter.input_dtype(), "square_vector_norm_impl", [&] { 58 | square_vector_norm_impl(iter); 59 | }); 60 | return result; 61 | } 62 | 63 | } // namespace 64 | 65 | TORCH_LIBRARY_IMPL(falkon, CUDA, m) { 66 | m.impl( 67 | TORCH_SELECTIVE_NAME("falkon::square_norm"), 68 | TORCH_FN(square_norm_kernel)); 69 | } 70 | 71 | } // namespace ops 72 | } // namespace falkon 73 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/cuda/cusolver_bindings.cu: -------------------------------------------------------------------------------- 1 | #include "cusolver_bindings.h" 2 | #include "../helpers.h" 3 | #include "cuda_helpers.cuh" 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | 13 | namespace falkon { 
14 | namespace ops { 15 | 16 | /* POTRF Buffer Size */ 17 | template 18 | void potrf_buffersize( 19 | cusolverDnHandle_t handle, 20 | cublasFillMode_t uplo, 21 | int n, 22 | scalar_t* A, 23 | int lda, 24 | int* lwork) { 25 | throw std::invalid_argument("scalar_t"); 26 | } 27 | 28 | template <> 29 | void potrf_buffersize( 30 | cusolverDnHandle_t handle, 31 | cublasFillMode_t uplo, 32 | int n, 33 | float* A, 34 | int lda, 35 | int* lwork) { 36 | FLK_CUSOLVER_CHECK(cusolverDnSpotrf_bufferSize(handle, uplo, n, A, lda, lwork)); 37 | } 38 | 39 | template <> 40 | void potrf_buffersize( 41 | cusolverDnHandle_t handle, 42 | cublasFillMode_t uplo, 43 | int n, 44 | double* A, 45 | int lda, 46 | int* lwork) { 47 | FLK_CUSOLVER_CHECK(cusolverDnDpotrf_bufferSize(handle, uplo, n, A, lda, lwork)); 48 | } 49 | 50 | /* POTRF */ 51 | template 52 | void potrf( 53 | cusolverDnHandle_t handle, 54 | cublasFillMode_t uplo, 55 | int n, 56 | scalar_t* A, 57 | int lda, 58 | scalar_t* work, 59 | int lwork, 60 | int* info) { 61 | throw std::invalid_argument("scalar_t"); 62 | } 63 | template<> 64 | void potrf( 65 | cusolverDnHandle_t handle, 66 | cublasFillMode_t uplo, 67 | int n, 68 | float* A, 69 | int lda, 70 | float* work, 71 | int lwork, 72 | int* info) { 73 | FLK_CUSOLVER_CHECK(cusolverDnSpotrf(handle, uplo, n, A, lda, work, lwork, info)); 74 | } 75 | template<> 76 | void potrf( 77 | cusolverDnHandle_t handle, 78 | cublasFillMode_t uplo, 79 | int n, 80 | double* A, 81 | int lda, 82 | double* work, 83 | int lwork, 84 | int* info) { 85 | FLK_CUSOLVER_CHECK(cusolverDnDpotrf(handle, uplo, n, A, lda, work, lwork, info)); 86 | } 87 | 88 | namespace { 89 | 90 | int64_t cusolver_potrf_buffer_size( 91 | at::Tensor &A, 92 | bool upper, 93 | int64_t n, 94 | int64_t lda) { 95 | cublasFillMode_t uplo = upper ? 
CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER; 96 | int lwork; 97 | 98 | AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "potrf_buffer_size", [&]{ 99 | cusolverDnHandle_t handle = at::cuda::getCurrentCUDASolverDnHandle(); 100 | potrf_buffersize(handle, uplo, (int)n, A.data_ptr(), (int)lda, &lwork); 101 | }); 102 | return (int64_t)lwork; 103 | } 104 | 105 | void cusolver_potrf( 106 | at::Tensor& A, 107 | at::Tensor& workspace, 108 | at::Tensor& info, 109 | int64_t workspace_size, 110 | bool upper, 111 | int64_t n, 112 | int64_t lda) { 113 | cublasFillMode_t uplo = upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER; 114 | 115 | AT_DISPATCH_FLOATING_TYPES(A.scalar_type(), "potrf", [&]{ 116 | auto handle = at::cuda::getCurrentCUDASolverDnHandle(); 117 | auto A_data = A.data_ptr(); 118 | auto workspace_data = workspace.data_ptr(); 119 | potrf(handle, uplo, (int)n, A_data, (int)lda, workspace_data, (int)workspace_size, info.data_ptr()); 120 | }); 121 | } 122 | 123 | } // namespace 124 | 125 | TORCH_LIBRARY_IMPL(falkon, CUDA, m) { 126 | m.impl( 127 | TORCH_SELECTIVE_NAME("falkon::cusolver_potrf_buffer_size"), 128 | TORCH_FN(cusolver_potrf_buffer_size)); 129 | m.impl( 130 | TORCH_SELECTIVE_NAME("falkon::cusolver_potrf"), 131 | TORCH_FN(cusolver_potrf)); 132 | } 133 | 134 | } // namespace ops 135 | } // namespace falkon 136 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/cuda/cusolver_bindings.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace falkon { 7 | namespace ops { 8 | 9 | template 10 | void potrf_buffersize( 11 | cusolverDnHandle_t handle, 12 | cublasFillMode_t uplo, 13 | int n, 14 | scalar_t* A, 15 | int lda, 16 | int* lwork); 17 | 18 | template 19 | void potrf( 20 | cusolverDnHandle_t handle, 21 | cublasFillMode_t uplo, 22 | int n, 23 | scalar_t* A, 24 | int lda, 25 | scalar_t* work, 26 | int lwork, 27 | int* 
info); 28 | 29 | } // namespace ops 30 | } // namespace falkon 31 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/cuda_bindings.cpp: -------------------------------------------------------------------------------- 1 | #include "cuda_bindings.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace falkon { 8 | namespace ops { 9 | 10 | void cuda_2d_copy_async( 11 | at::Tensor& dest_tensor, 12 | const int64_t dest_pitch, 13 | const at::Tensor& src_tensor, 14 | const int64_t src_pitch, 15 | const int64_t width, 16 | const int64_t height) { 17 | static auto op = c10::Dispatcher::singleton() 18 | .findSchemaOrThrow("falkon::cuda_2d_copy_async", "") 19 | .typed(); 20 | at::AutoDispatchBelowAutograd guard; 21 | at::tracer::impl::NoTracerDispatchMode tracer_guard; 22 | return op.call( 23 | dest_tensor, 24 | dest_pitch, 25 | src_tensor, 26 | src_pitch, 27 | width, 28 | height 29 | ); 30 | } 31 | void cuda_2d_copy( 32 | at::Tensor& dest_tensor, 33 | const int64_t dest_pitch, 34 | const at::Tensor& src_tensor, 35 | const int64_t src_pitch, 36 | const int64_t width, 37 | const int64_t height) { 38 | static auto op = c10::Dispatcher::singleton() 39 | .findSchemaOrThrow("falkon::cuda_2d_copy", "") 40 | .typed(); 41 | at::AutoDispatchBelowAutograd guard; 42 | at::tracer::impl::NoTracerDispatchMode tracer_guard; 43 | return op.call( 44 | dest_tensor, 45 | dest_pitch, 46 | src_tensor, 47 | src_pitch, 48 | width, 49 | height 50 | ); 51 | } 52 | void cuda_1d_copy_async( 53 | at::Tensor& dest_tensor, 54 | const at::Tensor &src_tensor, 55 | const int64_t count) { 56 | static auto op = c10::Dispatcher::singleton() 57 | .findSchemaOrThrow("falkon::cuda_1d_copy_async", "") 58 | .typed(); 59 | at::AutoDispatchBelowAutograd guard; 60 | at::tracer::impl::NoTracerDispatchMode tracer_guard; 61 | return op.call( 62 | dest_tensor, 63 | src_tensor, 64 | count 65 | ); 66 | } 67 | void cuda_1d_copy( 68 | at::Tensor& dest_tensor, 69 | const 
at::Tensor &src_tensor, 70 | const int64_t count) { 71 | static auto op = c10::Dispatcher::singleton() 72 | .findSchemaOrThrow("falkon::cuda_1d_copy", "") 73 | .typed(); 74 | at::AutoDispatchBelowAutograd guard; 75 | at::tracer::impl::NoTracerDispatchMode tracer_guard; 76 | return op.call( 77 | dest_tensor, 78 | src_tensor, 79 | count 80 | ); 81 | } 82 | 83 | TORCH_LIBRARY_FRAGMENT(falkon, m) { 84 | // m.def(TORCH_SELECTIVE_SCHEMA( 85 | // "falkon::mem_get_info(int device_id) -> (int, int)")); 86 | m.def(TORCH_SELECTIVE_SCHEMA( 87 | "falkon::cuda_2d_copy_async(Tensor (a!) dest_tensor, int dest_pitch, Tensor src_tensor, int src_pitch, int width, int height) -> ()")); 88 | m.def(TORCH_SELECTIVE_SCHEMA( 89 | "falkon::cuda_2d_copy(Tensor (a!) dest_tensor, int dest_pitch, Tensor src_tensor, int src_pitch, int width, int height) -> ()")); 90 | m.def(TORCH_SELECTIVE_SCHEMA( 91 | "falkon::cuda_1d_copy_async(Tensor (a!) dest_tensor, Tensor src_tensor, int count) -> ()")); 92 | m.def(TORCH_SELECTIVE_SCHEMA( 93 | "falkon::cuda_1d_copy(Tensor (a!) 
dest_tensor, Tensor src_tensor, int count) -> ()")); 94 | } 95 | 96 | } // namespace ops 97 | } // namespace falkon 98 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/cuda_bindings.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace falkon { 6 | namespace ops { 7 | std::tuple mem_get_info(int64_t device_id); 8 | void cuda_2d_copy_async( 9 | at::Tensor& dest_tensor, 10 | const int64_t dest_pitch, 11 | const at::Tensor& src_tensor, 12 | const int64_t src_pitch, 13 | const int64_t width, 14 | const int64_t height); 15 | void cuda_2d_copy( 16 | at::Tensor& dest_tensor, 17 | const int64_t dest_pitch, 18 | const at::Tensor& src_tensor, 19 | const int64_t src_pitch, 20 | const int64_t width, 21 | const int64_t height); 22 | void cuda_1d_copy_async( 23 | at::Tensor& dest_tensor, 24 | const at::Tensor &src_tensor, 25 | const int64_t count); 26 | void cuda_1d_copy( 27 | at::Tensor& dest_tensor, 28 | const at::Tensor &src_tensor, 29 | const int64_t count); 30 | } // namespace ops 31 | } // namespace falkon 32 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/cusolver_bindings.cpp: -------------------------------------------------------------------------------- 1 | #include "cusolver_bindings.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace falkon { 8 | namespace ops { 9 | 10 | int64_t cusolver_potrf_buffer_size( 11 | at::Tensor &A, 12 | bool upper, 13 | int64_t n, 14 | int64_t lda) { 15 | static auto op = c10::Dispatcher::singleton() 16 | .findSchemaOrThrow("falkon::cusolver_potrf_buffer_size", "") 17 | .typed(); 18 | at::AutoDispatchBelowAutograd guard; 19 | at::tracer::impl::NoTracerDispatchMode tracer_guard; 20 | return op.call( 21 | A, 22 | upper, 23 | n, 24 | lda 25 | ); 26 | } 27 | void cusolver_potrf( 28 | at::Tensor& A, 29 | at::Tensor& workspace, 30 | at::Tensor& 
info, 31 | int64_t workspace_size, 32 | bool upper, 33 | int64_t n, 34 | int64_t lda) { 35 | static auto op = c10::Dispatcher::singleton() 36 | .findSchemaOrThrow("falkon::cusolver_potrf", "") 37 | .typed(); 38 | at::AutoDispatchBelowAutograd guard; 39 | at::tracer::impl::NoTracerDispatchMode tracer_guard; 40 | return op.call( 41 | A, 42 | workspace, 43 | info, 44 | workspace_size, 45 | upper, 46 | n, 47 | lda 48 | ); 49 | } 50 | 51 | TORCH_LIBRARY_FRAGMENT(falkon, m) { 52 | m.def(TORCH_SELECTIVE_SCHEMA( 53 | "falkon::cusolver_potrf_buffer_size(Tensor(a!) A, bool upper, int n, int lda) -> int")); 54 | m.def(TORCH_SELECTIVE_SCHEMA( 55 | "falkon::cusolver_potrf(Tensor(a!) A, Tensor(b!) workspace, Tensor(c!) info, int workspace_size, bool upper, int n, int lda) -> ()")); 56 | } 57 | 58 | } // namespace ops 59 | } // namespace falkon 60 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/cusolver_bindings.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace falkon { 6 | namespace ops { 7 | 8 | int64_t cusolver_potrf_buffer_size( 9 | at::Tensor &A, 10 | bool upper, 11 | int64_t n, 12 | int64_t lda); 13 | void cusolver_potrf( 14 | at::Tensor& A, 15 | at::Tensor& workspace, 16 | at::Tensor& info, 17 | int64_t workspace_size, 18 | bool upper, 19 | int64_t n, 20 | int64_t lda); 21 | 22 | } // namespace ops 23 | } // namespace falkon 24 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/helpers.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace falkon { 7 | namespace ops { 8 | 9 | #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor.") 10 | #define CHECK_CPU(x) TORCH_CHECK(!x.device().is_cuda(), #x " must be a CPU tensor.") 11 | 12 | inline at::DimVector 
shape_from_dim(const at::Tensor& tensor, int64_t dim, bool keepdim) { 13 | auto shape = at::DimVector(tensor.sizes()); 14 | if (dim < 0) { 15 | // e.g. 3-d tensor, dim is -1 => dim will be 2 16 | dim = shape.size() + dim; 17 | } 18 | for (int64_t d = shape.size() - 1; d >= 0; d--) { 19 | if (d == dim) { 20 | if (keepdim) { 21 | shape[d] = 1; 22 | } else { 23 | shape.erase(shape.begin() + d); 24 | } 25 | } 26 | } 27 | return shape; 28 | } 29 | 30 | inline bool is_fortran_contig(const at::Tensor &matrix) { 31 | return matrix.stride(0) == 1; 32 | } 33 | 34 | inline int ceildiv(int dividend, int divisor) { 35 | int res = dividend / divisor; 36 | if (dividend % divisor != 0) 37 | res++; 38 | return res; 39 | } 40 | 41 | } // namespace ops 42 | } // namespace falkon 43 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/lauum.cpp: -------------------------------------------------------------------------------- 1 | #include "lauum.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace falkon { 8 | namespace ops { 9 | 10 | at::Tensor lauum( 11 | const int64_t n, 12 | const at::Tensor &A, 13 | const int64_t lda, 14 | at::Tensor &B, 15 | const int64_t ldb, 16 | const bool lower) { 17 | static auto op = c10::Dispatcher::singleton() 18 | .findSchemaOrThrow("falkon::lauum", "") 19 | .typed(); 20 | at::AutoDispatchBelowAutograd guard; 21 | at::tracer::impl::NoTracerDispatchMode tracer_guard; 22 | return op.call( 23 | n, 24 | A, 25 | lda, 26 | B, 27 | ldb, 28 | lower 29 | ); 30 | } 31 | 32 | TORCH_LIBRARY_FRAGMENT(falkon, m) { 33 | m.def(TORCH_SELECTIVE_SCHEMA( 34 | "falkon::lauum(int n, Tensor A, int lda, Tensor(a!) 
B, int ldb, bool lower) -> Tensor(a!)")); 35 | } 36 | 37 | } // namespace ops 38 | } // namespace falkon 39 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/lauum.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace falkon { 6 | namespace ops { 7 | 8 | at::Tensor lauum( 9 | const int64_t n, 10 | const at::Tensor &A, 11 | const int64_t lda, 12 | at::Tensor &B, 13 | const int64_t ldb, 14 | const bool lower); 15 | 16 | } // namespace ops 17 | } // namespace falkon 18 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/mul_triang.cpp: -------------------------------------------------------------------------------- 1 | #include "mul_triang.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace falkon { 8 | namespace ops { 9 | 10 | at::Tensor mul_triang( 11 | at::Tensor &mat, 12 | const double multiplier, 13 | const bool upper, 14 | const bool preserve_diag) { 15 | static auto op = c10::Dispatcher::singleton() 16 | .findSchemaOrThrow("falkon::mul_triang", "") 17 | .typed(); 18 | at::AutoDispatchBelowAutograd guard; 19 | at::tracer::impl::NoTracerDispatchMode tracer_guard; 20 | return op.call( 21 | mat, 22 | multiplier, 23 | upper, 24 | preserve_diag 25 | ); 26 | } 27 | 28 | TORCH_LIBRARY_FRAGMENT(falkon, m) { 29 | m.def(TORCH_SELECTIVE_SCHEMA( 30 | "falkon::mul_triang(Tensor(a!) 
mat, float multiplier, bool upper, bool preserve_diag) -> Tensor(a!)")); 31 | } 32 | 33 | } // namespace ops 34 | } // namespace falkon 35 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/mul_triang.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace falkon { 6 | namespace ops { 7 | 8 | at::Tensor mul_triang( 9 | at::Tensor &mat, 10 | const double multiplier, 11 | const bool upper, 12 | const bool preserve_diag); 13 | 14 | } // namespace ops 15 | } // namespace falkon 16 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/ops.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "copy_transpose.h" 4 | #include "copy_triang.h" 5 | #include "csr2dense.h" 6 | #include "cublas_bindings.h" 7 | #include "lauum.h" 8 | #include "mul_triang.h" 9 | #include "sparse_vector_ops.h" 10 | #include "spspmm.h" 11 | #include "square_norm.h" 12 | #include "vec_mul_triang.h" 13 | #include "cuda/parallel_potrf.h" 14 | #include "potrf.h" 15 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/potrf.cpp: -------------------------------------------------------------------------------- 1 | #include "potrf.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace falkon { 8 | namespace ops { 9 | 10 | at::Tensor potrf( 11 | at::Tensor &mat, 12 | bool upper, 13 | bool clean, 14 | bool overwrite) { 15 | static auto op = c10::Dispatcher::singleton() 16 | .findSchemaOrThrow("falkon::potrf", "") 17 | .typed(); 18 | at::AutoDispatchBelowAutograd guard; 19 | at::tracer::impl::NoTracerDispatchMode tracer_guard; 20 | return op.call( 21 | mat, 22 | upper, 23 | clean, 24 | overwrite 25 | ); 26 | } 27 | 28 | at::Tensor parallel_potrf( 29 | c10::IntArrayRef devices, 30 | c10::IntArrayRef block_starts, 31 | 
c10::IntArrayRef block_ends, 32 | c10::IntArrayRef block_sizes, 33 | c10::IntArrayRef block_devices, 34 | c10::IntArrayRef block_ids, 35 | at::Tensor& A) { 36 | static auto op = c10::Dispatcher::singleton() 37 | .findSchemaOrThrow("falkon::parallel_potrf", "") 38 | .typed(); 39 | at::AutoDispatchBelowAutograd guard; 40 | at::tracer::impl::NoTracerDispatchMode tracer_guard; 41 | return op.call( 42 | devices, 43 | block_starts, 44 | block_ends, 45 | block_sizes, 46 | block_devices, 47 | block_ids, 48 | A 49 | ); 50 | } 51 | 52 | TORCH_LIBRARY_FRAGMENT(falkon, m) { 53 | m.def(TORCH_SELECTIVE_SCHEMA( 54 | "falkon::potrf(Tensor(a!) mat, bool upper, bool clean, bool overwrite) -> Tensor(a!)")); 55 | m.def(TORCH_SELECTIVE_SCHEMA( 56 | "falkon::parallel_potrf(int[] devices, int[] block_starts, int[] block_ends, int[] block_sizes, int[] block_devices, int[] block_ids, Tensor(a!) A) -> Tensor(a!)")); 57 | } 58 | 59 | } // namespace ops 60 | } // namespace falkon 61 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/potrf.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace falkon { 6 | namespace ops { 7 | 8 | at::Tensor potrf( 9 | at::Tensor &mat, 10 | bool upper, 11 | bool clean, 12 | bool overwrite); 13 | 14 | at::Tensor parallel_potrf( 15 | c10::IntArrayRef devices, 16 | c10::IntArrayRef block_starts, 17 | c10::IntArrayRef block_ends, 18 | c10::IntArrayRef block_sizes, 19 | c10::IntArrayRef block_devices, 20 | c10::IntArrayRef block_ids, 21 | at::Tensor& A); 22 | 23 | } // namespace ops 24 | } // namespace falkon 25 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/sparse_vector_ops.cpp: -------------------------------------------------------------------------------- 1 | #include "sparse_vector_ops.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace falkon { 8 | namespace ops { 9 
| 10 | at::Tensor sparse_norm( 11 | const at::Tensor &indexptr, 12 | const at::Tensor &data, 13 | at::Tensor &out) { 14 | static auto op = c10::Dispatcher::singleton() 15 | .findSchemaOrThrow("falkon::sparse_norm", "") 16 | .typed(); 17 | at::AutoDispatchBelowAutograd guard; 18 | at::tracer::impl::NoTracerDispatchMode tracer_guard; 19 | return op.call( 20 | indexptr, 21 | data, 22 | out 23 | ); 24 | } 25 | at::Tensor sparse_square_norm( 26 | const at::Tensor &indexptr, 27 | const at::Tensor &data, 28 | at::Tensor &out) { 29 | static auto op = c10::Dispatcher::singleton() 30 | .findSchemaOrThrow("falkon::sparse_square_norm", "") 31 | .typed(); 32 | at::AutoDispatchBelowAutograd guard; 33 | at::tracer::impl::NoTracerDispatchMode tracer_guard; 34 | return op.call( 35 | indexptr, 36 | data, 37 | out 38 | ); 39 | } 40 | at::Tensor sparse_bdot( 41 | const at::Tensor &indexptr1, 42 | const at::Tensor &indices1, 43 | const at::Tensor &data1, 44 | const at::Tensor &indexptr2, 45 | const at::Tensor &indices2, 46 | const at::Tensor &data2, 47 | at::Tensor &out) { 48 | static auto op = c10::Dispatcher::singleton() 49 | .findSchemaOrThrow("falkon::sparse_bdot", "") 50 | .typed(); 51 | at::AutoDispatchBelowAutograd guard; 52 | at::tracer::impl::NoTracerDispatchMode tracer_guard; 53 | return op.call( 54 | indexptr1, 55 | indices1, 56 | data1, 57 | indexptr2, 58 | indices2, 59 | data2, 60 | out 61 | ); 62 | } 63 | 64 | TORCH_LIBRARY_FRAGMENT(falkon, m) { 65 | m.def(TORCH_SELECTIVE_SCHEMA( 66 | "falkon::sparse_square_norm(Tensor indexptr, Tensor data, *, Tensor(a!) out) -> Tensor(a!)")); 67 | m.def(TORCH_SELECTIVE_SCHEMA( 68 | "falkon::sparse_norm(Tensor indexptr, Tensor data, *, Tensor(a!) out) -> Tensor(a!)")); 69 | m.def(TORCH_SELECTIVE_SCHEMA( 70 | "falkon::sparse_bdot(Tensor indexptr1, Tensor indices1, Tensor data1, Tensor indexptr2, Tensor indices2, Tensor data2, *, Tensor (a!) 
out) -> Tensor(a!)")); 71 | } 72 | 73 | } // namespace ops 74 | } // namespace falkon 75 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/sparse_vector_ops.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace falkon { 7 | namespace ops { 8 | 9 | at::Tensor sparse_bdot( 10 | const at::Tensor &indexptr1, 11 | const at::Tensor &indices1, 12 | const at::Tensor &data1, 13 | const at::Tensor &indexptr2, 14 | const at::Tensor &indices2, 15 | const at::Tensor &data2, 16 | at::Tensor &out); 17 | at::Tensor sparse_square_norm( 18 | const at::Tensor &indexptr, 19 | const at::Tensor &data, 20 | at::Tensor &out); 21 | at::Tensor sparse_norm( 22 | const at::Tensor &indexptr, 23 | const at::Tensor &data, 24 | at::Tensor &out); 25 | 26 | } // namespace ops 27 | } // namespace falkon 28 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/spspmm.cpp: -------------------------------------------------------------------------------- 1 | #include "spspmm.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace falkon { 8 | namespace ops { 9 | 10 | std::tuple 11 | spspmm( 12 | const at::Tensor &rowptrA, 13 | const at::Tensor &colA, 14 | const at::Tensor &valA, 15 | const at::Tensor &rowptrB, 16 | const at::Tensor &colB, 17 | const at::Tensor &valB, 18 | int64_t N) { 19 | static auto op = c10::Dispatcher::singleton() 20 | .findSchemaOrThrow("falkon::spspmm", "") 21 | .typed(); 22 | at::AutoDispatchBelowAutograd guard; 23 | at::tracer::impl::NoTracerDispatchMode tracer_guard; 24 | return op.call( 25 | rowptrA, 26 | colA, 27 | valA, 28 | rowptrB, 29 | colB, 30 | valB, 31 | N 32 | ); 33 | } 34 | 35 | TORCH_LIBRARY_FRAGMENT(falkon, m) { 36 | m.def(TORCH_SELECTIVE_SCHEMA( 37 | "falkon::spspmm(Tensor rowptrA, Tensor colA, Tensor valA, Tensor rowptrB, Tensor colB, Tensor valB, int N) -> (Tensor, 
Tensor, Tensor)")); 38 | } 39 | 40 | } // namespace ops 41 | } // namespace falkon 42 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/spspmm.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace falkon { 6 | namespace ops { 7 | 8 | std::tuple 9 | spspmm( 10 | const at::Tensor &rowptrA, 11 | const at::Tensor &colA, 12 | const at::Tensor &valA, 13 | const at::Tensor &rowptrB, 14 | const at::Tensor &colB, 15 | const at::Tensor &valB, 16 | int64_t N); 17 | 18 | } // namespace ops 19 | } // namespace falkon 20 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/square_norm.cpp: -------------------------------------------------------------------------------- 1 | #include "square_norm.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace falkon { 8 | namespace ops { 9 | 10 | at::Tensor square_norm( 11 | const at::Tensor &self, 12 | int64_t dim, 13 | bool keepdim) { 14 | 15 | static auto op = c10::Dispatcher::singleton() 16 | .findSchemaOrThrow("falkon::square_norm", "") 17 | .typed(); 18 | return op.call( 19 | self, dim, keepdim 20 | ); 21 | } 22 | 23 | TORCH_LIBRARY_FRAGMENT(falkon, m) { 24 | m.def(TORCH_SELECTIVE_SCHEMA( 25 | "falkon::square_norm(Tensor self, int dim, bool keepdim=False) -> Tensor")); 26 | } 27 | 28 | } // namespace ops 29 | } // namespace falkon 30 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/square_norm.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace falkon { 6 | namespace ops { 7 | 8 | at::Tensor square_norm( 9 | const at::Tensor &self, 10 | int64_t dim, 11 | bool keepdim=false); 12 | 13 | } // namespace ops 14 | } // namespace falkon 15 | -------------------------------------------------------------------------------- 
/falkon/c_ext/ops/vec_mul_triang.cpp: -------------------------------------------------------------------------------- 1 | #include "vec_mul_triang.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace falkon { 8 | namespace ops { 9 | 10 | at::Tensor vec_mul_triang( 11 | at::Tensor &mat, 12 | const at::Tensor &multiplier_vec, 13 | const bool upper, 14 | const bool side) { 15 | static auto op = c10::Dispatcher::singleton() 16 | .findSchemaOrThrow("falkon::vec_mul_triang", "") 17 | .typed(); 18 | at::AutoDispatchBelowAutograd guard; 19 | at::tracer::impl::NoTracerDispatchMode tracer_guard; 20 | return op.call( 21 | mat, 22 | multiplier_vec, 23 | upper, 24 | side 25 | ); 26 | } 27 | 28 | TORCH_LIBRARY_FRAGMENT(falkon, m) { 29 | m.def(TORCH_SELECTIVE_SCHEMA( 30 | "falkon::vec_mul_triang(Tensor (a!) mat, Tensor multiplier_vec, bool upper, bool side) -> Tensor (a!)")); 31 | } 32 | 33 | } // namespace ops 34 | } // namespace falkon 35 | -------------------------------------------------------------------------------- /falkon/c_ext/ops/vec_mul_triang.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace falkon { 6 | namespace ops { 7 | 8 | at::Tensor vec_mul_triang( 9 | at::Tensor &mat, 10 | const at::Tensor &multiplier_vec, 11 | const bool upper, 12 | const bool side); 13 | 14 | } // namespace ops 15 | } // namespace falkon 16 | -------------------------------------------------------------------------------- /falkon/hopt/README.md: -------------------------------------------------------------------------------- 1 | # Gradient-Based Hyperparameter Optimization Module 2 | 3 | The most interesting code for users lies in the `objectives` submodule, 4 | which contains a collection of several optimization objectives 5 | (i.e. penalized losses), which can be used to optimize the hyperparameters 6 | of (Nystrom) kernel ridge regression. 
7 | 8 | There are several exact objectives, which typically require storing the 9 | full K_{nm} kernel matrix in memory, and are therefore suited for medium size 10 | problems. 11 | One approximate objective is also implemented (the `StochasticNystromCompReg` 12 | class), which can scale up to much larger problems. This stochastic objective 13 | might be somewhat hard to tune in order to obtain good performance. 14 | 15 | The `optimization` submodule contains helpers for gradient-based optimization, 16 | error / performance reports, and a grid-search implementation. 17 | 18 | The `benchmarking` submodule contains a large runner with several parameters which 19 | has been used to run experiments on hyperparameter optimization. 20 | -------------------------------------------------------------------------------- /falkon/hopt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FalkonML/falkon/5559fd6716a2b1b98f480f5c456a6a7b86ff72a3/falkon/hopt/__init__.py -------------------------------------------------------------------------------- /falkon/hopt/benchmarking/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FalkonML/falkon/5559fd6716a2b1b98f480f5c456a6a7b86ff72a3/falkon/hopt/benchmarking/__init__.py -------------------------------------------------------------------------------- /falkon/hopt/objectives/__init__.py: -------------------------------------------------------------------------------- 1 | from .exact_objectives.compreg import CompReg 2 | from .exact_objectives.gcv import GCV 3 | from .exact_objectives.holdout import HoldOut 4 | from .exact_objectives.loocv import LOOCV 5 | from .exact_objectives.new_compreg import NystromCompReg 6 | from .exact_objectives.sgpr import SGPR 7 | from .stoch_objectives.stoch_new_compreg import StochasticNystromCompReg 8 | 9 | __all__ = ( 10 | "CompReg", 11 | 
class CompReg(HyperoptObjective):
    """Hyperparameter-optimization objective: effective dimension + data-fit.

    The loss returned by :meth:`forward` is the sum of two terms, both computed
    from Cholesky factors of the centers' kernel matrix and of a regularized
    `m x m` system. The two terms are also stored (detached) in ``self.losses``
    under the keys ``"effective_dimension"`` and ``"data_fit"``.

    The training data seen by the last :meth:`forward` call is cached (detached)
    so that :meth:`predict` can re-fit on it.
    """

    def __init__(
        self,
        kernel: falkon.kernels.DiffKernel,
        centers_init: torch.Tensor,
        penalty_init: torch.Tensor,
        opt_centers: bool,
        opt_penalty: bool,
        centers_transform: Optional[torch.distributions.Transform] = None,
        pen_transform: Optional[torch.distributions.Transform] = None,
    ):
        """Forward all arguments to the base class and reset the cached state.

        ``x_train`` / ``y_train`` stay ``None`` until :meth:`forward` is called;
        ``losses`` stays ``None`` until the first loss has been computed.
        """
        super().__init__(kernel, centers_init, penalty_init, opt_centers, opt_penalty, centers_transform, pen_transform)
        self.x_train, self.y_train = None, None
        self.losses: Optional[Dict[str, torch.Tensor]] = None

    def forward(self, X, Y):
        """Compute the objective on the data ``(X, Y)``.

        Returns ``ndeff + datafit`` and, as a side effect, caches detached
        copies of ``X`` and ``Y`` and stores the two terms in ``self.losses``.
        """
        self.x_train, self.y_train = X.detach(), Y.detach()
        # The penalty is multiplied by the number of rows of X.
        variance = self.penalty * X.shape[0]
        sqrt_var = torch.sqrt(variance)

        L, A, LB, c = self._calc_intermediate(X, Y)
        # C = LB^{-1} (A / sqrt(variance))
        C = torch.linalg.solve_triangular(LB, A / sqrt_var, upper=False)  # m*n

        # Squared Frobenius norm of C.
        ndeff = C.square().sum()
        # `c` carries a 1 / variance factor (see _calc_intermediate), so
        # c * sqrt_var = LB^{-1} A Y / sqrt(variance).
        datafit = torch.square(Y).sum() - torch.square(c * sqrt_var).sum()
        self._save_losses(ndeff, datafit)

        return ndeff + datafit

    def predict(self, X):
        """Predict on ``X`` using the data cached by the last :meth:`forward`.

        Raises
        ------
        RuntimeError
            If :meth:`forward` has never been called.
        """
        if self.x_train is None or self.y_train is None:
            raise RuntimeError("Call forward at least once before calling predict.")
        with torch.autograd.no_grad():
            L, A, LB, c = self._calc_intermediate(self.x_train, self.y_train)
            # Two back-substitutions: tmp2 = L^{-T} LB^{-T} c
            tmp1 = torch.linalg.solve_triangular(LB.T, c, upper=True)
            tmp2 = torch.linalg.solve_triangular(L.T, tmp1, upper=True)
            kms = self.kernel(self.centers, X)
            return kms.T @ tmp2

    def _calc_intermediate(self, X, Y):
        """Compute the factors shared by :meth:`forward` and :meth:`predict`.

        Returns the tuple ``(L, A, LB, c)`` where ``L`` and ``LB`` are the
        (jittered) Cholesky factors of the centers' kernel matrix and of ``B``,
        ``A = L^{-1} K_mn`` and ``c = LB^{-1} A Y / variance``.
        """
        variance = self.penalty * X.shape[0]
        sqrt_var = torch.sqrt(variance)
        kmn = self.kernel(self.centers, X)
        kmm = self.kernel(self.centers, self.centers)
        L = jittering_cholesky(kmm)  # L @ L.T = kmm (plus the jitter added by jittering_cholesky)
        # A = L^{-1} K_mn  (NOT scaled by sqrt(n*pen) here; the scaling is applied where A is used)
        A = torch.linalg.solve_triangular(L, kmn, upper=False)
        AAT = A @ A.T  # m*n @ n*m = m*m in O(n * m^2), same cost as kmn @ knm.
        # B = A @ A.T / variance + I
        B = AAT / variance + torch.eye(AAT.shape[0], device=X.device, dtype=X.dtype)
        LB = jittering_cholesky(B)  # LB @ LB.T = B
        AY = A @ Y / sqrt_var  # m*1
        # Second division by sqrt_var: overall c = LB^{-1} A Y / variance
        c = torch.linalg.solve_triangular(LB, AY, upper=False) / sqrt_var  # m*1

        return L, A, LB, c

    def _save_losses(self, effective_dimension, data_fit):
        """Store detached copies of the two loss terms in ``self.losses``."""
        self.losses = {
            "effective_dimension": effective_dimension.detach(),
            "data_fit": data_fit.detach(),
        }

    def __repr__(self):
        # NOTE(review): the printed name is "CregNoTrace", not the class name;
        # it is left untouched because it is a runtime string.
        return (
            f"CregNoTrace("
            f"kernel={self.kernel}, "
            f"penalty={get_scalar(self.penalty)}, "
            f"num_centers={self.centers.shape[0]})"
        )
falkon.hopt.objectives.exact_objectives.utils import jittering_cholesky 7 | from falkon.hopt.objectives.objectives import HyperoptObjective 8 | from falkon.hopt.utils import get_scalar 9 | 10 | 11 | class HoldOut(HyperoptObjective): 12 | def __init__( 13 | self, 14 | kernel: falkon.kernels.DiffKernel, 15 | centers_init: torch.Tensor, 16 | penalty_init: torch.Tensor, 17 | opt_centers: bool, 18 | opt_penalty: bool, 19 | val_pct: float, 20 | per_iter_split: bool, 21 | centers_transform: Optional[torch.distributions.Transform] = None, 22 | pen_transform: Optional[torch.distributions.Transform] = None, 23 | ): 24 | super().__init__(kernel, centers_init, penalty_init, opt_centers, opt_penalty, centers_transform, pen_transform) 25 | self.x_train, self.y_train = None, None 26 | self.losses: Optional[Dict[str, torch.Tensor]] = None 27 | self.per_iter_split = per_iter_split 28 | self.val_pct = val_pct 29 | self.tr_indices, self.val_indices = None, None 30 | 31 | def forward(self, X, Y): 32 | # X_tr, Y_tr are used for predictions. 
They contain the whole dataset (=retraining) 33 | self.x_train, self.y_train = X, Y 34 | if self.tr_indices is None or self.per_iter_split: 35 | num_val = int(X.shape[0] * self.val_pct) 36 | all_idx = torch.randperm(X.shape[0]) 37 | self.val_indices = all_idx[:num_val] 38 | self.tr_indices = all_idx[num_val:] 39 | 40 | Xtr = X[self.tr_indices] 41 | Xval = X[self.val_indices] 42 | Ytr = Y[self.tr_indices] 43 | Yval = Y[self.val_indices] 44 | 45 | kmval = self.kernel(self.centers, Xval) 46 | alpha = self._calc_intermediate(Xtr, Ytr) 47 | val_preds = kmval.T @ alpha 48 | loss = torch.mean(torch.square(Yval - val_preds)) 49 | 50 | self._save_losses(loss) 51 | return loss 52 | 53 | def predict(self, X): 54 | if self.x_train is None or self.y_train is None: 55 | raise RuntimeError("Call forward at least once before calling predict.") 56 | with torch.autograd.no_grad(): 57 | alpha = self._calc_intermediate(self.x_train, self.y_train) 58 | kms = self.kernel(self.centers, X) 59 | return kms.T @ alpha 60 | 61 | @property 62 | def train_pct(self): 63 | return 100.0 - self.val_pct 64 | 65 | def _calc_intermediate(self, X, Y): 66 | variance = self.penalty * X.shape[0] 67 | sqrt_var = torch.sqrt(variance) 68 | 69 | kmn = self.kernel(self.centers, X) 70 | kmm = self.kernel(self.centers, self.centers) 71 | L = jittering_cholesky(kmm) # L @ L.T = kmm 72 | # A = L^{-1} K_mn / (sqrt(n*pen)) 73 | A = torch.linalg.solve_triangular(L, kmn, upper=False) / sqrt_var 74 | AAT = A @ A.T 75 | # B = A @ A.T + I 76 | B = AAT + torch.eye(AAT.shape[0], device=X.device, dtype=X.dtype) 77 | LB = jittering_cholesky(B) # LB @ LB.T = B 78 | AYtr = A @ Y 79 | c = torch.linalg.solve_triangular(LB, AYtr, upper=False) / sqrt_var 80 | 81 | tmp1 = torch.linalg.solve_triangular(LB.T, c, upper=True) 82 | alpha = torch.linalg.solve_triangular(L.T, tmp1, upper=True) 83 | return alpha 84 | 85 | def _save_losses(self, holdout): 86 | self.losses = { 87 | "hold-out": holdout.detach(), 88 | } 89 | 90 | def 
class NystromCompReg(HyperoptObjective):
    """Hyperparameter-optimization objective with three terms.

    The loss returned by :meth:`forward` is the sum of an effective-dimension
    term, a data-fit term and a kernel-trace term, all computed from Cholesky
    factors of the centers' kernel matrix and of a regularized `m x m` system.
    The three terms are also stored (detached) in ``self.losses`` under the
    keys ``"effective_dimension"``, ``"data_fit"`` and ``"trace"``.

    The training data seen by the last :meth:`forward` call is cached (detached)
    so that :meth:`predict` can re-fit on it.
    """

    def __init__(
        self,
        kernel: falkon.kernels.DiffKernel,
        centers_init: torch.Tensor,
        penalty_init: torch.Tensor,
        opt_centers: bool,
        opt_penalty: bool,
        centers_transform: Optional[torch.distributions.Transform] = None,
        pen_transform: Optional[torch.distributions.Transform] = None,
    ):
        """Forward all arguments to the base class and reset the cached state.

        ``x_train`` / ``y_train`` stay ``None`` until :meth:`forward` is called;
        ``losses`` stays ``None`` until the first loss has been computed.
        """
        super().__init__(kernel, centers_init, penalty_init, opt_centers, opt_penalty, centers_transform, pen_transform)
        self.x_train, self.y_train = None, None
        self.losses: Optional[Dict[str, torch.Tensor]] = None

    def forward(self, X, Y):
        """Compute the objective on the data ``(X, Y)``.

        Returns ``ndeff + datafit + trace`` and, as a side effect, caches
        detached copies of ``X`` and ``Y`` and stores the three terms in
        ``self.losses``.
        """
        self.x_train, self.y_train = X.detach(), Y.detach()
        # The penalty is multiplied by the number of rows of X.
        variance = self.penalty * X.shape[0]
        sqrt_var = torch.sqrt(variance)
        # Sum of the kernel diagonal on the training data.
        Kdiag = self.kernel(X, X, diag=True).sum()

        L, A, AAT, LB, c = self._calc_intermediate(X, Y)
        C = torch.linalg.solve_triangular(LB, A, upper=False)  # m * n

        # `c` and `C` are unscaled, so the 1 / sqrt(variance) factor is applied here.
        datafit = torch.square(Y).sum() - torch.square(c / sqrt_var).sum()
        ndeff = (C / sqrt_var).square().sum()
        trace = Kdiag - torch.trace(AAT)
        trace = trace * datafit / (variance * X.shape[0])
        self._save_losses(ndeff, datafit, trace)

        return ndeff + datafit + trace

    def predict(self, X):
        """Predict on ``X`` using the data cached by the last :meth:`forward`.

        Raises
        ------
        RuntimeError
            If :meth:`forward` has never been called.
        """
        if self.x_train is None or self.y_train is None:
            raise RuntimeError("Call forward at least once before calling predict.")
        with torch.autograd.no_grad():
            variance = self.penalty * self.x_train.shape[0]
            L, _, _, LB, c = self._calc_intermediate(self.x_train, self.y_train)
            # With K_mn = L A and B = A A^T / variance + I the normal equations
            #   (K_mn K_nm + variance * K_mm) alpha = K_mn Y
            # become  variance * L B L^T alpha = L A Y,  hence
            #   alpha = L^{-T} B^{-1} A Y / variance.
            # `c` = LB^{-1} A Y carries no variance factor in this class, so the
            # division has to happen here (it was previously missing, which
            # scaled every prediction by `variance`).
            tmp1 = torch.linalg.solve_triangular(LB.T, c, upper=True) / variance
            tmp2 = torch.linalg.solve_triangular(L.T, tmp1, upper=True)
            kms = self.kernel(self.centers, X)
            return kms.T @ tmp2

    def _calc_intermediate(self, X, Y):
        """Compute the factors shared by :meth:`forward` and :meth:`predict`.

        Returns the tuple ``(L, A, AAT, LB, c)`` where ``L`` and ``LB`` are the
        (jittered) Cholesky factors of the centers' kernel matrix and of ``B``,
        ``A = L^{-1} K_mn``, ``AAT = A @ A.T`` and ``c = LB^{-1} A Y``. None of
        the returned quantities is scaled by the variance; only ``B`` uses it.
        """
        variance = self.penalty * X.shape[0]

        kmn = self.kernel(self.centers, X)
        kmm = self.kernel(self.centers, self.centers)

        L = jittering_cholesky(kmm)
        A = torch.linalg.solve_triangular(L, kmn, upper=False)
        AAT = A @ A.T  # m*n @ n*m = m*m in O(n * m^2), same cost as kmn @ knm.
        # B = A @ A.T / variance + I
        B = AAT / variance + torch.eye(AAT.shape[0], device=X.device, dtype=X.dtype)
        LB = jittering_cholesky(B)  # LB @ LB.T = B

        AY = A @ Y  # m*1
        c = torch.linalg.solve_triangular(LB, AY, upper=False)  # m * p

        return L, A, AAT, LB, c

    def _save_losses(self, effective_dimension, data_fit, kernel_trace):
        """Store detached copies of the three loss terms in ``self.losses``."""
        self.losses = {
            "effective_dimension": effective_dimension.detach(),
            "data_fit": data_fit.detach(),
            "trace": kernel_trace.detach(),
        }

    def __repr__(self):
        return (
            f"NystromCompReg("
            f"kernel={self.kernel}, "
            f"penalty={get_scalar(self.penalty)}, "
            f"num_centers={self.centers.shape[0]})"
        )
HyperoptObjective 8 | from falkon.hopt.utils import get_scalar 9 | 10 | 11 | class SGPR(HyperoptObjective): 12 | def __init__( 13 | self, 14 | kernel: falkon.kernels.DiffKernel, 15 | centers_init: torch.Tensor, 16 | penalty_init: torch.Tensor, 17 | opt_centers: bool, 18 | opt_penalty: bool, 19 | centers_transform: Optional[torch.distributions.Transform] = None, 20 | pen_transform: Optional[torch.distributions.Transform] = None, 21 | ): 22 | super().__init__(kernel, centers_init, penalty_init, opt_centers, opt_penalty, centers_transform, pen_transform) 23 | self.x_train, self.y_train = None, None 24 | self.losses: Optional[Dict[str, torch.Tensor]] = None 25 | 26 | def forward(self, X, Y): 27 | self.x_train, self.y_train = X.detach(), Y.detach() 28 | 29 | Kdiag = self.kernel(X, X, diag=True).sum() 30 | variance = self.penalty * X.shape[0] 31 | L, A, AAT, LB, c = self._calc_intermediate(X, Y) 32 | 33 | # Complexity 34 | logdet = torch.log(torch.diag(LB)).sum() 35 | logdet += 0.5 * X.shape[0] * torch.log(variance) 36 | # Data-fit 37 | datafit = 0.5 * torch.square(Y).sum() / variance 38 | datafit -= 0.5 * torch.square(c).sum() 39 | # Traces (minimize) 40 | trace = 0.5 * Kdiag / variance 41 | trace -= 0.5 * torch.diag(AAT).sum() 42 | 43 | # const = 0.5 * X.shape[0] * torch.log(torch.tensor(2 * np.pi, dtype=X.dtype)) 44 | 45 | self._save_losses(logdet, datafit, trace) 46 | return logdet + datafit + trace 47 | 48 | def predict(self, X): 49 | if self.x_train is None or self.y_train is None: 50 | raise RuntimeError("Call forward at least once before calling predict.") 51 | with torch.autograd.no_grad(): 52 | L, A, AAT, LB, c = self._calc_intermediate(self.x_train, self.y_train) 53 | kms = self.kernel(self.centers, X) 54 | tmp1 = torch.linalg.solve_triangular(L, kms, upper=False) 55 | tmp2 = torch.linalg.solve_triangular(LB, tmp1, upper=False) 56 | return tmp2.T @ c 57 | 58 | def _save_losses(self, log_det, datafit, trace): 59 | self.losses = { 60 | "log_det": 
log_det.detach(), 61 | "data_fit": datafit.detach(), 62 | "trace": trace.detach(), 63 | } 64 | 65 | def _calc_intermediate(self, X, Y): 66 | variance = self.penalty * X.shape[0] 67 | sqrt_var = torch.sqrt(variance) 68 | 69 | kmn = self.kernel(self.centers, X) 70 | kmm = self.kernel(self.centers, self.centers) 71 | L = jittering_cholesky(kmm) 72 | 73 | # A = L^{-1} K_mn / (sqrt(n*pen)) 74 | A = torch.linalg.solve_triangular(L, kmn, upper=False) / sqrt_var 75 | AAT = A @ A.T 76 | # B = A @ A.T + I 77 | B = AAT + torch.eye(AAT.shape[0], device=X.device, dtype=X.dtype) 78 | LB = jittering_cholesky(B) # LB @ LB.T = B 79 | AY = A @ Y 80 | c = torch.linalg.solve_triangular(LB, AY, upper=False) / sqrt_var 81 | 82 | return L, A, AAT, LB, c 83 | 84 | def __repr__(self): 85 | return ( 86 | f"SGPR(" 87 | f"kernel={self.kernel}, " 88 | f"penalty={get_scalar(self.penalty)}, " 89 | f"num_centers={self.centers.shape[0]})" 90 | ) 91 | -------------------------------------------------------------------------------- /falkon/hopt/objectives/exact_objectives/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def cholesky(M, upper=False, check_errors=True): 5 | if upper: 6 | U, info = torch.linalg.cholesky_ex(M.transpose(-2, -1).conj()) 7 | if check_errors: 8 | if info > 0: 9 | raise RuntimeError("Cholesky failed on row %d" % (info)) 10 | return U.transpose(-2, -1).conj() 11 | else: 12 | L, info = torch.linalg.cholesky_ex(M, check_errors=False) 13 | if check_errors: 14 | if info > 0: 15 | raise RuntimeError("Cholesky failed on row %d" % (info)) 16 | return L 17 | 18 | 19 | def jittering_cholesky(mat, upper=False): 20 | eye = torch.eye(mat.shape[0], device=mat.device, dtype=mat.dtype) 21 | epsilons = [1e-8, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0] 22 | last_exception = None 23 | for eps in epsilons: 24 | try: 25 | return cholesky(mat + eye * eps, upper=upper, check_errors=True) 26 | except RuntimeError as e: # noqa: 
PERF203 27 | last_exception = e 28 | raise last_exception 29 | -------------------------------------------------------------------------------- /falkon/hopt/objectives/objectives.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from typing import Optional 3 | 4 | import torch 5 | from torch.distributions.transforms import identity_transform 6 | 7 | import falkon.kernels 8 | from falkon.hopt.objectives.transforms import PositiveTransform 9 | 10 | 11 | class HyperoptObjective(torch.nn.Module): 12 | def __init__( 13 | self, 14 | kernel: falkon.kernels.DiffKernel, 15 | centers_init: torch.Tensor, 16 | penalty_init: torch.Tensor, 17 | opt_centers: bool, 18 | opt_penalty: bool, 19 | centers_transform: Optional[torch.distributions.Transform], 20 | pen_transform: Optional[torch.distributions.Transform], 21 | ): 22 | """ 23 | 24 | Parameters 25 | ---------- 26 | kernel 27 | centers_init 28 | penalty_init 29 | opt_centers 30 | opt_penalty 31 | centers_transform 32 | pen_transform 33 | """ 34 | super().__init__() 35 | 36 | if not isinstance(kernel, falkon.kernels.DiffKernel): 37 | raise TypeError("Kernel must inherit from `DiffKernel` for hyperparameter optimization.") 38 | self.kernel = kernel 39 | 40 | self.centers_transform = centers_transform or identity_transform 41 | self.penalty_transform = pen_transform or PositiveTransform(1e-8) 42 | 43 | # Apply inverse transformations 44 | centers_init = self.centers_transform.inv(centers_init) 45 | penalty_init = self.penalty_transform.inv(penalty_init) 46 | 47 | if opt_centers: 48 | self.register_parameter("centers_", torch.nn.Parameter(centers_init)) 49 | else: 50 | self.register_buffer("centers_", centers_init) 51 | if opt_penalty: 52 | self.register_parameter("penalty_", torch.nn.Parameter(penalty_init)) 53 | else: 54 | self.register_buffer("penalty_", penalty_init) 55 | 56 | @property 57 | def penalty(self): 58 | return self.penalty_transform(self.penalty_) 59 | 60 | 
@property 61 | def centers(self): 62 | return self.centers_ 63 | 64 | @abc.abstractmethod 65 | def predict(self, X): 66 | pass 67 | -------------------------------------------------------------------------------- /falkon/hopt/objectives/stoch_objectives/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FalkonML/falkon/5559fd6716a2b1b98f480f5c456a6a7b86ff72a3/falkon/hopt/objectives/stoch_objectives/__init__.py -------------------------------------------------------------------------------- /falkon/hopt/objectives/stoch_objectives/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Sequence, Tuple 2 | 3 | import torch 4 | 5 | 6 | def init_random_vecs(n, t, dtype, device, gaussian_random: bool): 7 | if gaussian_random: 8 | Z = torch.randn(n, t, dtype=dtype, device=device) 9 | else: 10 | Z = torch.empty(n, t, dtype=dtype, device=device).bernoulli_().mul_(2).sub_(1) 11 | return Z 12 | 13 | 14 | def calc_grads_tensors( 15 | inputs: Sequence[torch.Tensor], 16 | inputs_need_grad: Sequence[bool], 17 | num_nondiff_inputs: int, 18 | output: torch.Tensor, 19 | retain_graph: bool, 20 | allow_unused: bool, 21 | ) -> Tuple[Optional[torch.Tensor], ...]: 22 | """ 23 | 24 | Parameters 25 | ---------- 26 | inputs 27 | Sequence of tensors with respect to which the gradient needs computing 28 | inputs_need_grad 29 | Sequence of booleans, stating whether the inputs need the gradient computation. 30 | This sequence corresponds to ctx.needs_input_grad hence it includes all inputs 31 | to some nn.Function, not just the differentiable inputs (which are passed in the `inputs` 32 | parameter). 33 | Hence `len(inputs_need_grad) != len(inputs)`. To make the code work, the inputs to the 34 | nn.Function we are dealing with must be organized such that the non-differentiable inputs 35 | come before the potentially differentiable inputs! 
class PositiveTransform(torch.distributions.transforms.Transform):
    """Map the real line onto values above `lower_bound` via a shifted softplus."""

    _cache_size = 0
    domain = constraints.real
    codomain = constraints.positive

    def __init__(self, lower_bound=0.0):
        super().__init__()
        self.lower_bound = lower_bound

    def __eq__(self, other):
        # Two transforms are equal when they are of this type and share the same shift.
        return isinstance(other, PositiveTransform) and other.lower_bound == self.lower_bound

    def _call(self, x):
        # Forward direction: softplus followed by the shift.
        return F.softplus(x) + self.lower_bound

    def _inverse(self, y):
        # Numerically careful inverse of softplus, following
        # https://github.com/tensorflow/probability/blob/v0.12.2/tensorflow_probability/python/math/generic.py#L456-L507
        shifted = y - self.lower_bound

        log_eps = torch.log(torch.tensor(torch.finfo(y.dtype).eps, dtype=y.dtype))
        cutoff = log_eps + torch.tensor(2.0, dtype=y.dtype)
        tiny = shifted < torch.exp(cutoff)
        huge = shifted > -cutoff

        # Entries handled by one of the two special cases are replaced by a
        # harmless value before evaluating the generic formula.
        one = torch.tensor(1.0, dtype=y.dtype, device=y.device)
        safe = torch.where(tiny | huge, one, shifted)
        generic = safe + torch.log(-torch.expm1(-safe))

        # Large inputs map to themselves, tiny inputs to their logarithm.
        large_or_generic = torch.where(huge, shifted, generic)
        return torch.where(tiny, torch.log(shifted), large_or_generic)
def run_on_grid(
    Xtr: torch.Tensor,
    Ytr: torch.Tensor,
    Xts: torch.Tensor,
    Yts: torch.Tensor,
    model: HyperoptObjective,
    grid_spec: List[HPGridPoint],
    minibatch: Optional[int],
    err_fn,
    cuda: bool,
):
    """Evaluate `model` on every point of a hyperparameter grid.

    For each grid point the attributes are set on the model, the losses returned
    by `model.hp_loss` are accumulated over the training mini-batches, and the
    output of `pred_reporting` is stored in the point's `results`.

    Parameters
    ----------
    Xtr, Ytr
        Training data; sliced along the first dimension into mini-batches.
    Xts, Yts
        Test data, forwarded to `pred_reporting`.
    model
        Objective whose attributes are overwritten for each grid point.
    grid_spec
        Grid points to evaluate; they are updated in place.
    minibatch
        Mini-batch size. `None` or a non-positive value uses the whole
        training set as a single batch.
    err_fn
        Error function forwarded to `pred_reporting`.
    cuda
        If true, all four data tensors are moved to the GPU first.

    Returns
    -------
    The same `grid_spec` list, with `results` filled in.
    """
    if cuda:
        Xtr = Xtr.cuda()
        Ytr = Ytr.cuda()
        Xts = Xts.cuda()
        Yts = Yts.cuda()

    print(f"Starting grid-search on model {model}.")
    print(f"Will run for {len(grid_spec)} points.")

    num_train = Xtr.shape[0]
    if minibatch is None or minibatch <= 0:
        minibatch = num_train

    cum_time = 0
    for i, grid_point in enumerate(grid_spec):
        e_start = time.time()
        set_grid_point(model, grid_point)
        losses = [0.0] * len(model.loss_names)
        for mb_start in range(0, num_train, minibatch):
            mb_end = mb_start + minibatch
            mb_losses = model.hp_loss(Xtr[mb_start:mb_end, :], Ytr[mb_start:mb_end, :])
            for lidx, mb_loss in enumerate(mb_losses):
                losses[lidx] += get_scalar(mb_loss)
        # Only the loss evaluation is timed; reporting below is excluded.
        cum_time += time.time() - e_start
        grid_point.results = pred_reporting(
            model=model,
            Xtr=Xtr,
            Ytr=Ytr,
            Xts=Xts,
            Yts=Yts,
            resolve_model=True,
            err_fn=err_fn,
            epoch=i,
            cum_time=cum_time,
            mb_size=minibatch,
        )
        if not model.losses_are_grads:
            grid_point.results.update(report_losses(losses, model.loss_names, i))
    return grid_spec
helper function to initialize one of several 3 | hyperparameter optimization objectives. 4 | """ 5 | from typing import Dict, Optional 6 | 7 | import torch 8 | 9 | import falkon.kernels 10 | from falkon import FalkonOptions 11 | from falkon.hopt.objectives import GCV, LOOCV, SGPR, CompReg, HoldOut, NystromCompReg, StochasticNystromCompReg 12 | 13 | 14 | def init_model( 15 | model_type: str, 16 | data: Dict[str, torch.Tensor], 17 | kernel: falkon.kernels.DiffKernel, 18 | penalty_init: torch.Tensor, 19 | centers_init: torch.Tensor, 20 | opt_penalty: bool, 21 | opt_centers: bool, 22 | cuda: bool, 23 | val_pct: Optional[float], 24 | per_iter_split: Optional[bool], 25 | cg_tol: Optional[float], 26 | num_trace_vecs: Optional[int], 27 | flk_maxiter: Optional[int], 28 | ): 29 | flk_opt = FalkonOptions( 30 | cg_tolerance=cg_tol, 31 | use_cpu=not torch.cuda.is_available(), 32 | cg_full_gradient_every=10, 33 | cg_epsilon_32=1e-6, 34 | cg_differential_convergence=True, 35 | ) 36 | 37 | if model_type == "sgpr": 38 | model = SGPR( 39 | kernel=kernel, 40 | penalty_init=penalty_init, 41 | centers_init=centers_init, 42 | opt_penalty=opt_penalty, 43 | opt_centers=opt_centers, 44 | ) 45 | elif model_type == "gcv": 46 | model = GCV( 47 | kernel=kernel, 48 | penalty_init=penalty_init, 49 | centers_init=centers_init, 50 | opt_penalty=opt_penalty, 51 | opt_centers=opt_centers, 52 | ) 53 | elif model_type == "loocv": 54 | model = LOOCV( 55 | kernel=kernel, 56 | penalty_init=penalty_init, 57 | centers_init=centers_init, 58 | opt_penalty=opt_penalty, 59 | opt_centers=opt_centers, 60 | ) 61 | elif model_type == "holdout": 62 | if val_pct is None: 63 | raise ValueError("val_pct must be specified for model_type='holdout'") 64 | if val_pct <= 0 or val_pct >= 100: 65 | raise RuntimeError("val_pct must be between 1 and 99") 66 | model = HoldOut( 67 | kernel=kernel, 68 | penalty_init=penalty_init, 69 | centers_init=centers_init, 70 | opt_centers=opt_centers, 71 | opt_penalty=opt_penalty, 72 | 
def get_scalar(t: Union[torch.Tensor, float]) -> float:
    """Extract a plain Python number from `t`.

    Parameters
    ----------
    t
        Either a tensor or an already-scalar Python value.

    Returns
    -------
    For a 0-dimensional tensor, its value. For any other tensor, the value of
    its first element in flattened order. Non-tensor inputs are returned
    unchanged.
    """
    if not isinstance(t, torch.Tensor):
        return t
    # `Tensor.item()` already yields an independent Python number (also for
    # tensors that require grad), so no detach / host copy / deepcopy is needed.
    if t.dim() == 0:
        return t.item()
    return torch.flatten(t)[0].item()
.precomputed_kernel import PrecomputedKernel 7 | 8 | __all__ = ( 9 | "Kernel", 10 | "DiffKernel", 11 | "KeopsKernelMixin", 12 | "GaussianKernel", 13 | "LaplacianKernel", 14 | "MaternKernel", 15 | "LinearKernel", 16 | "PolynomialKernel", 17 | "SigmoidKernel", 18 | "PrecomputedKernel", 19 | ) 20 | -------------------------------------------------------------------------------- /falkon/kernels/precomputed_kernel.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import torch 4 | from torch import Tensor 5 | 6 | from falkon.kernels import Kernel 7 | from falkon.mmv_ops.fmmv_incore import incore_fdmmv, incore_fmmv 8 | from falkon.options import FalkonOptions 9 | from falkon.sparse import SparseTensor 10 | from falkon.utils.helpers import check_same_dtype 11 | 12 | 13 | class PrecomputedKernel(Kernel): 14 | def __init__(self, k: Tensor, opt: Optional[FalkonOptions] = None): 15 | super().__init__("precomputed", opt) 16 | self.k = k 17 | 18 | def compute(self, X1: Tensor, X2: Tensor, out: Tensor, diag: bool, **kwargs) -> Tensor: 19 | raise NotImplementedError() 20 | 21 | def compute_sparse(self, X1: SparseTensor, X2: SparseTensor, out: Tensor, diag: bool, **kwargs) -> Tensor: 22 | raise NotImplementedError() 23 | 24 | def _decide_mmv_impl(self, X1, X2, v, opt): 25 | return self.mmv_impl 26 | 27 | def mmv_impl(self, X1, X2, v, out, opt, **kwargs) -> Tensor: 28 | # decide whether we must transpose based on shapes of X1, X2. 
No error checking here 29 | if self.k.shape[1] == v.shape[0]: 30 | transpose = False 31 | else: 32 | transpose = True 33 | return incore_fmmv(self.k, v, out, transpose=transpose, opt=opt) 34 | 35 | def _decide_dmmv_impl(self, X1, X2, v, w, opt): 36 | return self.dmmv_impl 37 | 38 | def dmmv_impl(self, v, w, out, opt, **kwargs) -> Tensor: 39 | return incore_fdmmv(self.k, v, w, out, opt=opt) 40 | 41 | def _decide_mm_impl(self, X1, X2, diag, opt): 42 | return self.mm_impl 43 | 44 | def mm_impl(self, out: Optional[Tensor], diag: bool, **kwargs) -> Tensor: 45 | k = self.k 46 | if diag: 47 | k = torch.diagonal(k) 48 | if out is not None: 49 | return out.copy_(k) 50 | return k 51 | 52 | @staticmethod 53 | def _check_device_properties(*args, fn_name: str, opt: FalkonOptions): 54 | pass 55 | 56 | @staticmethod 57 | def _check_mm_dimensions(X1: torch.Tensor, X2: torch.Tensor, diag: bool, out: Optional[torch.Tensor]): 58 | return X1, X2, out 59 | 60 | @staticmethod 61 | def _check_mmv_dimensions(X1: torch.Tensor, X2: torch.Tensor, v: torch.Tensor, out: Optional[torch.Tensor]): 62 | if v.dim() == 1: 63 | v = v.reshape((-1, 1)) 64 | if v.dim() != 2: 65 | raise ValueError(f"v must be a vector or a 2D matrix. Found {len(v.shape)}D.") 66 | 67 | if not check_same_dtype(v, out): 68 | raise TypeError("Data types of input matrices must be equal.") 69 | 70 | return X1, X2, v, out 71 | 72 | @staticmethod 73 | def _check_dmmv_dimensions(X1, X2, v, w, out): 74 | # Parameter validation 75 | if v is None and w is None: 76 | raise ValueError("One of v and w must be specified to run fdMMV.") 77 | 78 | if v is not None and v.dim() == 1: 79 | v = v.reshape((-1, 1)) 80 | if v is not None and v.dim() != 2: 81 | raise ValueError(f"v must be a vector or a 2D matrix. Found {len(v.shape)}D.") 82 | if w is not None and w.dim() == 1: 83 | w = w.reshape((-1, 1)) 84 | if w is not None and w.dim() != 2: 85 | raise ValueError(f"w must be a vector or a 2D matrix. 
def cpu_trsm(A: np.ndarray, v: np.ndarray, alpha: float, lower: int, transpose: int) -> np.ndarray:
    """Run the CPU (BLAS) version of TRSM on numpy arrays.

    Parameters
    ----------
    A
        Triangular coefficient matrix. Its dtype selects between the double
        and single precision BLAS routine.
    v
        Right-hand side. It is never modified: the routine works on a copy.
    alpha
        Scalar multiplier forwarded to the BLAS routine.
    lower
        Forwarded as the `lower` flag of the BLAS routine.
    transpose
        Forwarded as the `trans_a` flag of the BLAS routine.

    Returns
    -------
    The solution, with the same shape as `v`. It is C-ordered whenever `v` is
    not F-contiguous, and F-ordered otherwise.
    """
    trsm_fn = choose_fn(A.dtype, sclb.dtrsm, sclb.strsm, "TRSM")
    # BLAS operates on column-major data, so work on a Fortran-ordered copy of v.
    vF = np.copy(v, order="F")
    # Keep the array returned by the wrapper instead of relying on `vF` having
    # been overwritten: `overwrite_b` merely allows in-place operation, and the
    # result would be lost whenever the wrapper works on an internal copy
    # (presumably e.g. when the dtypes of A and v differ -- TODO confirm).
    sol = trsm_fn(alpha, A, vF, side=0, lower=lower, trans_a=transpose, overwrite_b=1)
    if sol.shape != v.shape:
        # NOTE(review): guards against the wrapper changing the rank of a 1-D
        # right-hand side; the returned shape always matches the input.
        sol = sol.reshape(v.shape)
    if not v.flags.f_contiguous:
        sol = np.copy(sol, order="C")
    return sol
def incore_fmmv(
    mat: torch.Tensor,
    vec: torch.Tensor,
    out: Optional[torch.Tensor] = None,
    transpose: bool = False,
    opt: Optional[FalkonOptions] = None,
) -> torch.Tensor:
    """Multiply an in-memory matrix (or its transpose) with `vec`.

    Parameters
    ----------
    mat
        Left operand.
    vec
        Right operand; its second dimension gives the number of output columns.
    out
        Optional output buffer. When omitted a zero-filled buffer is allocated
        with `create_same_stride`, using `mat` as the stride/dtype/device template.
    transpose
        If true compute `mat.T @ vec`, otherwise `mat @ vec`.
    opt
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    The output buffer holding the product.

    Raises
    ------
    TypeError
        If `check_same_dtype` rejects the inputs.
    RuntimeError
        If `check_same_device` rejects the inputs.
    """
    if not check_same_dtype(mat, vec, out):
        raise TypeError("Data types of input matrices must be equal.")
    if not check_same_device(mat, vec, out):
        raise RuntimeError("All input arguments to incore_fmmv must be on the same device")

    lhs = mat.T if transpose else mat

    if out is None:
        out_shape = (lhs.shape[0], vec.shape[1])
        out = create_same_stride(out_shape, mat, mat.dtype, device=mat.device, pin_memory=False)
        out.fill_(0.0)

    # beta=0 discards whatever `out` held before, so the result is exactly lhs @ vec.
    out.addmm_(lhs, vec, beta=0.0)
    return out
def calc_block_sizes3(
    max_block_size: int, num_devices: int, num_rows: int, preferred_block_size: int = 7000
) -> List[int]:
    """Split `num_rows` rows into blocks of nearly equal size.

    Parameters
    ----------
    max_block_size
        Upper bound used to cap the preferred block size.
    num_devices
        Number of devices; when possible the number of blocks is rounded up to
        a multiple of it so that blocks distribute evenly.
    num_rows
        Total number of rows to split.
    preferred_block_size
        Target block size before capping by `max_block_size`. Defaults to 7000.

    Returns
    -------
    The list of block sizes, summing to `num_rows`. Sizes differ by at most one
    and the larger blocks come first.
    """
    # Shortcircuit small matrices: a single block on one GPU.
    if num_rows < 1024 and num_rows <= max_block_size:
        return [num_rows]
    # If we have a very small block size, we don't want any block to be larger than it.
    preferred_block_size = min(preferred_block_size, max_block_size)

    # Exact integer ceil-division (no round trip through floating point).
    num_blocks = -(-num_rows // preferred_block_size)

    # Ensure an even distribution of blocks between GPUs.
    remainder = num_blocks % num_devices
    if remainder != 0 and num_blocks < num_rows:
        added_blocks = num_devices - remainder
        # Never end up with num_blocks > num_rows, which would create blocks of size 0.
        if num_blocks + added_blocks <= num_rows:
            num_blocks += added_blocks

    block_size, extras = divmod(num_rows, num_blocks)
    # The first `extras` blocks absorb the remainder, one extra row each.
    return extras * [block_size + 1] + (num_blocks - extras) * [block_size]
class Preconditioner(ABC):
    """Generic preconditioner class, used to accelerate solutions to linear systems.

    Consider a system of equations :math:`H\\beta = Y`, where :math:`H` typically contains in some
    form our data matrix `X` and `Y` contains the targets. We can use matrix :math:`B` to
    create an equivalent linear system which will have lower condition number:

    .. math::

        BB^\\top H \\beta = Y

    where :math:`BB^\\top \\approx H^{-1}` in order to make the preconditioner effective, but not
    too expensive to compute. Then, in order to use the preconditioner in an algorithm based
    on matrix-vector products (such as conjugate gradient descent), we must be able to "apply" the
    matrix :math:`B` and its transpose :math:`B^\\top` to any vector.

    For this reason, this class exposes abstract methods `apply` and `apply_t` which should
    be overridden in concrete preconditioner implementations.

    See Also
    --------
    :class:`falkon.preconditioner.FalkonPreconditioner` :
        for an actual preconditioner implementation
    """

    @abstractmethod
    def apply(self, v):
        """Apply the preconditioner matrix :math:`B` to `v`."""
        pass

    @abstractmethod
    def apply_t(self, v):
        """Apply the transposed preconditioner matrix :math:`B^\\top` to `v`."""
        pass
def random_sparse(m, n, density=0.01, sparse_format="coo", dtype=None, seed=None, data_rvs=None):
    """Generate a random sparse matrix of shape `(m, n)`.

    Parameters
    ----------
    m, n
        Number of rows and columns.
    density
        Fraction of entries to fill; `int(density * m * n)` positions are drawn
        without replacement.
    sparse_format
        Format passed to `asformat` on the resulting COO matrix.
    dtype
        Data type of the values (anything accepted by `np.dtype`).
    seed
        Seed used both for drawing the positions and, when `data_rvs` is not
        given, for drawing the values.
    data_rvs
        Callable taking the number of values and returning them. Defaults to
        the `rand` method of a `np.random.RandomState` seeded with `seed`.

    Returns
    -------
    A scipy sparse matrix in the requested format.
    """
    # noinspection PyArgumentList
    dtype = np.dtype(dtype)
    mn = m * n
    # Use the narrowest index type able to address every position.
    tp = np.intc
    if mn > np.iinfo(tp).max:
        tp = np.int64

    # Number of non zero values
    k = int(density * m * n)

    if data_rvs is None:
        data_rvs = np.random.RandomState(seed).rand

    generator = np.random.default_rng(seed)
    ind = generator.choice(mn, size=k, replace=False)

    # Decode flat positions column-major (ind = j * m + i) with exact integer
    # arithmetic; float division loses precision for very large indices.
    col, row = np.divmod(ind, m)
    j = col.astype(tp, copy=False)
    i = row.astype(tp, copy=False)
    # noinspection PyArgumentList
    vals = data_rvs(k).astype(dtype, copy=False)
    return scipy.sparse.coo_matrix((vals, (i, j)), shape=(m, n)).asformat(sparse_format, copy=False)
torch.exp(-pairwise_dists) 53 | elif nu == 1.5: 54 | K = pairwise_dists * math.sqrt(3) 55 | K = (1.0 + K) * torch.exp(-K) 56 | elif nu == 2.5: 57 | K = pairwise_dists * math.sqrt(5) 58 | K = (1.0 + K + K**2 / 3.0) * torch.exp(-K) 59 | elif nu == np.inf: 60 | K = torch.exp(-(pairwise_dists**2) / 2.0) 61 | return K 62 | 63 | 64 | def naive_gaussian_kernel(X1, X2, sigma): 65 | pairwise_dists = cdist(X1, X2, "sqeuclidean") 66 | return np.exp(-pairwise_dists / (2 * sigma**2)) 67 | 68 | 69 | def naive_laplacian_kernel(X1, X2, sigma): 70 | pairwise_dists = cdist(X1, X2, "euclidean") 71 | return np.exp(-pairwise_dists / sigma) 72 | 73 | 74 | def naive_linear_kernel(X1, X2, beta, gamma): 75 | return beta + gamma * X1 @ X2.T 76 | 77 | 78 | def naive_sigmoid_kernel(X1, X2, alpha, beta): 79 | out = X1 @ X2.T 80 | return np.tanh(out * alpha + beta) 81 | 82 | 83 | def naive_polynomial_kernel(X1, X2, alpha, beta, degree): 84 | out = X1 @ X2.T 85 | return np.power(out * alpha + beta, degree) 86 | 87 | 88 | def naive_matern_kernel(X1, X2, sigma, nu): 89 | pairwise_dists = cdist(X1 / sigma, X2 / sigma, "euclidean") 90 | 91 | if nu == 0.5: 92 | K = np.exp(-pairwise_dists) 93 | elif nu == 1.5: 94 | K = pairwise_dists * math.sqrt(3) 95 | K = (1.0 + K) * np.exp(-K) 96 | elif nu == 2.5: 97 | K = pairwise_dists * math.sqrt(5) 98 | K = (1.0 + K + K**2 / 3.0) * np.exp(-K) 99 | elif nu == np.inf: 100 | K = np.exp(-(pairwise_dists**2) / 2.0) 101 | return K 102 | -------------------------------------------------------------------------------- /falkon/tests/test_gsc_losses.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | import torch 4 | 5 | from falkon import kernels 6 | from falkon.gsc_losses import LogisticLoss, WeightedCrossEntropyLoss 7 | 8 | 9 | def naive_logistic_loss(true, pred): 10 | return torch.log(1 + torch.exp(-true * pred)) 11 | 12 | 13 | def naive_bce(true, pred, weight): 14 | return -(true * 
torch.log(torch.sigmoid(pred)) + weight * (1 - true) * torch.log(1 - torch.sigmoid(pred))) 15 | 16 | 17 | def derivative_test(diff_fn, loss, pred, true): 18 | o_true, o_pred = true.clone(), pred.clone() 19 | exp = diff_fn(true, pred) 20 | 21 | exp_d = [ 22 | torch.autograd.grad(exp[i], pred, retain_graph=True, create_graph=True)[0][i] for i in range(pred.shape[0]) 23 | ] 24 | exp_dd = [torch.autograd.grad(exp_d[i], pred, retain_graph=True)[0][i] for i in range(pred.shape[0])] 25 | 26 | pred = pred.detach() 27 | np.testing.assert_allclose(exp.detach().numpy(), loss(true, pred).detach().numpy()) 28 | np.testing.assert_allclose(o_true.detach(), true.detach()) 29 | np.testing.assert_allclose(o_pred.detach(), pred.detach()) 30 | np.testing.assert_allclose([e.item() for e in exp_d], loss.df(true, pred).detach().numpy()) 31 | np.testing.assert_allclose(o_true.detach(), true.detach()) 32 | np.testing.assert_allclose(o_pred.detach(), pred.detach()) 33 | np.testing.assert_allclose([e.item() for e in exp_dd], loss.ddf(true, pred).detach().numpy()) 34 | np.testing.assert_allclose(o_true.detach(), true.detach()) 35 | np.testing.assert_allclose(o_pred.detach(), pred.detach()) 36 | 37 | 38 | def test_logistic_loss_derivative(): 39 | kernel = kernels.GaussianKernel(3) 40 | pred = torch.linspace(-10, 10, 10, dtype=torch.float64).requires_grad_() 41 | true = torch.bernoulli(torch.ones_like(pred, dtype=torch.float64) / 2) * 2 - 1 # -1, +1 random values 42 | log_loss = LogisticLoss(kernel) 43 | 44 | derivative_test(naive_logistic_loss, log_loss, pred, true) 45 | 46 | 47 | def test_bce_derivative(): 48 | kernel = kernels.GaussianKernel(3) 49 | pred = torch.linspace(-10, 10, 10, dtype=torch.float64).requires_grad_() 50 | true = torch.bernoulli(torch.ones_like(pred, dtype=torch.float64) / 2) # 0, +1 random values 51 | 52 | neg_weight = 1 53 | wbce_loss = WeightedCrossEntropyLoss(kernel, neg_weight=neg_weight) 54 | 55 | derivative_test(lambda t, p: naive_bce(t, p, neg_weight), wbce_loss, 
pred, true) 56 | 57 | 58 | def test_weighted_bce_derivative(): 59 | kernel = kernels.GaussianKernel(3) 60 | pred = torch.linspace(-10, 10, 10, dtype=torch.float64).requires_grad_() 61 | true = torch.bernoulli(torch.ones_like(pred, dtype=torch.float64) / 2) # 0, +1 random values 62 | 63 | neg_weight = 0.5 64 | wbce_loss = WeightedCrossEntropyLoss(kernel, neg_weight=neg_weight) 65 | 66 | derivative_test(lambda t, p: naive_bce(t, p, neg_weight), wbce_loss, pred, true) 67 | 68 | 69 | if __name__ == "__main__": 70 | pytest.main() 71 | -------------------------------------------------------------------------------- /falkon/tests/test_logistic_falkon.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | import torch 4 | from sklearn import datasets 5 | 6 | from falkon import kernels 7 | from falkon.gsc_losses import LogisticLoss 8 | from falkon.models.logistic_falkon import LogisticFalkon 9 | from falkon.options import FalkonOptions 10 | 11 | 12 | @pytest.fixture 13 | def data(): 14 | X, Y = datasets.make_classification(1000, 10, n_classes=2, random_state=11) 15 | X = X.astype(np.float64) 16 | Y = Y.astype(np.float64).reshape(-1, 1) 17 | Y[Y == 0] = -1 18 | return torch.from_numpy(X), torch.from_numpy(Y) 19 | 20 | 21 | class TestLogisticFalkon: 22 | def test_simple(self, data): 23 | X, Y = data 24 | kernel = kernels.GaussianKernel(3.0) 25 | loss = LogisticLoss(kernel=kernel) 26 | 27 | def error_fn(t, p): 28 | return float(100 * torch.sum(t * p <= 0)) / t.shape[0], "c-err" 29 | 30 | opt = FalkonOptions(use_cpu=True, keops_active="no", debug=True) 31 | 32 | logflk = LogisticFalkon( 33 | kernel=kernel, 34 | loss=loss, 35 | penalty_list=[1e-1, 1e-3, 1e-5, 1e-8, 1e-8, 1e-8, 1e-8, 1e-8], 36 | iter_list=[3, 3, 3, 3, 8, 8, 8, 8], 37 | M=500, 38 | seed=10, 39 | options=opt, 40 | error_fn=error_fn, 41 | ) 42 | logflk.fit(X, Y) 43 | preds = logflk.predict(X) 44 | err = error_fn(preds, Y)[0] 45 | assert err < 
0.1 46 | -------------------------------------------------------------------------------- /falkon/tests/test_trsm_wrapper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from scipy.linalg import blas as sclb 4 | 5 | from falkon.la_helpers import trsm 6 | from falkon.tests.conftest import fix_mat 7 | from falkon.tests.gen_random import gen_random 8 | from falkon.utils import decide_cuda 9 | from falkon.utils.tensor_helpers import move_tensor 10 | 11 | M = 50 12 | T = 30 13 | 14 | 15 | @pytest.fixture(scope="module") 16 | def mat(): 17 | return gen_random(M, M, "float64", F=True, seed=10) 18 | 19 | 20 | @pytest.fixture(scope="module") 21 | def arr(): 22 | return gen_random(M, T, "float64", F=True, seed=12) 23 | 24 | 25 | @pytest.mark.parametrize("order", ["C", "F"]) 26 | @pytest.mark.parametrize("dtype", [np.float32, pytest.param(np.float64, marks=pytest.mark.full())]) 27 | @pytest.mark.parametrize("lower", [True, False], ids=["lower", "upper"]) 28 | @pytest.mark.parametrize("transpose", [True, False], ids=["transpose", "no_transpose"]) 29 | @pytest.mark.parametrize( 30 | "device", 31 | [ 32 | pytest.param("cpu"), 33 | pytest.param("cuda:0", marks=[pytest.mark.skipif(not decide_cuda(), reason="No GPU found.")]), 34 | ], 35 | ) 36 | def test_trsm_wrapper(mat, arr, dtype, order, device, lower, transpose): 37 | rtol = 1e-2 if dtype == np.float32 else 1e-11 38 | 39 | n_mat = move_tensor(fix_mat(mat, dtype=dtype, order=order, copy=True), device=device) 40 | n_arr = move_tensor(fix_mat(arr, dtype=dtype, order=order, copy=True), device=device) 41 | 42 | expected = sclb.dtrsm(1e-2, mat, arr, side=0, lower=lower, trans_a=transpose, overwrite_b=0) 43 | 44 | if device.startswith("cuda") and order == "C": 45 | with pytest.raises(ValueError): 46 | actual = trsm(n_arr, n_mat, alpha=1e-2, lower=lower, transpose=transpose) 47 | else: 48 | actual = trsm(n_arr, n_mat, alpha=1e-2, lower=lower, 
transpose=transpose) 49 | np.testing.assert_allclose(expected, actual.cpu().numpy(), rtol=rtol) 50 | 51 | 52 | if __name__ == "__main__": 53 | pytest.main() 54 | -------------------------------------------------------------------------------- /falkon/tests/test_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | import scipy.sparse 4 | import torch 5 | 6 | import falkon.preconditioner.pc_utils 7 | from falkon.sparse.sparse_tensor import SparseTensor 8 | from falkon.tests.gen_random import gen_random 9 | from falkon.utils.helpers import check_same_dtype, check_sparse, sizeof_dtype 10 | 11 | 12 | @pytest.mark.parametrize("F", [True, False], ids=["col-contig", "row-contig"]) 13 | def test_add_diag(F): 14 | A = torch.from_numpy(gen_random(1000, 1000, "float64", F=F, seed=10)) 15 | diag = 10**6 16 | falkon.preconditioner.pc_utils.inplace_add_diag_th(A, diag) 17 | assert torch.all((A.diagonal() > 10**5) & (A.diagonal() < 20**6)) 18 | 19 | 20 | def test_check_same_dtype_equal(): 21 | smat = scipy.sparse.csr_matrix(np.array([[0, 1], [0, 1]]).astype(np.float32)) 22 | ts = [torch.tensor(0, dtype=torch.float32), SparseTensor.from_scipy(smat), None] 23 | assert check_same_dtype(*ts) is True 24 | 25 | 26 | def test_check_same_dtype_empty(): 27 | assert check_same_dtype() is True 28 | 29 | 30 | def test_check_same_dtype_notequal(): 31 | smat32 = scipy.sparse.csr_matrix(np.array([[0, 1], [0, 1]]).astype(np.float32)) 32 | smat64 = scipy.sparse.csr_matrix(np.array([[0, 1], [0, 1]]).astype(np.float64)) 33 | ts = [ 34 | torch.tensor(0, dtype=torch.float32), 35 | torch.tensor(0, dtype=torch.float64), 36 | SparseTensor.from_scipy(smat32), 37 | ] 38 | assert check_same_dtype(*ts) is False 39 | 40 | ts = [ 41 | torch.tensor(0, dtype=torch.float32), 42 | SparseTensor.from_scipy(smat32), 43 | SparseTensor.from_scipy(smat64), 44 | ] 45 | assert check_same_dtype(*ts) is False 46 | 47 | 48 | def 
test_size_of_dtype(): 49 | assert 8 == sizeof_dtype(np.float64) 50 | assert 4 == sizeof_dtype(np.float32) 51 | with pytest.raises(TypeError): 52 | sizeof_dtype(np.int32) 53 | 54 | assert 8 == sizeof_dtype(torch.float64) 55 | assert 4 == sizeof_dtype(torch.float32) 56 | with pytest.raises(TypeError): 57 | sizeof_dtype(torch.int32) 58 | 59 | 60 | def test_check_sparse(): 61 | smat = scipy.sparse.csr_matrix(np.array([[0, 1], [0, 1]]).astype(np.float32)) 62 | st = SparseTensor.from_scipy(smat) 63 | 64 | assert [False, True] == check_sparse(torch.tensor(0), st) 65 | assert [] == check_sparse() 66 | 67 | 68 | if __name__ == "__main__": 69 | pytest.main() 70 | -------------------------------------------------------------------------------- /falkon/utils/.gitignore: -------------------------------------------------------------------------------- 1 | cyblas.html 2 | -------------------------------------------------------------------------------- /falkon/utils/__init__.py: -------------------------------------------------------------------------------- 1 | import numbers 2 | 3 | import numpy as np 4 | 5 | from .switches import decide_cuda 6 | from .threading import PropagatingThread 7 | from .tictoc import TicToc 8 | 9 | __all__ = ("PropagatingThread", "TicToc", "decide_cuda", "check_random_generator") 10 | 11 | 12 | def check_random_generator(seed): 13 | """Turn seed into a np.random.Generator instance 14 | 15 | Parameters 16 | ---------- 17 | seed : None | int | instance of Generator 18 | If seed is None, return a new Generator instance seeded with fresh, unpredictable entropy (there is no shared singleton). 19 | If seed is an int, return a new Generator instance seeded with seed. 20 | If seed is already a Generator instance, return it. 21 | Otherwise raise ValueError.
22 | """ 23 | if seed is None or seed is np.random: 24 | return np.random.default_rng() 25 | if isinstance(seed, numbers.Integral): 26 | return np.random.default_rng(seed) 27 | if isinstance(seed, np.random.Generator): 28 | return seed 29 | raise ValueError("%r cannot be used to seed a numpy.random.RandomState instance" % seed) 30 | -------------------------------------------------------------------------------- /falkon/utils/fake_queue.py: -------------------------------------------------------------------------------- 1 | class FakeQueue: 2 | def __init__(self): 3 | self.lst = [] 4 | 5 | def get(self): 6 | return self.lst.pop(0) 7 | 8 | def put(self, obj): 9 | self.lst.append(obj) 10 | 11 | def __len__(self): 12 | return len(self.lst) 13 | -------------------------------------------------------------------------------- /falkon/utils/stream_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import torch 4 | 5 | 6 | def sync_current_stream(device: Optional[torch.device] = None) -> None: 7 | # Sync current stream 8 | stream = torch.cuda.current_stream(device) 9 | stream.synchronize() 10 | 11 | 12 | def get_non_default_stream(device: Optional[torch.device] = None) -> torch.cuda.Stream: 13 | # Chooses the current stream if it's not the default stream. 14 | # If the current stream is the default stream, creates a new stream. 
15 | stream = torch.cuda.current_stream(device) 16 | # noinspection PyProtectedMember 17 | if stream._as_parameter_ == torch.cuda.default_stream(device)._as_parameter_: 18 | stream = torch.cuda.Stream(device) 19 | return stream 20 | -------------------------------------------------------------------------------- /falkon/utils/switches.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from typing import Optional 3 | 4 | import torch 5 | 6 | from falkon.options import BaseOptions, KeopsOptions 7 | 8 | 9 | def decide_cuda(opt: Optional[BaseOptions] = None): 10 | if opt is None: 11 | opt = BaseOptions() 12 | 13 | if opt.use_cpu: 14 | return False 15 | 16 | def get_error_str(name, err): 17 | e_str = ( 18 | "Failed to initialize %s library; " 19 | "falling back to CPU. Set 'use_cpu' to " 20 | "True to avoid this warning." % (name) 21 | ) 22 | if err is not None: 23 | e_str += "\nError encountered was %s" % (err) 24 | return e_str 25 | 26 | if not torch.cuda.is_available(): 27 | warnings.warn(get_error_str("CUDA", None)) 28 | return False 29 | return True 30 | 31 | 32 | def decide_keops(opt: Optional[KeopsOptions] = None): 33 | if opt is None: 34 | opt = KeopsOptions() 35 | if opt.keops_active.lower() == "no": 36 | return False 37 | if opt.keops_active.lower() == "force": 38 | return True 39 | # If not 'no' or 'force' we can choose depending on whether keops works. 40 | if not hasattr(decide_keops, "keops_works"): 41 | try: 42 | import pykeops # noqa: F401 43 | 44 | # pykeops.clean_pykeops() # just in case old build files are still present 45 | # pykeops.test_torch_bindings() 46 | decide_keops.keops_works = True 47 | except (ImportError, ModuleNotFoundError): 48 | warnings.warn( 49 | "Failed to import PyKeops library; this might lead to " 50 | "slower matrix-vector products within FALKON. Please " 51 | "install PyKeops and check it works to suppress this warning." 
52 | ) 53 | decide_keops.keops_works = False 54 | return decide_keops.keops_works 55 | -------------------------------------------------------------------------------- /falkon/utils/threading.py: -------------------------------------------------------------------------------- 1 | from threading import Thread 2 | 3 | __all__ = ("PropagatingThread",) 4 | 5 | 6 | class PropagatingThread(Thread): 7 | """Thread class which propagates exceptions to the main thread 8 | 9 | Copied from question: 10 | https://stackoverflow.com/questions/2829329/catch-a-threads-exception-in-the-caller-thread-in-python 11 | """ 12 | 13 | def run(self): 14 | self.exc = None 15 | try: 16 | self.ret = self._target(*self._args, **self._kwargs) 17 | except BaseException as e: 18 | self.exc = e 19 | 20 | def join(self, timeout=None): 21 | super().join(timeout=timeout) 22 | if self.exc: 23 | raise RuntimeError("Exception in thread %s" % self.name) from self.exc 24 | return self.ret 25 | -------------------------------------------------------------------------------- /falkon/utils/tictoc.py: -------------------------------------------------------------------------------- 1 | import multiprocessing as mpr 2 | import threading as thr 3 | import time 4 | from typing import List, Optional 5 | 6 | 7 | class Timer: 8 | def __init__(self, time_list: List[float]): 9 | self.times = time_list 10 | self.start_time: Optional[float] = None 11 | 12 | def __enter__(self): 13 | self.start_time = time.time() 14 | 15 | def __exit__(self, type, value, traceback): 16 | self.times.append(time.time() - self.start_time) 17 | 18 | 19 | class TicToc: 20 | __t_start = {} 21 | 22 | def __init__(self, title="", debug=True): 23 | self.title = title 24 | self.should_print = debug 25 | 26 | def tic(self, _print=False): 27 | mp_name = self.mp_name 28 | times = TicToc.__t_start.setdefault(mp_name, []) 29 | 30 | if _print and self.should_print: 31 | indent_level = len(times) 32 | indent_str = self._get_indent_str(indent_level) 33 | 
print(f"{indent_str}{mp_name}::[{self.title}]", flush=True) 34 | times.append(time.time()) 35 | 36 | def toc(self): 37 | mp_name = self.mp_name 38 | times = TicToc.__t_start[mp_name] 39 | 40 | t_elapsed = time.time() - times.pop() 41 | indent_level = len(times) 42 | indent_str = self._get_indent_str(indent_level) 43 | if self.should_print: 44 | print(f"{indent_str}{mp_name}::[{self.title}] complete in {t_elapsed:.3f}s", flush=True) 45 | 46 | def toc_val(self): 47 | mp_name = self.mp_name 48 | times = TicToc.__t_start.setdefault(mp_name, []) 49 | return time.time() - times.pop() 50 | 51 | @property 52 | def mp_name(self): 53 | return f"{mpr.current_process().name}.{thr.current_thread().name}" 54 | 55 | @staticmethod 56 | def _get_indent_str(level): 57 | return "--" * level 58 | 59 | def __enter__(self): 60 | self.tic(_print=True) 61 | 62 | def __exit__(self, type, value, traceback): 63 | self.toc() 64 | -------------------------------------------------------------------------------- /notebooks/CreateSmallHiggs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import datasets" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "higgs = datasets.HiggsDataset()" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 4, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "X, Y = higgs.read_data(dtype=np.float64)" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 32, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "np.random.seed(13)\n", 38 | "X_sm_tr = []\n", 39 | "Y_sm_tr = []\n", 40 | "X_sm_ts = []\n", 41 | "Y_sm_ts = []\n", 42 | "i = 0\n", 43 | "cls = 0\n", 44 | "while len(X_sm_tr) < 10_000:\n", 45 | " if Y[i] == 
cls:\n", 46 | " X_sm_tr.append(X[i])\n", 47 | " Y_sm_tr.append(Y[i])\n", 48 | " cls = 1 - cls\n", 49 | " i += 1\n", 50 | "while len(X_sm_ts) < 20_000:\n", 51 | " if Y[i] == cls:\n", 52 | " X_sm_ts.append(X[i])\n", 53 | " Y_sm_ts.append(Y[i])\n", 54 | " cls = 1 - cls\n", 55 | " i += 1\n", 56 | "X_sm_tr = np.stack(X_sm_tr, 0)\n", 57 | "Y_sm_tr = np.stack(Y_sm_tr, 0).astype(np.int32)\n", 58 | "X_sm_ts = np.stack(X_sm_ts, 0)\n", 59 | "Y_sm_ts = np.stack(Y_sm_ts, 0).astype(np.int32)\n", 60 | "centers = X_sm_tr[:1000]" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 35, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "import h5py\n", 70 | "f = h5py.File('higgs_for_ho.hdf5', 'w')\n", 71 | "dset = f.create_dataset(\"X_train\", data=X_sm_tr)\n", 72 | "dset = f.create_dataset(\"Y_train\", data=Y_sm_tr)\n", 73 | "dset = f.create_dataset(\"X_test\", data=X_sm_ts)\n", 74 | "dset = f.create_dataset(\"Y_test\", data=Y_sm_ts)\n", 75 | "dset = f.create_dataset(\"centers\", data=centers)\n", 76 | "f.close()" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 39, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "import numpy as np\n", 86 | "import h5py\n", 87 | "with h5py.File('higgs_for_ho.hdf5', 'r') as f:\n", 88 | " X_train = np.array(f['X_train'])\n", 89 | " Y_train = np.array(f['Y_train'])\n", 90 | " X_test = np.array(f['X_test'])\n", 91 | " Y_test = np.array(f['Y_test'])\n", 92 | " centers = np.array(f['centers'])" 93 | ] 94 | } 95 | ], 96 | "metadata": { 97 | "kernelspec": { 98 | "display_name": "Python 3 (ipykernel)", 99 | "language": "python", 100 | "name": "python3" 101 | }, 102 | "language_info": { 103 | "codemirror_mode": { 104 | "name": "ipython", 105 | "version": 3 106 | }, 107 | "file_extension": ".py", 108 | "mimetype": "text/x-python", 109 | "name": "python", 110 | "nbconvert_exporter": "python", 111 | "pygments_lexer": "ipython3", 112 | "version": "3.8.11" 113 | }, 114 | "toc": { 115 | 
"base_numbering": 1, 116 | "nav_menu": {}, 117 | "number_sections": true, 118 | "sideBar": true, 119 | "skip_h1_title": false, 120 | "title_cell": "Table of Contents", 121 | "title_sidebar": "Contents", 122 | "toc_cell": false, 123 | "toc_position": {}, 124 | "toc_section_display": true, 125 | "toc_window_display": false 126 | } 127 | }, 128 | "nbformat": 4, 129 | "nbformat_minor": 4 130 | } 131 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools", 4 | "wheel", 5 | "ninja", 6 | "torch", 7 | ] 8 | build-backend = "setuptools.build_meta" 9 | 10 | [tool.pytest.ini_options] 11 | addopts = "-m 'not benchmark and not full'" 12 | testpaths = [ 13 | "falkon/tests", 14 | ] 15 | markers = [ 16 | # marks tests which are only used for timing purposes (deselect with '-m "not benchmark"') 17 | "benchmark", 18 | # tests which should only be run for very exhaustive testing. Not generally useful. 19 | "full", 20 | ] 21 | 22 | [tool.coverage.run] 23 | branch = true 24 | source = ["falkon"] 25 | omit = [ 26 | "falkon/tests/*", "falkon/hopt/*", "falkon/benchmarks/*", "falkon/csrc/*", 27 | ] 28 | 29 | [tool.black] 30 | line-length = 120 31 | 32 | [tool.ruff] 33 | target-version = "py38" 34 | ignore = [ 35 | "B028", # No explicit `stacklevel` keyword argument found 36 | "SIM108", 37 | "SIM116", # Disable Use a dictionary instead of consecutive `if` statements 38 | "SIM102", "SIM103", "SIM112", # flake8-simplify code styles 39 | "SIM105", # these ignores are from flake8-simplify. 
please fix or ignore with commented reason 40 | "SIM114", # Combine `if` branches using logical `or` operator 41 | "C408", # C408 ignored because we like the dict keyword argument syntax 42 | ] 43 | line-length = 120 44 | select = [ 45 | "B", 46 | "C4", 47 | "G", 48 | "E", 49 | "F", 50 | "SIM1", 51 | "W", 52 | # Not included in flake8 53 | "UP", 54 | "PERF", 55 | "PGH004", 56 | "PIE807", 57 | "PIE810", 58 | "PLE", 59 | "TRY302", 60 | ] 61 | --------------------------------------------------------------------------------