├── .coveragerc ├── .github └── workflows │ └── test.yml ├── .gitignore ├── .travis.yml ├── .zenodo.json ├── AUTHORS ├── LICENSE ├── MANIFEST.in ├── README.rst ├── ci ├── doc.sh ├── pylintrc └── test.sh ├── cloudbuild.yml ├── copt ├── __init__.py ├── constraint.py ├── data │ └── img1.csv ├── datasets.py ├── frank_wolfe.py ├── loss.py ├── penalty.py ├── proximal_gradient.py ├── randomized.py ├── splitting.py ├── tv_prox.py ├── utils.py └── utils_pytorch.py ├── doc ├── Makefile ├── _static │ └── css │ │ └── custom.css ├── citing.rst ├── conf.py ├── index.rst ├── logo.png ├── loss_functions.rst ├── make.bat ├── paper │ ├── biblio.bib │ ├── index.tex │ └── jmlr2e.sty ├── solvers.rst ├── sphinx_ext │ └── github_link.py └── utils.rst ├── examples ├── README.txt ├── frank_wolfe │ ├── README.txt │ ├── plot_sfw.py │ ├── plot_sfw_real_data.py │ ├── plot_sparse_benchmark.py │ ├── plot_sparse_benchmark_pairwise.py │ └── plot_vertex_overlap.py ├── plot_accelerated.py ├── plot_group_lasso.py ├── plot_jax_copt.py ├── plot_saga_vs_svrg.py ├── proximal_splitting │ ├── README.txt │ ├── data │ │ └── blur_matrix.npz │ ├── plot_overlapping_group_lasso.py │ ├── plot_sparse_nuclear_norm.py │ └── plot_tv_deblurring.py └── pytorch │ ├── README.txt │ ├── adversarial_example.py │ └── adversarial_example_accuracies.py ├── pyproject.toml ├── pytest.ini ├── requirements.txt ├── setup.py └── tests ├── test_frank_wolfe.py ├── test_loss.py ├── test_matmul_speedup.py ├── test_penalties.py ├── test_proximal_gradient.py ├── test_randomized.py ├── test_splitting.py └── test_stochastic_fw.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | source = copt 4 | include = */copt/* 5 | 6 | [report] 7 | omit = */copt/datasets.py 8 | exclude_lines = 9 | if verbose 10 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test suite 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | max-parallel: 4 11 | matrix: 12 | python-version: ["3.8", "3.9", "3.10"] 13 | 14 | steps: 15 | - name: Checkout repo 16 | uses: actions/checkout@v1 17 | - name: Set up Python ${{ matrix.python-version }} 18 | uses: actions/setup-python@v1 19 | with: 20 | python-version: ${{ matrix.python-version }} 21 | - name: Install 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install . 25 | pip install pytest-parallel scikit-image coveralls coverage pytest-cov scikit-learn h5py Pillow pip-conflict-checker py 26 | - name: Check Dependencies 27 | run: | 28 | pipconflictchecker 29 | - name: Lint 30 | run: | 31 | flake8 --ignore N802,N806,W503 --select W504 `find . 
-name \*.py | grep -v setup.py | grep -v __init__.py | grep -v /doc/` 32 | - name: Test 33 | run: | 34 | pytest --cov-report term-missing --cov=copt -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | doc/_build 2 | dist/ 3 | doc/auto_examples 4 | doc/generated 5 | .coverage 6 | .vscode 7 | .mypy_cache/ 8 | copt.egg-info 9 | __pycache__ 10 | .pytest_cache/ 11 | doc/modules/ 12 | *code-workspace 13 | *.swp 14 | .DS_Store -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | - "3.7" 5 | install: 6 | - pip install --upgrade -r requirements.txt 7 | - # make sure tests run with these minimal requirements 8 | - python setup.py develop 9 | - # run _one_ test file, just to make sure everything imports fine 10 | - py.test -v copt tests/test_proximal_gradient.py 11 | - # the following are only needed for the examples and coverage tests 12 | - pip install pytest-parallel scikit-image coveralls coverage pytest-cov scikit-learn h5py Pillow 13 | - py.test --version 14 | script: 15 | - NUMBA_DISABLE_JIT=1 pytest -v --cov=copt --workers auto 16 | after_success: coveralls 17 | cache: 18 | directories: 19 | - $HOME/copt_data 20 | -------------------------------------------------------------------------------- /.zenodo.json: -------------------------------------------------------------------------------- 1 | { 2 | "creators": [{ 3 | "name": "Fabian Pedregosa", 4 | "affiliation": "Google", 5 | "orcid": "0000-0003-4025-3953" 6 | }], 7 | "description": "

copt is a library for mathematical optimization written in Python.

", 8 | "access_right": "open", 9 | "license": "BSD-3-Clause", 10 | "upload_type": "software" 11 | } 12 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Fabian Pedregosa 2 | Google LLC -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | New BSD License 2 | 3 | Copyright (c) 2007–2018 Fabian Pedregosa. 4 | All rights reserved. 5 | 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | a. Redistributions of source code must retain the above copyright notice, 11 | this list of conditions and the following disclaimer. 12 | b. Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in the 14 | documentation and/or other materials provided with the distribution. 15 | c. Neither the name of the Scikit-learn Developers nor the names of 16 | its contributors may be used to endorse or promote products 17 | derived from this software without specific prior written 18 | permission. 19 | 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 | ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 27 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 28 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 31 | DAMAGE. 32 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.rst 2 | recursive-include doc * 3 | recursive-include tests *.py 4 | recursive-include examples * 5 | recursive-include sklearn *.c *.h *.pyx *.pxd *.pxi *.tp 6 | recursive-include sklearn/datasets *.csv *.csv.gz *.rst *.jpg *.txt *.arff.gz *.json.gz 7 | include COPYING 8 | include README.rst 9 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. image:: https://travis-ci.org/openopt/copt.svg?branch=master 2 | :target: https://travis-ci.org/openopt/copt 3 | .. image:: https://coveralls.io/repos/github/openopt/copt/badge.svg?branch=master 4 | :target: https://coveralls.io/github/openopt/copt?branch=master 5 | .. image:: https://zenodo.org/badge/46262908.svg 6 | :target: https://zenodo.org/badge/latestdoi/46262908 7 | .. image:: https://storage.googleapis.com/copt-doc/doc_status.svg 8 | :target: http://openopt.github.io/copt/ 9 | .. image:: https://storage.googleapis.com/copt-doc/pylint.svg 10 | :target: https://storage.googleapis.com/copt-doc/pylint.txt 11 | 12 | 13 | Note: This package is no longer actively maintained. I won't be actively responding to issues. 
If you'd like to volunteer to maintain it, please drop me a line at f@bianp.net 14 | 15 | copt: composite optimization in Python 16 | ======================================= 17 | 18 | copt is an optimization library for Python. Its goal is to provide a high-quality implementation of classical optimization algorithms under a consistent API. 19 | 20 | 21 | 22 | `Docs `_ | `Examples `_ 23 | 24 | 25 | 26 | 27 | Installation 28 | ============ 29 | 30 | If you already have a working installation of numpy and scipy, 31 | the easiest way to install copt is using ``pip`` :: 32 | 33 | pip install -U copt 34 | 35 | 36 | Alternatively, you can install the latest development version from GitHub with the command:: 37 | 38 | pip install git+https://github.com/openopt/copt.git 39 | 40 | 41 | Citing 42 | ====== 43 | 44 | If this software is useful for your research, please consider citing it as 45 | 46 | .. code:: 47 | 48 | @article{copt, 49 | author = {Fabian Pedregosa and Geoffrey Negiar and Gideon Dresdner}, 50 | title = {copt: composite optimization in Python}, 51 | year = 2020, 52 | DOI = {10.5281/zenodo.1283339}, 53 | url={http://openopt.github.io/copt/} 54 | } 55 | 56 | Development 57 | =========== 58 | 59 | The recommended way to work on the development version is the following: 60 | 61 | 1. Clone the GitHub repo locally. This can be done with the command:: 62 | 63 | git clone https://github.com/openopt/copt.git 64 | 65 | This will create a copt directory. 66 | 67 | 2. Link this directory to your Python interpreter. This can be done by 68 | running the following command from the copt directory created with the 69 | previous step:: 70 | 71 | python setup.py develop 72 | 73 | Now you can run the tests with :code:`py.test tests/` 74 | -------------------------------------------------------------------------------- /ci/doc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | mkdir -p _build/html/ 4 | pip install -r requirements.txt 5 | pip install -U sphinx sphinx-gallery loky joblib sphinx_copybutton memory_profiler jax jaxlib anybadge numba 6 | # For pytorch examples 7 | pip install torch==1.7.0+cpu torchvision==0.8.1+cpu torchaudio==0.7.0 -f https://download.pytorch.org/whl/torch_stable.html 8 | pip install git+https://github.com/RobustBench/robustbench 9 | python setup.py install 10 | cd doc 11 | make html > doc_log.txt 12 | if [ $?
-eq 0 ]; then 13 | # set up a badge depending on the result of the build 14 | echo "Building of documentation succeeded" 15 | anybadge --label=doc --value=passing --file=_build/html/doc_status.svg passing=green failing=red 16 | else 17 | echo "Building of documentation failed" 18 | anybadge --label=doc --value=failing --file=_build/html/doc_status.svg passing=green failing=red 19 | fi 20 | -------------------------------------------------------------------------------- /ci/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | pip install -r requirements.txt 4 | pip install pytest-parallel # run tests in parallel 5 | python setup.py install 6 | # py.test --workers auto 7 | 8 | # pylint 9 | pip install pylint anybadge 10 | pylint --rcfile=ci/pylintrc --output-format=text copt tests/*.py examples/*.py examples/*/*.py | tee pylint.txt 11 | score=$(sed -n 's/^Your code has been rated at \([-0-9.]*\)\/.*/\1/p' pylint.txt) 12 | echo "Pylint score was $score" 13 | anybadge --value=$score --file=pylint.svg pylint 14 | -------------------------------------------------------------------------------- /cloudbuild.yml: -------------------------------------------------------------------------------- 1 | steps: 2 | - name: 'python:3.6' 3 | id: Test 4 | entrypoint: /bin/sh 5 | args: 6 | - -c 7 | - 'ci/test.sh' 8 | timeout: 1800s # 30 min. 9 | - name: 'python:3.6' 10 | id: Doc 11 | entrypoint: /bin/sh 12 | args: 13 | - -c 14 | - 'ci/doc.sh' 15 | timeout: 50000s 16 | waitFor: ['-'] # The '-' indicates that this step begins immediately. 17 | - name: 'gcr.io/cloud-builders/gsutil' 18 | args: ['-m', 'cp', '-r', 'doc/_build/html/*', 'gs://openo.pt/copt/'] 19 | 20 | timeout: 50000s 21 | 22 | artifacts: 23 | objects: 24 | location: 'gs://openo.pt/copt/' 25 | paths: ['doc/_build/html/doc_status.svg', 'pylint.txt', 'pylint.svg'] 26 | -------------------------------------------------------------------------------- /copt/__init__.py: -------------------------------------------------------------------------------- 1 | """COPT: composite optimization in Python.""" 2 | __version__ = "0.9.1" # if you modify this, change it also in setup.py 3 | 4 | from . import datasets 5 | from . import tv_prox 6 | from . import utils 7 | from . import loss 8 | from . import constraint 9 | from .frank_wolfe import minimize_frank_wolfe 10 | from .proximal_gradient import minimize_proximal_gradient 11 | from .randomized import minimize_saga 12 | from .randomized import minimize_svrg 13 | from .randomized import minimize_vrtos 14 | from .randomized import minimize_sfw 15 | from .splitting import minimize_primal_dual 16 | from .splitting import minimize_three_split 17 | -------------------------------------------------------------------------------- /copt/constraint.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy import ma as ma 3 | from scipy import linalg 4 | from scipy.sparse import linalg as splinalg 5 | 6 | class LinfBall: 7 | """L-infinity ball. 8 | 9 | Args: 10 | alpha: float 11 | radius of the ball. 12 | """ 13 | p = np.inf 14 | 15 | def __init__(self, alpha): 16 | self.alpha = alpha 17 | 18 | def prox(self, x, step_size=None): 19 | """Projection onto the L-infinity ball. 20 | 21 | Args: 22 | x: array-like 23 | 24 | Returns: 25 | p : array-like, same shape as x 26 | projection of x onto the L-infinity ball. 
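Example (illustrative sketch; the values below are made up for this docstring): with ``alpha=1``, the projection clips each coordinate to ``[-1, 1]``, so ``LinfBall(1.0).prox(np.array([2.0, -0.3, -5.0]))`` returns ``[1.0, -0.3, -1.0]``.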
27 | """ 28 | return x.clip(-self.alpha, self.alpha) 29 | 30 | 31 | class L2Ball: 32 | """L2 ball. 33 | 34 | Args: 35 | alpha: float 36 | radius of the ball. 37 | """ 38 | p = 2 39 | 40 | def __init__(self, alpha): 41 | self.alpha = alpha 42 | 43 | def prox(self, x, step_size=None): 44 | """Projection onto the L-2 ball. 45 | 46 | Args: 47 | x: array-like 48 | 49 | Returns: 50 | p : array-like, same shape as x 51 | projection of x onto the L-2 ball. 52 | """ 53 | 54 | norm = np.sqrt((x ** 2).sum()) 55 | if norm <= self.alpha: 56 | return x 57 | return self.alpha * x / norm 58 | 59 | 60 | class L1Ball: 61 | """Indicator function over the L1 ball 62 | 63 | This function is 0 if the sum of absolute values is less than or equal to 64 | alpha, and infinity otherwise. 65 | 66 | Args: 67 | alpha: float 68 | radius of the ball. 69 | """ 70 | p = 1 71 | 72 | def __init__(self, alpha): 73 | self.alpha = alpha 74 | 75 | def __call__(self, x): 76 | if np.abs(x).sum() <= self.alpha: 77 | return 0 78 | else: 79 | return np.inf 80 | 81 | def prox(self, x, step_size=None): 82 | """Projection onto the L-infinity ball. 83 | 84 | Parameters 85 | ---------- 86 | x: array-like 87 | 88 | Returns 89 | ------- 90 | p : array-like, same shape as x 91 | projection of x onto the L-infinity ball. 92 | """ 93 | return euclidean_proj_l1ball(x, self.alpha) 94 | 95 | def lmo(self, u, x, active_set=None): 96 | """Linear Minimization Oracle. 97 | 98 | Return s - x with s solving the linear problem 99 | max_{||s||_1 <= alpha} 100 | 101 | Args: 102 | u: array-like 103 | usually -gradient 104 | x: array-like 105 | usually the iterate of the considered algorithm 106 | active_set: no effect here. 107 | 108 | Returns: 109 | update_direction: array, 110 | s - x, where s is the vertex of the constraint most correlated 111 | with u 112 | fw_vertex_rep: (float, int) 113 | a hashable representation of s, for active set management 114 | None: not used here 115 | max_step_size: float 116 | 1. for a Frank-Wolfe step. 117 | """ 118 | abs_u = np.abs(u) 119 | largest_coordinate = np.argmax(abs_u) 120 | sign = np.sign(u[largest_coordinate]) 121 | 122 | update_direction = -x.copy() 123 | update_direction[largest_coordinate] += self.alpha * sign 124 | 125 | # Only useful for active_set management in pairwise FW 126 | fw_vertex_rep = (sign, largest_coordinate) 127 | max_step_size = 1. 128 | return update_direction, fw_vertex_rep, None, max_step_size 129 | 130 | def lmo_pairwise(self, u, x, active_set): 131 | """Pairwise Linear Minimization Oracle. 132 | 133 | Return s - v with s solving the linear problem 134 | max_{||s||_1 <= alpha} 135 | and v solving the linear problem 136 | min_{v \in active_set} 137 | 138 | Args: 139 | u: array, 140 | usually -gradient 141 | x: array, 142 | usually the iterate of the considered algorithm 143 | active_set: used to compute v 144 | 145 | Returns: 146 | update_direction: array 147 | s - v, where s is the vertex of the constraint most correlated with u 148 | and v is the vertex of the active set least correlated with u 149 | fw_vertex_rep: (float, int) 150 | a hashable representation of s, for active set management 151 | away_vertex_rep: (float, int) 152 | a hashable representation of v, for active set management 153 | max_step_size: float 154 | max_step_size to not move out of the constraint. Given by active_set[away_vertex_rep]. 
155 | """ 156 | update_direction, fw_vertex_rep, _, _ = self.lmo(u, x) 157 | update_direction += x 158 | 159 | def _correlation(vertex_rep, u): 160 | """Compute the correlation between vertex represented by vertex_rep and vector u.""" 161 | sign, idx = vertex_rep 162 | return sign * u[idx] 163 | 164 | away_vertex_rep, max_step_size = min(active_set.items(), 165 | key=lambda item: _correlation(item[0], u)) 166 | 167 | sign, idx = away_vertex_rep 168 | update_direction[idx] -= sign * self.alpha 169 | return update_direction, fw_vertex_rep, away_vertex_rep, max_step_size 170 | 171 | 172 | class SimplexConstraint: 173 | def __init__(self, s=1): 174 | self.s = s 175 | 176 | def prox(self, x, step_size): 177 | return euclidean_proj_simplex(x, self.s) 178 | 179 | def lmo(self, u, x): 180 | """Return v - x, s solving the linear problem 181 | max_{||v||_1 <= s, v >= 0} 182 | """ 183 | largest_coordinate = np.argmax(u) 184 | 185 | update_direction = -x.copy() 186 | update_direction[largest_coordinate] += self.s * np.sign( 187 | u[largest_coordinate] 188 | ) 189 | 190 | return update_direction, int(largest_coordinate), None, 1 191 | 192 | def euclidean_proj_simplex(v, s=1.0): 193 | r""" Compute the Euclidean projection on a positive simplex 194 | 195 | Solves the optimization problem (using the algorithm from [1]): 196 | min_w 0.5 * || w - v ||_2^2 , s.t. \sum_i w_i = s, w_i >= 0 197 | 198 | Args: 199 | v: (n,) numpy array, 200 | n-dimensional vector to project 201 | s: float, optional, default: 1, 202 | radius of the simplex 203 | 204 | Returns: 205 | w: (n,) numpy array, 206 | Euclidean projection of v on the simplex 207 | 208 | Notes: 209 | The complexity of this algorithm is in O(n log(n)) as it involves sorting v. 210 | Better alternatives exist for high-dimensional sparse vectors (cf. [1]) 211 | However, this implementation still easily scales to millions of dimensions. 212 | 213 | References: 214 | [1] Efficient Projections onto the .1-Ball for Learning in High Dimensions 215 | John Duchi, Shai Shalev-Shwartz, Yoram Singer, and Tushar Chandra. 216 | International Conference on Machine Learning (ICML 2008) 217 | http://www.cs.berkeley.edu/~jduchi/projects/DuchiSiShCh08.pdf 218 | """ 219 | assert s > 0, "Radius s must be strictly positive (%d <= 0)" % s 220 | (n,) = v.shape # will raise ValueError if v is not 1-D 221 | # check if we are already on the simplex 222 | if v.sum() == s and np.alltrue(v >= 0): 223 | # best projection: itself! 224 | return v 225 | # get the array of cumulative sums of a sorted (decreasing) copy of v 226 | u = np.sort(v)[::-1] 227 | cssv = np.cumsum(u) 228 | # get the number of > 0 components of the optimal solution 229 | rho = np.nonzero(u * np.arange(1, n + 1) > (cssv - s))[0][-1] 230 | # compute the Lagrange multiplier associated to the simplex constraint 231 | theta = (cssv[rho] - s) / (rho + 1.0) 232 | # compute the projection by thresholding v using theta 233 | w = (v - theta).clip(min=0) 234 | return w 235 | 236 | 237 | def euclidean_proj_l1ball(v, s=1): 238 | """ Compute the Euclidean projection on a L1-ball 239 | 240 | Solves the optimisation problem (using the algorithm from [1]): 241 | min_w 0.5 * || w - v ||_2^2 , s.t. 
|| w ||_1 <= s 242 | 243 | Args: 244 | v: (n,) numpy array, 245 | n-dimensional vector to project 246 | s: float, optional, default: 1, 247 | radius of the L1-ball 248 | 249 | Returns: 250 | w: (n,) numpy array, 251 | Euclidean projection of v on the L1-ball of radius s 252 | 253 | Notes: 254 | Solves the problem by a reduction to the positive simplex case 255 | See also :ref:`euclidean_proj_simplex` 256 | """ 257 | assert s > 0, "Radius s must be strictly positive (%d <= 0)" % s 258 | if len(v.shape) > 1: 259 | raise ValueError 260 | # compute the vector of absolute values 261 | u = np.abs(v) 262 | # check if v is already a solution 263 | if u.sum() <= s: 264 | # L1-norm is <= s 265 | return v 266 | # v is not already a solution: optimum lies on the boundary (norm == s) 267 | # project *u* on the simplex 268 | w = euclidean_proj_simplex(u, s=s) 269 | # compute the solution to the original problem on v 270 | w *= np.sign(v) 271 | return w 272 | 273 | 274 | class TraceBall: 275 | """Projection onto the trace (aka nuclear) norm, sum of singular values 276 | 277 | Args: 278 | alpha: float 279 | radius of the ball. 280 | 281 | """ 282 | 283 | is_separable = False 284 | 285 | def __init__(self, alpha, shape): 286 | assert len(shape) == 2 287 | self.shape = shape 288 | self.alpha = alpha 289 | 290 | def __call__(self, x): 291 | X = x.reshape(self.shape) 292 | if linalg.svdvals(X).sum() <= self.alpha + np.finfo(np.float32).eps: 293 | return 0 294 | else: 295 | return np.inf 296 | 297 | def prox(self, x, step_size): 298 | X = x.reshape(self.shape) 299 | U, s, Vt = linalg.svd(X, full_matrices=False) 300 | s_threshold = euclidean_proj_l1ball(s, self.alpha) 301 | return (U * s_threshold).dot(Vt).ravel() 302 | 303 | def prox_factory(self): 304 | raise NotImplementedError 305 | 306 | def lmo(self, u, x, active_set=None): 307 | """Linear Minimization Oracle. 308 | 309 | Return s - x with s solving the linear problem 310 | max_{||s||_nuc <= alpha} 311 | 312 | Args: 313 | u: usually -gradient 314 | x: usually the iterate of the considered algorithm 315 | active_set: no effect here. 316 | 317 | Returns: 318 | update_direction: s - x, where s is the vertex of the constraint most correlated with u 319 | None: not used here 320 | None: not used here 321 | max_step_size: 1. for a Frank-Wolfe step. 322 | """ 323 | u_mat = u.reshape(self.shape) 324 | ut, _, vt = splinalg.svds(u_mat, k=1) 325 | vertex = self.alpha * np.outer(ut, vt).ravel() 326 | return vertex - x, None, None, 1. 327 | -------------------------------------------------------------------------------- /copt/frank_wolfe.py: -------------------------------------------------------------------------------- 1 | """Frank-Wolfe and related algorithms.""" 2 | import warnings 3 | from collections import defaultdict 4 | import numpy as np 5 | from scipy import linalg 6 | from scipy import optimize 7 | from copt import utils 8 | 9 | 10 | EPS = np.finfo(np.float32).eps 11 | 12 | 13 | def backtracking_step_size( 14 | x, 15 | f_t, 16 | old_f_t, 17 | f_grad, 18 | certificate, 19 | lipschitz_t, 20 | max_step_size, 21 | update_direction, 22 | norm_update_direction, 23 | ): 24 | """Backtracking step-size finding routine for FW-like algorithms 25 | 26 | Args: 27 | x: array-like, shape (n_features,) 28 | Current iterate 29 | 30 | f_t: float 31 | Value of objective function at the current iterate. 32 | 33 | old_f_t: float 34 | Value of objective function at previous iterate. 
35 | 36 | f_grad: callable 37 | Callable returning objective function and gradient at 38 | argument. 39 | 40 | certificate: float 41 | FW gap 42 | 43 | lipschitz_t: float 44 | Current value of the Lipschitz estimate. 45 | 46 | max_step_size: float 47 | Maximum admissible step-size. 48 | 49 | update_direction: array-like, shape (n_features,) 50 | Update direction given by the FW variant. 51 | 52 | norm_update_direction: float 53 | Squared L2 norm of update_direction 54 | 55 | Returns: 56 | step_size_t: float 57 | Step-size to be used to compute the next iterate. 58 | 59 | lipschitz_t: float 60 | Updated value for the Lipschitz estimate. 61 | 62 | f_next: float 63 | Objective function evaluated at x + step_size_t d_t. 64 | 65 | grad_next: array-like 66 | Gradient evaluated at x + step_size_t d_t. 67 | """ 68 | ratio_decrease = 0.9 69 | ratio_increase = 2.0 70 | max_ls_iter = 100 71 | if old_f_t is not None: 72 | tmp = (certificate ** 2) / (2 * (old_f_t - f_t) * norm_update_direction) 73 | lipschitz_t = max(min(tmp, lipschitz_t), lipschitz_t * ratio_decrease) 74 | for _ in range(max_ls_iter): 75 | step_size_t = certificate / (norm_update_direction * lipschitz_t) 76 | if step_size_t < max_step_size: 77 | rhs = -0.5 * step_size_t * certificate 78 | else: 79 | step_size_t = max_step_size 80 | rhs = ( 81 | -step_size_t * certificate 82 | + 0.5 * (step_size_t ** 2) * lipschitz_t * norm_update_direction 83 | ) 84 | f_next, grad_next = f_grad(x + step_size_t * update_direction) 85 | if f_next - f_t <= rhs + EPS: 86 | # .. sufficient decrease condition verified .. 87 | break 88 | else: 89 | lipschitz_t *= ratio_increase 90 | else: 91 | warnings.warn( 92 | "Exhausted line search iterations in minimize_frank_wolfe", RuntimeWarning 93 | ) 94 | return step_size_t, lipschitz_t, f_next, grad_next 95 | 96 | 97 | def update_active_set(active_set, 98 | fw_vertex_rep, away_vertex_rep, 99 | step_size): 100 | 101 | max_step_size = active_set[away_vertex_rep] 102 | active_set[fw_vertex_rep] += step_size 103 | active_set[away_vertex_rep] -= step_size 104 | 105 | if active_set[away_vertex_rep] == 0.: 106 | # drop step: remove vertex from active set 107 | del active_set[away_vertex_rep] 108 | if active_set[away_vertex_rep] < 0.: 109 | raise ValueError(f"The step size used is too large. " 110 | f"{step_size: .3f} vs. {max_step_size:.3f}") 111 | 112 | return active_set 113 | 114 | 115 | def minimize_frank_wolfe( 116 | fun, 117 | x0, 118 | lmo, 119 | x0_rep=None, 120 | variant='vanilla', 121 | jac="2-point", 122 | step="backtracking", 123 | lipschitz=None, 124 | args=(), 125 | max_iter=400, 126 | tol=1e-12, 127 | callback=None, 128 | verbose=0, 129 | eps=1e-8, 130 | ): 131 | r"""Frank-Wolfe algorithm. 132 | 133 | Implements the Frank-Wolfe algorithm, see , see :ref:`frank_wolfe` for 134 | a more detailed description. 135 | 136 | Args: 137 | fun : callable 138 | The objective function to be minimized. 139 | ``fun(x, *args) -> float`` 140 | where x is an 1-D array with shape (n,) and `args` 141 | is a tuple of the fixed parameters needed to completely 142 | specify the function. 143 | 144 | x0: array-like 145 | Initial guess for solution. 146 | 147 | lmo: callable 148 | Takes as input a vector u of same size as x0 and returns both the update 149 | direction and the maximum admissible step-size. 150 | 151 | x0_rep: immutable 152 | Is used to initialize the active set when variant == 'pairwise'. 153 | 154 | variant: {'vanilla, 'pairwise'} 155 | Determines which Frank-Wolfe variant to use, along with lmo. 
156 | Pairwise sets up and updates an active set of vertices. 157 | This is needed to make sure to not move out of the constraint set 158 | when using a pairwise LMO. 159 | 160 | jac : {callable, '2-point', bool}, optional 161 | Method for computing the gradient vector. If it is a callable, 162 | it should be a function that returns the gradient vector: 163 | ``jac(x, *args) -> array_like, shape (n,)`` 164 | where x is an array with shape (n,) and `args` is a tuple with 165 | the fixed parameters. Alternatively, the '2-point' select a finite 166 | difference scheme for numerical estimation of the gradient. 167 | If `jac` is a Boolean and is True, `fun` is assumed to return the 168 | gradient along with the objective function. If False, the gradient 169 | will be estimated using '2-point' finite difference estimation. 170 | 171 | step: str or callable, optional 172 | Step-size strategy to use. Should be one of 173 | 174 | - "backtracking", will use the backtracking line-search from [PANJ2020]_ 175 | 176 | - "DR", will use the Demyanov-Rubinov step-size. This step-size minimizes a quadratic upper bound ob the objective using the gradient's lipschitz constant, passed in keyword argument `lipschitz`. [P2018]_ 177 | 178 | - "sublinear", will use a decreasing step-size of the form 2/(k+2). [J2013]_ 179 | 180 | - callable, if step is a callable function, it will use the step-size returned by step(locals). 181 | 182 | lipschitz: None or float, optional 183 | Estimate for the Lipschitz constant of the gradient. Required when step="DR". 184 | 185 | max_iter: integer, optional 186 | Maximum number of iterations. 187 | 188 | tol: float, optional 189 | Tolerance of the stopping criterion. The algorithm will stop whenever 190 | the Frank-Wolfe gap is below tol or the maximum number of iterations 191 | is exceeded. 192 | 193 | callback: callable, optional 194 | Callback to execute at each iteration. If the callable returns False 195 | then the algorithm with immediately return. 196 | 197 | eps: float or ndarray 198 | If jac is approximated, use this value for the step size. 199 | 200 | verbose: int, optional 201 | Verbosity level. 202 | 203 | 204 | Returns: 205 | scipy.optimize.OptimizeResult 206 | The optimization result represented as a 207 | ``scipy.optimize.OptimizeResult`` object. Important attributes are: 208 | ``x`` the solution array, ``success`` a Boolean flag indicating if 209 | the optimizer exited successfully and ``message`` which describes 210 | the cause of the termination. See `scipy.optimize.OptimizeResult` 211 | for a description of other attributes. 212 | 213 | 214 | References: 215 | 216 | .. [J2013] Jaggi, Martin. `"Revisiting Frank-Wolfe: Projection-Free Sparse Convex Optimization." `_ ICML 2013. 217 | 218 | .. [P2018] Pedregosa, Fabian `"Notes on the Frank-Wolfe Algorithm" `_, 2018 219 | 220 | .. [PANJ2020] Pedregosa, Fabian, Armin Askari, Geoffrey Negiar, and Martin Jaggi. `"Step-Size Adaptivity in Projection-Free Optimization." `_ arXiv:1806.05123 (2020). 221 | 222 | 223 | Examples: 224 | * :ref:`sphx_glr_auto_examples_frank_wolfe_plot_sparse_benchmark.py` 225 | * :ref:`sphx_glr_auto_examples_frank_wolfe_plot_vertex_overlap.py` 226 | """ 227 | x0 = np.asanyarray(x0, dtype=float) 228 | if tol < 0: 229 | raise ValueError("Tol must be non-negative") 230 | x = x0.copy() 231 | 232 | if variant == 'vanilla': 233 | active_set = None 234 | elif variant == 'pairwise': 235 | active_set = defaultdict(float) 236 | active_set[x0_rep] = 1. 
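# For the pairwise variant, active_set maps a hashable vertex representation to that vertex's weight in the convex combination that forms the iterate; update_active_set moves weight from the away vertex to the Frank-Wolfe vertex at each step, so the weights stay nonnegative and sum to one.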
237 | 238 | else: 239 | raise ValueError("Variant must be one of {'vanilla', 'pairwise'}.") 240 | 241 | lipschitz_t = None 242 | step_size = None 243 | if lipschitz is not None: 244 | lipschitz_t = lipschitz 245 | 246 | func_and_grad = utils.build_func_grad(jac, fun, args, eps) 247 | 248 | f_t, grad = func_and_grad(x) 249 | old_f_t = None 250 | 251 | for it in range(max_iter): 252 | update_direction, fw_vertex_rep, away_vertex_rep, max_step_size = lmo(-grad, x, active_set) 253 | norm_update_direction = linalg.norm(update_direction) ** 2 254 | certificate = np.dot(update_direction, -grad) 255 | 256 | # .. compute an initial estimate for the .. 257 | # .. Lipschitz estimate if not given ... 258 | if lipschitz_t is None: 259 | eps = 1e-3 260 | grad_eps = func_and_grad(x + eps * update_direction)[1] 261 | lipschitz_t = linalg.norm(grad - grad_eps) / ( 262 | eps * np.sqrt(norm_update_direction) 263 | ) 264 | print("Estimated L_t = %s" % lipschitz_t) 265 | 266 | if certificate <= tol: 267 | break 268 | if hasattr(step, "__call__"): 269 | step_size = step(locals()) 270 | f_next, grad_next = func_and_grad(x + step_size * update_direction) 271 | elif step == "backtracking": 272 | step_size, lipschitz_t, f_next, grad_next = backtracking_step_size( 273 | x, 274 | f_t, 275 | old_f_t, 276 | func_and_grad, 277 | certificate, 278 | lipschitz_t, 279 | max_step_size, 280 | update_direction, 281 | norm_update_direction, 282 | ) 283 | elif step == "DR": 284 | if lipschitz is None: 285 | raise ValueError('lipschitz needs to be specified with step="DR"') 286 | step_size = min( 287 | certificate / (norm_update_direction * lipschitz_t), max_step_size 288 | ) 289 | f_next, grad_next = func_and_grad(x + step_size * update_direction) 290 | elif step == "sublinear": 291 | # .. without knowledge of the Lipschitz constant .. 292 | # .. we take the sublinear 2/(k+2) step-size .. 293 | step_size = 2.0 / (it + 2) 294 | f_next, grad_next = func_and_grad(x + step_size * update_direction) 295 | else: 296 | raise ValueError("Invalid option step=%s" % step) 297 | if callback is not None: 298 | if callback(locals()) is False: # pylint: disable=g-bool-id-comparison 299 | break 300 | x += step_size * update_direction 301 | if variant == 'pairwise': 302 | update_active_set(active_set, fw_vertex_rep, away_vertex_rep, 303 | step_size) 304 | old_f_t = f_t 305 | f_t, grad = f_next, grad_next 306 | if callback is not None: 307 | callback(locals()) 308 | return optimize.OptimizeResult(x=x, nit=it, certificate=certificate, 309 | active_set=active_set) 310 | -------------------------------------------------------------------------------- /copt/loss.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy import sparse, special 3 | from scipy.sparse import linalg as splinalg 4 | from sklearn.utils.extmath import safe_sparse_dot 5 | 6 | from copt.utils import safe_sparse_add, njit, prange 7 | 8 | 9 | class LogLoss: 10 | r"""Logistic loss function. 11 | 12 | The logistic loss function is defined as 13 | 14 | .. math:: 15 | -\frac{1}{n}\sum_{i=1}^n b_i \log(\sigma(\bs{a}_i^T \bs{x})) 16 | + (1 - b_i) \log(1 - \sigma(\bs{a}_i^T \bs{x})) 17 | 18 | where :math:`\sigma` is the sigmoid function 19 | :math:`\sigma(t) = 1/(1 + e^{-t})`. 20 | 21 | The input vector b verifies :math:`0 \leq b_i \leq 1`. When it comes from 22 | class labels, it should have the values 0 or 1. 
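Example (minimal sketch; ``A`` and ``b`` stand for any feature matrix and 0/1 label vector with matching first dimension): ``f = LogLoss(A, b)`` followed by ``loss, grad = f.f_grad(np.zeros(A.shape[1]))`` evaluates the loss and its gradient at the origin.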
23 | 24 | References: 25 | http://fa.bianp.net/blog/2019/evaluate_logistic/ 26 | """ 27 | 28 | def __init__(self, A, b, alpha=0.0): 29 | if A is None: 30 | A = sparse.eye(b.size, b.size, format="csr") 31 | self.A = A 32 | if np.max(b) > 1 or np.min(b) < 0: 33 | raise ValueError("b can only contain values between 0 and 1 ") 34 | if not A.shape[0] == b.size: 35 | raise ValueError("Dimensions of A and b do not coincide") 36 | self.b = b 37 | self.alpha = alpha 38 | self.intercept = False 39 | 40 | def __call__(self, x): 41 | return self.f_grad(x, return_gradient=False) 42 | 43 | def _sigma(self, z, idx): 44 | z0 = np.zeros_like(z) 45 | tmp = np.exp(-z[idx]) 46 | z0[idx] = 1 / (1 + tmp) 47 | tmp = np.exp(z[~idx]) 48 | z0[~idx] = tmp / (1 + tmp) 49 | return z0 50 | 51 | def logsig(self, x): 52 | """Compute log(1 / (1 + exp(-t))) component-wise.""" 53 | out = np.zeros_like(x) 54 | idx0 = x < -33 55 | out[idx0] = x[idx0] 56 | idx1 = (x >= -33) & (x < -18) 57 | out[idx1] = x[idx1] - np.exp(x[idx1]) 58 | idx2 = (x >= -18) & (x < 37) 59 | out[idx2] = -np.log1p(np.exp(-x[idx2])) 60 | idx3 = x >= 37 61 | out[idx3] = -np.exp(-x[idx3]) 62 | return out 63 | 64 | def expit_b(self, x, b): 65 | """Compute sigmoid(x) - b.""" 66 | idx = x < 0 67 | out = np.zeros_like(x) 68 | exp_x = np.exp(x[idx]) 69 | b_idx = b[idx] 70 | out[idx] = ((1 - b_idx) * exp_x - b_idx) / (1 + exp_x) 71 | exp_nx = np.exp(-x[~idx]) 72 | b_nidx = b[~idx] 73 | out[~idx] = ((1 - b_nidx) - b_nidx * exp_nx) / (1 + exp_nx) 74 | return out 75 | 76 | def f_grad(self, x, return_gradient=True): 77 | if self.intercept: 78 | x_, c = x[:-1], x[-1] 79 | else: 80 | x_, c = x, 0.0 81 | z = safe_sparse_dot(self.A, x_, dense_output=True).ravel() + c 82 | loss = np.mean((1 - self.b) * z - self.logsig(z)) 83 | penalty = safe_sparse_dot(x_.T, x_, dense_output=True).ravel()[0] 84 | loss += 0.5 * self.alpha * penalty 85 | 86 | if not return_gradient: 87 | return loss 88 | 89 | z0_b = self.expit_b(z, self.b) 90 | 91 | grad = safe_sparse_add(self.A.T.dot(z0_b) / self.A.shape[0], self.alpha * x_) 92 | grad = np.asarray(grad).ravel() 93 | grad_c = z0_b.mean() 94 | if self.intercept: 95 | return np.concatenate((grad, [grad_c])) 96 | 97 | return loss, grad 98 | 99 | def hessian_mv(self, x): 100 | """Return a callable that returns matrix-vector products with the Hessian.""" 101 | 102 | n_samples, n_features = self.A.shape 103 | if self.intercept: 104 | x_, c = x[:-1], x[-1] 105 | else: 106 | x_, c = x, 0.0 107 | 108 | z = special.expit(safe_sparse_dot(self.A, x_, dense_output=True).ravel() + c) 109 | 110 | # The mat-vec product of the Hessian 111 | d = z * (1 - z) 112 | if sparse.issparse(self.A): 113 | dX = safe_sparse_dot( 114 | sparse.dia_matrix((d, 0), shape=(n_samples, n_samples)), self.A 115 | ) 116 | else: 117 | # Precompute as much as possible 118 | dX = d[:, np.newaxis] * self.A 119 | 120 | if self.intercept: 121 | # Calculate the double derivative with respect to intercept 122 | # In the case of sparse matrices this returns a matrix object. 123 | dd_intercept = np.squeeze(np.array(dX.sum(axis=0))) 124 | 125 | def _Hs(s): 126 | ret = np.empty_like(s) 127 | ret[:n_features] = self.A.T.dot(dX.dot(s[:n_features])) 128 | ret[:n_features] += self.alpha * s[:n_features] 129 | 130 | # For the fit intercept case. 
131 | if self.intercept: 132 | ret[:n_features] += s[-1] * dd_intercept 133 | ret[-1] = dd_intercept.dot(s[:n_features]) 134 | ret[-1] += d.sum() * s[-1] 135 | return ret / n_samples 136 | 137 | return _Hs 138 | 139 | def hessian_trace(self, x): 140 | """Return a callable that returns matrix-vector products with the Hessian.""" 141 | 142 | n_samples, n_features = self.A.shape 143 | if self.intercept: 144 | x_, c = x[:-1], x[-1] 145 | else: 146 | x_, c = x, 0.0 147 | 148 | z = special.expit(safe_sparse_dot(self.A, x_, dense_output=True).ravel() + c) 149 | 150 | # The mat-vec product of the Hessian 151 | d = z * (1 - z) 152 | if sparse.issparse(self.A): 153 | dX = safe_sparse_dot( 154 | sparse.dia_matrix((d, 0), shape=(n_samples, n_samples)), self.A 155 | ) 156 | else: 157 | # Precompute as much as possible 158 | dX = d[:, np.newaxis] * self.A 159 | 160 | if self.intercept: 161 | # Calculate the double derivative with respect to intercept 162 | # In the case of sparse matrices this returns a matrix object. 163 | dd_intercept = np.squeeze(np.array(dX.sum(axis=0))) 164 | 165 | def _Hs(s): 166 | ret = np.empty_like(s) 167 | ret[:n_features] = self.A.T.dot(dX.dot(s[:n_features])) 168 | ret[:n_features] += self.alpha * s[:n_features] 169 | 170 | # For the fit intercept case. 171 | if self.intercept: 172 | ret[:n_features] += s[-1] * dd_intercept 173 | ret[-1] = dd_intercept.dot(s[:n_features]) 174 | ret[-1] += d.sum() * s[-1] 175 | return ret / n_samples 176 | 177 | return _Hs 178 | 179 | @property 180 | def partial_deriv(self): 181 | """Note: this will ignore the regularization parameter alpha""" 182 | @njit(parallel=True) 183 | def log_deriv(p, y): 184 | # derivative of logistic loss 185 | # same as in lightning (with minus sign) 186 | out = np.zeros_like(p) 187 | for i in prange(p.size): 188 | if p[i] < 0: 189 | exp_p = np.exp(p[i]) 190 | out[i] = ((1 - y[i]) * exp_p - y[i]) / (1 + exp_p) 191 | else: 192 | exp_nx = np.exp(-p[i]) 193 | out[i] = ((1 - y[i]) - y[i] * exp_nx) / (1 + exp_nx) 194 | return out 195 | 196 | return log_deriv 197 | 198 | @property 199 | def lipschitz(self): 200 | s = splinalg.svds(self.A, k=1, return_singular_vectors=False)[0] 201 | return 0.25 * (s * s) / self.A.shape[0] + self.alpha 202 | 203 | @property 204 | def max_lipschitz(self): 205 | from sklearn.utils.extmath import row_norms 206 | 207 | max_squared_sum = row_norms(self.A, squared=True).max() 208 | 209 | return 0.25 * max_squared_sum + self.alpha 210 | 211 | 212 | class SquareLoss: 213 | r"""Squared loss. 214 | 215 | The Squared loss is defined as 216 | 217 | .. math:: 218 | \frac{1}{2n}\|A x - b\|^2 + \frac{1}{2} \alpha \|x\|^2 219 | 220 | where :math:`\|\cdot\|` is the euclidean norm. 
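Example (illustrative sketch; shapes are only for concreteness): for ``A`` of shape ``(n_samples, n_features)`` and ``b`` of shape ``(n_samples,)``, ``SquareLoss(A, b, alpha=0.1)(x)`` evaluates ``0.5 * np.mean((A @ x - b) ** 2) + 0.5 * 0.1 * x.dot(x)``.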
221 | """ 222 | 223 | def __init__(self, A, b, alpha=0): 224 | if A is None: 225 | A = sparse.eye(b.size, b.size, format="csr") 226 | self.b = b 227 | self.alpha = alpha 228 | self.A = A 229 | self.name = "square" 230 | 231 | def __call__(self, x): 232 | z = safe_sparse_dot(self.A, x, dense_output=True).ravel() - self.b 233 | pen = self.alpha * safe_sparse_dot(x.T, x, dense_output=True).ravel()[0] 234 | return 0.5 * (z * z).mean() + 0.5 * pen 235 | 236 | def f_grad(self, x, return_gradient=True): 237 | z = safe_sparse_dot(self.A, x, dense_output=True).ravel() - self.b 238 | pen = self.alpha * safe_sparse_dot(x.T, x, dense_output=True).ravel()[0] 239 | loss = 0.5 * (z * z).mean() + 0.5 * pen 240 | if not return_gradient: 241 | return loss 242 | grad = safe_sparse_add(self.A.T.dot(z) / self.A.shape[0], self.alpha * x.T) 243 | return loss, np.asarray(grad).ravel() 244 | 245 | @property 246 | def partial_deriv(self): 247 | @njit 248 | def square_deriv(p, y): 249 | return p - y 250 | return square_deriv 251 | 252 | @property 253 | def lipschitz(self): 254 | s = splinalg.svds(self.A, k=1, return_singular_vectors=False)[0] 255 | return (s * s) / self.A.shape[0] + self.alpha 256 | 257 | @property 258 | def max_lipschitz(self): 259 | from sklearn.utils.extmath import row_norms 260 | 261 | max_squared_sum = row_norms(self.A, squared=True).max() 262 | 263 | return max_squared_sum + self.alpha 264 | 265 | 266 | class HuberLoss: 267 | """Huber loss""" 268 | 269 | def __init__(self, A, b, alpha=0, delta=1): 270 | self.delta = delta 271 | self.A = A 272 | self.b = b 273 | self.alpha = alpha 274 | self.name = "huber" 275 | 276 | def __call__(self, x): 277 | return self.f_grad(x, return_gradient=False) 278 | 279 | def f_grad(self, x, return_gradient=True): 280 | z = safe_sparse_dot(self.A, x, dense_output=True).ravel() - self.b 281 | idx = np.abs(z) < self.delta 282 | loss = 0.5 * np.sum(z[idx] * z[idx]) 283 | loss += np.sum(self.delta * (np.abs(z[~idx]) - 0.5 * self.delta)) 284 | loss = ( 285 | loss / z.size 286 | + 0.5 * self.alpha * safe_sparse_dot(x.T, x, dense_output=True).ravel()[0] 287 | ) 288 | if not return_gradient: 289 | return loss 290 | grad = self.A[idx].T.dot(z[idx]) / self.A.shape[0] + self.alpha * x.T 291 | grad = np.asarray(grad) 292 | grad += self.A[~idx].T.dot(self.delta * np.sign(z[~idx])) / self.A.shape[0] 293 | return loss, np.asarray(grad).ravel() 294 | 295 | @property 296 | def lipschitz(self): 297 | s = splinalg.svds(self.A, k=1, return_singular_vectors=False)[0] 298 | return (s * s) / self.A.shape[0] + self.alpha 299 | -------------------------------------------------------------------------------- /copt/penalty.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy import sparse, linalg 3 | 4 | from copt.utils import njit 5 | 6 | 7 | class L1Norm: 8 | """L1 norm, that is, the sum of absolute values: 9 | 10 | .. math:: 11 | \\alpha\\sum_i^d |x_i| 12 | 13 | Args: 14 | alpha: float 15 | constant multiplying the L1 norm 16 | 17 | """ 18 | 19 | def __init__(self, alpha): 20 | self.alpha = alpha 21 | 22 | def __call__(self, x): 23 | return self.alpha * np.abs(x).sum() 24 | 25 | def prox(self, x, step_size): 26 | """Proximal operator of the L1 norm. 27 | 28 | This routine can be used in gradient-based methods like 29 | minimize_proximal_gradient, minimize_three_split and 30 | minimize_primal_dual. 
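Example (minimal sketch; ``f`` is assumed to be a smooth loss exposing ``f_grad``, such as ``copt.loss.SquareLoss``): ``copt.minimize_proximal_gradient(f.f_grad, x0, prox=L1Norm(0.1).prox, jac=True)`` minimizes ``f(x) + 0.1 * ||x||_1``.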
31 | """ 32 | return np.fmax(x - self.alpha * step_size, 0) - np.fmax( 33 | -x - self.alpha * step_size, 0 34 | ) 35 | 36 | def prox_factory(self, n_features): 37 | """Proximal operator of the L1 norm. 38 | 39 | This method is meant to be used with stochastic algorithms that need 40 | access to a proximal operator over a potentially sparse vector, 41 | like minimize_saga, minimize_svrg and minimize_vrtos 42 | """ 43 | alpha = self.alpha 44 | 45 | @njit 46 | def _prox_L1(x, i, indices, indptr, d, step_size): 47 | for j in range(indptr[i], indptr[i + 1]): 48 | j_idx = indices[j] # for L1 this is the same 49 | a = x[j_idx] - alpha * d[j_idx] * step_size 50 | b = -x[j_idx] - alpha * d[j_idx] * step_size 51 | x[j_idx] = np.fmax(a, 0) - np.fmax(b, 0) 52 | 53 | return _prox_L1, sparse.eye(n_features, format="csr") 54 | 55 | 56 | class GroupL1: 57 | """ 58 | Group Lasso penalty 59 | 60 | Args: 61 | alpha: float 62 | Constant multiplying this loss 63 | 64 | blocks: list of lists 65 | 66 | """ 67 | 68 | def __init__(self, alpha, groups): 69 | self.alpha = alpha 70 | # groups need to be increasing 71 | for i, g in enumerate(groups): 72 | if not np.all(np.diff(g) == 1): 73 | raise ValueError("Groups must be contiguous") 74 | if i > 0 and groups[i - 1][-1] >= g[0]: 75 | raise ValueError("Groups must be increasing") 76 | self.groups = groups 77 | 78 | def __call__(self, x): 79 | return self.alpha * np.sum([np.linalg.norm(x[g]) for g in self.groups]) 80 | 81 | def prox(self, x, step_size): 82 | out = x.copy() 83 | for g in self.groups: 84 | 85 | norm = np.linalg.norm(x[g]) 86 | if norm > self.alpha * step_size: 87 | out[g] -= step_size * self.alpha * out[g] / norm 88 | else: 89 | out[g] = 0 90 | return out 91 | 92 | def prox_factory(self, n_features): 93 | B_data = np.zeros(n_features) 94 | B_indices = np.arange(n_features, dtype=np.int32) 95 | B_indptr = np.zeros(n_features + 1, dtype=np.int32) 96 | 97 | feature_pointer = 0 98 | block_pointer = 0 99 | for g in self.groups: 100 | while feature_pointer < g[0]: 101 | # non-penalized feature 102 | B_data[feature_pointer] = -1.0 103 | B_indptr[block_pointer + 1] = B_indptr[block_pointer] + 1 104 | feature_pointer += 1 105 | block_pointer += 1 106 | B_indptr[block_pointer + 1] = B_indptr[block_pointer] 107 | for _ in g: 108 | B_data[feature_pointer] = 1.0 109 | B_indptr[block_pointer + 1] += 1 110 | feature_pointer += 1 111 | block_pointer += 1 112 | for _ in range(feature_pointer, n_features): 113 | B_data[feature_pointer] = -1.0 114 | B_indptr[block_pointer + 1] = B_indptr[block_pointer] + 1 115 | feature_pointer += 1 116 | block_pointer += 1 117 | 118 | B_indptr = B_indptr[: block_pointer + 1] 119 | B = sparse.csr_matrix((B_data, B_indices, B_indptr)) 120 | alpha = self.alpha 121 | 122 | @njit 123 | def _prox_gl(x, i, indices, indptr, d, step_size): 124 | for b in range(indptr[i], indptr[i + 1]): 125 | h = indices[b] 126 | if B_data[B_indices[B_indptr[h]]] <= 0: 127 | continue 128 | ss = step_size * d[h] 129 | norm = 0.0 130 | for j in range(B_indptr[h], B_indptr[h + 1]): 131 | j_idx = B_indices[j] 132 | norm += x[j_idx] ** 2 133 | norm = np.sqrt(norm) 134 | if norm > alpha * ss: 135 | for j in range(B_indptr[h], B_indptr[h + 1]): 136 | j_idx = B_indices[j] 137 | x[j_idx] *= 1 - alpha * ss / norm 138 | else: 139 | for j in range(B_indptr[h], B_indptr[h + 1]): 140 | j_idx = B_indices[j] 141 | x[j_idx] = 0.0 142 | 143 | return _prox_gl, B 144 | 145 | 146 | class FusedLasso: 147 | """ 148 | Fused Lasso penalty 149 | 150 | Args: 151 | alpha: float 152 | 
Constant multiplying this function. 153 | """ 154 | 155 | def __init__(self, alpha): 156 | self.alpha = alpha 157 | 158 | def __call__(self, x): 159 | return self.alpha * np.sum(np.abs(np.diff(x))) 160 | 161 | def prox(self, x, step_size): 162 | # imported here to avoid circular imports 163 | from copt import tv_prox 164 | 165 | return tv_prox.prox_tv1d(x, step_size * self.alpha) 166 | 167 | def prox_1_factory(self, n_features): 168 | B_1_data = np.ones(n_features) 169 | B_1_indices = np.arange(n_features, dtype=np.int32) 170 | B_1_indptr = np.arange(0, n_features + 1, 2, dtype=np.int32) 171 | if n_features % 2 == 1: 172 | B_1_indptr = np.concatenate((B_1_indptr, [B_1_indptr[-1] + 1])) 173 | B_1_data[-1] = -1 174 | n_blocks = (n_features + 1) // 2 175 | B_1 = sparse.csr_matrix( 176 | (B_1_data, B_1_indices, B_1_indptr), shape=(n_blocks, n_features) 177 | ) 178 | alpha = self.alpha 179 | 180 | @njit 181 | def _prox_1_fl(x, i, indices, indptr, d, step_size): 182 | for b in range(indptr[i], indptr[i + 1]): 183 | h = indices[b] 184 | j_idx = B_1_indices[B_1_indptr[h]] 185 | if B_1_data[j_idx] <= 0: 186 | continue 187 | ss = step_size * d[h] * alpha 188 | if x[j_idx] - ss >= x[j_idx + 1] + ss: 189 | x[j_idx] -= ss 190 | x[j_idx + 1] += ss 191 | elif x[j_idx] + ss <= x[j_idx + 1] - ss: 192 | x[j_idx] += ss 193 | x[j_idx + 1] -= ss 194 | else: 195 | avg = (x[j_idx] + x[j_idx + 1]) / 2.0 196 | x[j_idx] = avg 197 | x[j_idx + 1] = avg 198 | 199 | return _prox_1_fl, B_1 200 | 201 | def prox_2_factory(self, n_features): 202 | B_2_data = np.ones(n_features) 203 | B_2_indices = np.arange(n_features, dtype=np.int32) 204 | _indptr = np.arange(1, n_features + 2, 2, dtype=np.int32) 205 | B_2_indptr = np.concatenate(([0], _indptr)) 206 | B_2_data[0] = -1 207 | if n_features % 2 == 0: 208 | B_2_indptr[-1] -= 1 209 | B_2_data[-1] = -1 210 | n_blocks = n_features // 2 + 1 211 | B_2 = sparse.csr_matrix( 212 | (B_2_data, B_2_indices, B_2_indptr), shape=(n_blocks, n_features) 213 | ) 214 | alpha = self.alpha 215 | 216 | @njit 217 | def _prox_2_fl(x, i, indices, indptr, d, step_size): 218 | for b in range(indptr[i], indptr[i + 1]): 219 | h = indices[b] 220 | j_idx = B_2_indices[B_2_indptr[h]] 221 | if B_2_data[j_idx] <= 0: 222 | continue 223 | ss = step_size * d[h] * alpha 224 | if x[j_idx] - ss >= x[j_idx + 1] + ss: 225 | x[j_idx] -= ss 226 | x[j_idx + 1] += ss 227 | elif x[j_idx] + ss <= x[j_idx + 1] - ss: 228 | x[j_idx] += ss 229 | x[j_idx + 1] -= ss 230 | else: 231 | avg = (x[j_idx] + x[j_idx + 1]) / 2.0 232 | x[j_idx] = avg 233 | x[j_idx + 1] = avg 234 | 235 | return _prox_2_fl, B_2 236 | 237 | 238 | class TraceNorm: 239 | """Trace (aka nuclear) norm, sum of singular values. 240 | 241 | Args: 242 | alpha: float 243 | Constant multiplying this function. 244 | shape: float 245 | Shape of original matrix, since input is given as 246 | a raveled vector. 
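Example (illustrative sketch): for an ``(m, n)`` matrix stored as the raveled vector ``x``, ``TraceNorm(alpha, (m, n)).prox(x, step_size)`` soft-thresholds the singular values at level ``alpha * step_size`` and returns the raveled result.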
247 | """ 248 | 249 | is_separable = False 250 | 251 | def __init__(self, alpha, shape): 252 | assert len(shape) == 2 253 | self.shape = shape 254 | self.alpha = alpha 255 | 256 | def __call__(self, x): 257 | X = x.reshape(self.shape) 258 | return self.alpha * linalg.svdvals(X).sum() 259 | 260 | def prox(self, x, step_size): 261 | X = x.reshape(self.shape) 262 | U, s, Vt = linalg.svd(X, full_matrices=False) 263 | s_threshold = np.fmax(s - self.alpha * step_size, 0) - np.fmax( 264 | -s - self.alpha * step_size, 0 265 | ) 266 | return (U * s_threshold).dot(Vt).ravel() 267 | 268 | def prox_factory(self): 269 | raise NotImplementedError 270 | 271 | 272 | class TotalVariation2D: 273 | """2-dimensional Total Variation pseudo-norm. 274 | 275 | Args: 276 | alpha: float 277 | Constant multiplying this function. 278 | shape: float 279 | Shape of original matrix, since input is given as 280 | a raveled vector. 281 | """ 282 | 283 | def __init__(self, alpha, shape, max_iter=100, tol=1e-6): 284 | self.alpha = alpha 285 | self.n_rows = shape[0] 286 | self.n_cols = shape[1] 287 | self.max_iter = max_iter 288 | self.tol = tol 289 | 290 | def __call__(self, x): 291 | img = x.reshape((self.n_rows, self.n_cols)) 292 | tmp1 = np.abs(np.diff(img, axis=0)) 293 | tmp2 = np.abs(np.diff(img, axis=1)) 294 | return self.alpha * (tmp1.sum() + tmp2.sum()) 295 | 296 | def prox(self, x, step_size): 297 | # here to avoid circular imports 298 | from copt import tv_prox 299 | 300 | return tv_prox.prox_tv2d( 301 | x, 302 | step_size * self.alpha, 303 | self.n_rows, 304 | self.n_cols, 305 | max_iter=self.max_iter, 306 | tol=self.tol, 307 | ) -------------------------------------------------------------------------------- /copt/proximal_gradient.py: -------------------------------------------------------------------------------- 1 | # python3 2 | """Proximal-gradient algorithms.""" 3 | import warnings 4 | import numpy as np 5 | from scipy import optimize 6 | from copt import utils 7 | 8 | 9 | def minimize_proximal_gradient( 10 | fun, 11 | x0, 12 | prox=None, 13 | jac="2-point", 14 | tol=1e-6, 15 | max_iter=500, 16 | args=(), 17 | verbose=0, 18 | callback=None, 19 | step="backtracking", 20 | accelerated=False, 21 | eps=1e-8, 22 | max_iter_backtracking=1000, 23 | backtracking_factor=0.6, 24 | trace_certificate=False, 25 | ): 26 | """Proximal gradient descent. 27 | 28 | Solves problems of the form 29 | 30 | minimize_x f(x) + g(x) 31 | 32 | where f is a differentiable function and we have access to the proximal 33 | operator of g. 34 | 35 | Args: 36 | fun : callable 37 | The objective function to be minimized. 38 | ``fun(x, *args) -> float`` 39 | where x is an 1-D array with shape (n,) and `args` 40 | is a tuple of the fixed parameters needed to completely 41 | specify the function. 42 | 43 | x0 : ndarray, shape (n,) 44 | Initial guess. Array of real elements of size (n,), 45 | where 'n' is the number of independent variables. 46 | 47 | jac : {callable, '2-point', bool}, optional 48 | Method for computing the gradient vector. If it is a callable, 49 | it should be a function that returns the gradient vector: 50 | ``jac(x, *args) -> array_like, shape (n,)`` 51 | where x is an array with shape (n,) and `args` is a tuple with 52 | the fixed parameters. Alternatively, the '2-point' select a finite 53 | difference scheme for numerical estimation of the gradient. 54 | If `jac` is a Boolean and is True, `fun` is assumed to return the 55 | gradient along with the objective function. 
If False, the gradient 56 | will be estimated using '2-point' finite difference estimation. 57 | 58 | prox : callable, optional. 59 | Proximal operator g. 60 | 61 | args : tuple, optional 62 | Extra arguments passed to the objective function and its 63 | derivatives. 64 | 65 | tol: float, optional 66 | Tolerance of the optimization procedure. The iteration stops when the gradient mapping 67 | (a generalization of the gradient to non-smooth functions) is below this tolerance. 68 | 69 | max_iter : int, optional. 70 | Maximum number of iterations. 71 | 72 | verbose : int, optional. 73 | Verbosity level, from 0 (no output) to 2 (output on each iteration) 74 | 75 | callback : callable. 76 | callback function (optional). Takes a single argument (x) with the 77 | current coefficients in the algorithm. The algorithm will exit if 78 | callback returns False. 79 | 80 | step : "backtracking" or callable. 81 | Step-size strategy to use. "backtracking" will use a backtracking line-search, 82 | while callable will use the value returned by step(locals()). 83 | 84 | accelerated: boolean 85 | Whether to use the accelerated variant of the algorithm. 86 | 87 | eps: float or ndarray 88 | If jac is approximated, use this value for the step size. 89 | 90 | max_iter_backtracking: int 91 | 92 | backtracking_factor: float 93 | 94 | trace_certificate: bool 95 | 96 | Returns: 97 | res : The optimization result represented as a 98 | ``scipy.optimize.OptimizeResult`` object. Important attributes are: 99 | ``x`` the solution array, ``success`` a Boolean flag indicating if 100 | the optimizer exited successfully and ``message`` which describes 101 | the cause of the termination. See `scipy.optimize.OptimizeResult` 102 | for a description of other attributes. 103 | 104 | References: 105 | Beck, Amir, and Marc Teboulle. "Gradient-based algorithms with applications 106 | to signal recovery." Convex optimization in signal processing and 107 | communications (2009) 108 | 109 | Examples: 110 | * :ref:`sphx_glr_auto_examples_plot_group_lasso.py` 111 | """ 112 | x = np.asarray(x0).flatten() 113 | if max_iter_backtracking <= 0: 114 | raise ValueError("Line search iterations need to be greater than 0") 115 | 116 | if prox is None: 117 | 118 | def _prox(x, _): 119 | return x 120 | 121 | prox = _prox 122 | 123 | success = False 124 | certificate = np.nan 125 | 126 | func_and_grad = utils.build_func_grad(jac, fun, args, eps) 127 | 128 | # find initial step-size 129 | if step == "backtracking": 130 | step_size = 1.8 / utils.init_lipschitz(func_and_grad, x0) 131 | else: 132 | # to avoid step_size being undefined upon return 133 | step_size = None 134 | 135 | n_iterations = 0 136 | certificate_list = [] 137 | # .. a while loop instead of a for loop .. 138 | # .. allows for infinite or floating point max_iter .. 139 | if not accelerated: 140 | fk, grad_fk = func_and_grad(x) 141 | while True: 142 | if callback is not None: 143 | if callback(locals()) is False: # pylint: disable=g-bool-id-comparison 144 | break 145 | # .. 
compute gradient and step size 146 | if hasattr(step, "__call__"): 147 | step_size = step(locals()) 148 | x_next = prox(x - step_size * grad_fk, step_size) 149 | update_direction = x_next - x 150 | f_next, grad_next = func_and_grad(x_next) 151 | elif step == "backtracking": 152 | x_next = prox(x - step_size * grad_fk, step_size) 153 | update_direction = x_next - x 154 | step_size *= 1.1 155 | for _ in range(max_iter_backtracking): 156 | f_next, grad_next = func_and_grad(x_next) 157 | rhs = ( 158 | fk 159 | + grad_fk.dot(update_direction) 160 | + update_direction.dot(update_direction) / (2.0 * step_size) 161 | ) 162 | if f_next <= rhs: 163 | # .. step size found .. 164 | break 165 | else: 166 | # .. backtracking, reduce step size .. 167 | step_size *= backtracking_factor 168 | x_next = prox(x - step_size * grad_fk, step_size) 169 | update_direction = x_next - x 170 | else: 171 | warnings.warn("Maxium number of line-search iterations reached") 172 | elif step == "fixed": 173 | x_next = prox(x - step_size * grad_fk, step_size) 174 | update_direction = x_next - x 175 | f_next, grad_next = func_and_grad(x_next) 176 | else: 177 | raise ValueError("Step-size strategy not understood") 178 | certificate = np.linalg.norm((x - x_next) / step_size) 179 | if trace_certificate: 180 | certificate_list.append(certificate) 181 | x[:] = x_next 182 | fk = f_next 183 | grad_fk = grad_next 184 | 185 | if certificate < tol: 186 | success = True 187 | break 188 | 189 | if n_iterations >= max_iter: 190 | break 191 | else: 192 | n_iterations += 1 193 | else: 194 | warnings.warn( 195 | "minimize_proximal_gradient did not reach the desired tolerance level", 196 | RuntimeWarning, 197 | ) 198 | else: 199 | tk = 1 200 | # .. a while loop instead of a for loop .. 201 | # .. allows for infinite or floating point max_iter .. 202 | yk = x.copy() 203 | while True: 204 | grad_fk = func_and_grad(yk)[1] 205 | if callback is not None: 206 | if callback(locals()) is False: # pylint: disable=g-bool-id-comparison 207 | break 208 | 209 | # .. compute gradient and step size 210 | if hasattr(step, "__call__"): 211 | current_step_size = step(locals()) 212 | x_next = prox(yk - current_step_size * grad_fk, current_step_size) 213 | t_next = (1 + np.sqrt(1 + 4 * tk * tk)) / 2 214 | yk = x_next + ((tk - 1.0) / t_next) * (x_next - x) 215 | 216 | t_next = (1 + np.sqrt(1 + 4 * tk * tk)) / 2 217 | yk = x_next + ((tk - 1.0) / t_next) * (x_next - x) 218 | 219 | x_prox = prox( 220 | x_next - current_step_size * func_and_grad(x_next)[1], 221 | current_step_size, 222 | ) 223 | certificate = np.linalg.norm((x - x_prox) / current_step_size) 224 | tk = t_next 225 | x = x_next.copy() 226 | 227 | elif step == "backtracking": 228 | current_step_size = step_size 229 | x_next = prox(yk - current_step_size * grad_fk, current_step_size) 230 | for _ in range(max_iter_backtracking): 231 | update_direction = x_next - yk 232 | if func_and_grad(x_next)[0] <= func_and_grad(yk)[0] + grad_fk.dot( 233 | update_direction 234 | ) + update_direction.dot(update_direction) / ( 235 | 2.0 * current_step_size 236 | ): 237 | # .. step size found .. 238 | break 239 | else: 240 | # .. backtracking, reduce step size .. 
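                        # .. the condition checked above is the quadratic upper bound
                        # .. f(x_next) <= f(y_k) + <grad f(y_k), x_next - y_k>
                        # ..               + ||x_next - y_k||^2 / (2 * current_step_size);
                        # .. when it fails, shrink current_step_size by backtracking_factor
                        # .. and recompute the proximal step from y_k ..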
241 | current_step_size *= backtracking_factor 242 | x_next = prox( 243 | yk - current_step_size * grad_fk, current_step_size 244 | ) 245 | else: 246 | warnings.warn("Maxium number of line-search iterations reached") 247 | t_next = (1 + np.sqrt(1 + 4 * tk * tk)) / 2 248 | yk = x_next + ((tk - 1.0) / t_next) * (x_next - x) 249 | 250 | x_prox = prox( 251 | x_next - current_step_size * func_and_grad(x_next)[1], 252 | current_step_size, 253 | ) 254 | certificate = np.linalg.norm((x - x_prox) / current_step_size) 255 | if trace_certificate: 256 | certificate_list.append(certificate) 257 | tk = t_next 258 | x = x_next.copy() 259 | 260 | if certificate < tol: 261 | success = True 262 | break 263 | 264 | if n_iterations >= max_iter: 265 | break 266 | else: 267 | n_iterations += 1 268 | 269 | if n_iterations >= max_iter: 270 | warnings.warn( 271 | "minimize_proximal_gradient did not reach the desired tolerance level", 272 | RuntimeWarning, 273 | ) 274 | 275 | return optimize.OptimizeResult( 276 | x=x, 277 | success=success, 278 | certificate=certificate, 279 | nit=n_iterations, 280 | step_size=step_size, 281 | trace_certificate=certificate_list, 282 | ) 283 | 284 | -------------------------------------------------------------------------------- /copt/splitting.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import numpy as np 3 | from scipy import optimize, linalg, sparse 4 | 5 | from . import utils 6 | 7 | 8 | def minimize_three_split( 9 | f_grad, 10 | x0, 11 | prox_1=None, 12 | prox_2=None, 13 | tol=1e-6, 14 | max_iter=1000, 15 | verbose=0, 16 | callback=None, 17 | line_search=True, 18 | step_size=None, 19 | max_iter_backtracking=100, 20 | backtracking_factor=0.7, 21 | h_Lipschitz=None, 22 | args_prox=(), 23 | ): 24 | """Davis-Yin three operator splitting method. 25 | 26 | This algorithm can solve problems of the form 27 | 28 | minimize_x f(x) + g(x) + h(x) 29 | 30 | where f is a smooth function and g and h are (possibly non-smooth) 31 | functions for which the proximal operator is known. 32 | 33 | Args: 34 | f_grad: callable 35 | Returns the function value and gradient of the objective function. 36 | With return_gradient=False, returns only the function value. 37 | 38 | x0 : array-like 39 | Initial guess 40 | 41 | prox_1 : callable or None, optional 42 | prox_1(x, alpha, *args) returns the proximal operator of g at xa 43 | with parameter alpha. 44 | 45 | prox_2 : callable or None, optional 46 | prox_2(x, alpha, *args) returns the proximal operator of g at xa 47 | with parameter alpha. 48 | 49 | tol: float, optional 50 | Tolerance of the stopping criterion. 51 | 52 | max_iter : int, optional 53 | Maximum number of iterations. 54 | 55 | verbose : int, optional 56 | Verbosity level, from 0 (no output) to 2 (output on each iteration) 57 | 58 | callback : callable, optional 59 | Callback function. Takes a single argument (x) with the 60 | current coefficients in the algorithm. The algorithm will exit if 61 | callback returns False. 62 | 63 | line_search : boolean, optional 64 | Whether to perform line-search to estimate the step size. 65 | 66 | step_size : float, optional 67 | Starting value for the line-search procedure. 68 | 69 | max_iter_backtracking : int, optional 70 | Maximun number of backtracking iterations. Used in line search. 71 | 72 | backtracking_factor : float, optional 73 | The amount to backtrack by during line search. 74 | 75 | args_prox : tuple, optional 76 | Optional Extra arguments passed to the prox functions. 
77 | 78 | h_Lipschitz : float, optional 79 | If given, h is assumed to be Lipschitz continuous with constant h_Lipschitz. 80 | 81 | 82 | Returns: 83 | res : OptimizeResult 84 | The optimization result represented as a 85 | ``scipy.optimize.OptimizeResult`` object. Important attributes are: 86 | ``x`` the solution array, ``success`` a Boolean flag indicating if 87 | the optimizer exited successfully and ``message`` which describes 88 | the cause of the termination. See `scipy.optimize.OptimizeResult` 89 | for a description of other attributes. 90 | 91 | 92 | References: 93 | [1] Davis, Damek, and Wotao Yin. `"A three-operator splitting scheme and 94 | its optimization applications." 95 | `_ Set-Valued and Variational 96 | Analysis, 2017. 97 | 98 | [2] Pedregosa, Fabian, and Gauthier Gidel. `"Adaptive Three Operator 99 | Splitting." `_ Proceedings of the 35th 100 | International Conference on Machine Learning, 2018. 101 | """ 102 | success = False 103 | if not max_iter_backtracking > 0: 104 | raise ValueError("Line search iterations need to be greater than 0") 105 | 106 | if prox_1 is None: 107 | 108 | def prox_1(x, s, *args): 109 | return x 110 | 111 | if prox_2 is None: 112 | 113 | def prox_2(x, s, *args): 114 | return x 115 | 116 | if step_size is None: 117 | line_search = True 118 | step_size = 1.0 / utils.init_lipschitz(f_grad, x0) 119 | 120 | z = prox_2(x0, step_size, *args_prox) 121 | LS_EPS = np.finfo(float).eps 122 | 123 | fk, grad_fk = f_grad(z) 124 | x = prox_1(z - step_size * grad_fk, step_size, *args_prox) 125 | u = np.zeros_like(x) 126 | 127 | for it in range(max_iter): 128 | 129 | fk, grad_fk = f_grad(z) 130 | x = prox_1(z - step_size * (u + grad_fk), step_size, *args_prox) 131 | incr = x - z 132 | norm_incr = np.linalg.norm(incr) 133 | ls = norm_incr > 1e-7 and line_search 134 | if ls: 135 | for it_ls in range(max_iter_backtracking): 136 | x = prox_1(z - step_size * (u + grad_fk), step_size, *args_prox) 137 | incr = x - z 138 | norm_incr = np.linalg.norm(incr) 139 | rhs = fk + grad_fk.dot(incr) + (norm_incr ** 2) / (2 * step_size) 140 | ls_tol = f_grad(x, return_gradient=False) - rhs 141 | if ls_tol <= LS_EPS: 142 | # step size found 143 | # if ls_tol > 0: 144 | # ls_tol = 0. 145 | break 146 | else: 147 | step_size *= backtracking_factor 148 | 149 | z = prox_2(x + step_size * u, step_size, *args_prox) 150 | u += (x - z) / step_size 151 | certificate = norm_incr / step_size 152 | 153 | if ls and h_Lipschitz is not None: 154 | if h_Lipschitz == 0: 155 | step_size = step_size * 1.02 156 | else: 157 | quot = h_Lipschitz ** 2 158 | tmp = np.sqrt(step_size ** 2 + (2 * step_size / quot) * (-ls_tol)) 159 | step_size = min(tmp, step_size * 1.02) 160 | 161 | if callback is not None: 162 | if callback(locals()) is False: 163 | break 164 | 165 | if it > 0 and certificate < tol: 166 | success = True 167 | break 168 | 169 | return optimize.OptimizeResult( 170 | x=x, success=success, nit=it, certificate=certificate, step_size=step_size 171 | ) 172 | 173 | 174 | def minimize_primal_dual( 175 | f_grad, 176 | x0, 177 | prox_1=None, 178 | prox_2=None, 179 | L=None, 180 | tol=1e-12, 181 | max_iter=1000, 182 | callback=None, 183 | step_size=1.0, 184 | step_size2=None, 185 | line_search=True, 186 | max_iter_ls=20, 187 | verbose=0, 188 | ): 189 | """Primal-dual hybrid gradient splitting method. 
190 | 191 | This method for optimization problems of the form 192 | 193 | minimize_x f(x) + g(x) + h(L x) 194 | 195 | where f is a smooth function and g is a (possibly non-smooth) 196 | function for which the proximal operator is known. 197 | 198 | Args: 199 | f_grad: callable 200 | Returns the function value and gradient of the objective function. 201 | It should accept the optional argument return_gradient, and when False 202 | it should return only the function value. 203 | 204 | prox_1 : callable of the form prox_1(x, alpha) 205 | prox_1(x, alpha, *args) returns the proximal operator of g at x 206 | with parameter alpha. 207 | 208 | prox_2 : callable or None 209 | prox_2(y, alpha, *args) returns the proximal operator of h at y 210 | with parameter alpha. 211 | 212 | x0 : array-like 213 | Initial guess of solution. 214 | 215 | L : array-like or linear operator 216 | Linear operator inside the h term. It may be any of the following types: 217 | - ndarray 218 | - matrix 219 | - sparse matrix (e.g. csr_matrix, lil_matrix, etc.) 220 | - LinearOperator 221 | - An object with .shape and .matvec attributes 222 | 223 | max_iter : int 224 | Maximum number of iterations. 225 | 226 | verbose : int 227 | Verbosity level, from 0 (no output) to 2 (output on each iteration) 228 | 229 | callback : callable. 230 | callback function (optional). Takes a single argument (x) with the 231 | current coefficients in the algorithm. The algorithm will exit if 232 | callback returns False. 233 | 234 | Returns: 235 | res : OptimizeResult 236 | The optimization result represented as a 237 | ``scipy.optimize.OptimizeResult`` object. Important attributes are: 238 | ``x`` the solution array, ``success`` a Boolean flag indicating if 239 | the optimizer exited successfully and ``message`` which describes 240 | the cause of the termination. See `scipy.optimize.OptimizeResult` 241 | for a description of other attributes. 242 | 243 | References: 244 | 245 | * Malitsky, Yura, and Thomas Pock. `A first-order primal-dual algorithm with linesearch `_, 246 | SIAM Journal on Optimization (2018) (Algorithm 4 for the line-search variant) 247 | 248 | * Condat, Laurent. "A primal-dual splitting method for convex optimization 249 | involving Lipschitzian, proximable and linear composite terms." Journal of 250 | Optimization Theory and Applications (2013). 251 | """ 252 | x = np.array(x0, copy=True) 253 | n_features = x.size 254 | 255 | if L is None: 256 | L = sparse.eye(n_features, n_features, format="csr") 257 | L = sparse.linalg.aslinearoperator(L) 258 | 259 | y = L.matvec(x) 260 | 261 | success = False 262 | if not max_iter_ls > 0: 263 | raise ValueError("Line search iterations need to be greater than 0") 264 | 265 | if prox_1 is None: 266 | 267 | def prox_1(x, step_size): 268 | return x 269 | 270 | if prox_2 is None: 271 | 272 | def prox_2(x, step_size): 273 | return x 274 | 275 | # conjugate of prox_2 276 | def prox_2_conj(x, ss): 277 | return x - ss * prox_2(x / ss, 1.0 / ss) 278 | 279 | # .. main iteration .. 
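    # .. each iteration: dual step y_next = prox of h* at y + tau * L x
    # .. (computed via Moreau's decomposition in prox_2_conj), then primal step
    # .. x_next = prox_1(x - sigma * (L^T y_bar + grad f(x)), sigma);
    # .. with line_search=True the dual step size tau is adapted following
    # .. Malitsky & Pock, Algorithm 4 ..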
280 | theta = 1.0 281 | delta = 0.5 282 | sigma = step_size 283 | if step_size2 is None: 284 | ss_ratio = 0.5 285 | tau = ss_ratio * sigma 286 | else: 287 | tau = step_size2 288 | ss_ratio = tau / sigma 289 | 290 | fk, grad_fk = f_grad(x) 291 | norm_incr = np.inf 292 | x_next = x.copy() 293 | 294 | for it in range(max_iter): 295 | y_next = prox_2_conj(y + tau * L.matvec(x), tau) 296 | if line_search: 297 | tau_next = tau * (1 + np.sqrt(1 + theta)) / 2 298 | while True: 299 | theta = tau_next / tau 300 | sigma = ss_ratio * tau_next 301 | y_bar = y_next + theta * (y_next - y) 302 | x_next = prox_1(x - sigma * (L.rmatvec(y_bar) + grad_fk), sigma) 303 | incr_x = np.linalg.norm(L.matvec(x_next) - L.matvec(x)) 304 | f_next, f_grad_next = f_grad(x_next) 305 | if incr_x <= 1e-10: 306 | break 307 | 308 | tmp = (sigma * tau_next) * (incr_x ** 2) 309 | tmp += 2 * sigma * (f_next - fk - grad_fk.dot(x_next - x)) 310 | if tmp / delta <= (incr_x ** 2): 311 | tau = tau_next 312 | break 313 | else: 314 | tau_next *= 0.9 315 | else: 316 | y_bar = 2 * y_next - y 317 | x_next = prox_1(x - sigma * (L.rmatvec(y_bar) + grad_fk), sigma) 318 | f_next, f_grad_next = f_grad(x_next) 319 | 320 | if it % 100 == 0: 321 | norm_incr = linalg.norm(x_next - x) + linalg.norm(y_next - y) 322 | 323 | x[:] = x_next[:] 324 | y[:] = y_next[:] 325 | fk, grad_fk = f_next, f_grad_next 326 | 327 | if norm_incr < tol: 328 | success = True 329 | break 330 | 331 | if callback is not None: 332 | if callback(locals()) is False: 333 | break 334 | 335 | if it >= max_iter: 336 | warnings.warn( 337 | "proximal_gradient did not reach the desired tolerance level", 338 | RuntimeWarning, 339 | ) 340 | 341 | return optimize.OptimizeResult( 342 | x=x, success=success, nit=it, certificate=norm_incr, step_size=sigma 343 | ) 344 | -------------------------------------------------------------------------------- /copt/tv_prox.py: -------------------------------------------------------------------------------- 1 | # Authors: Fabian Pedregosa. Code for total variation is based on the 2 | # code of Laurent Condat 3 | # 4 | 5 | """ 6 | These are implementations of some proximal operators 7 | """ 8 | 9 | import numpy as np 10 | import warnings 11 | from . import utils 12 | 13 | 14 | def prox_tv1d(w, step_size): 15 | """ 16 | Computes the proximal operator of the 1-dimensional total variation operator. 17 | 18 | This solves a problem of the form 19 | 20 | argmin_x TV(x) + (1/(2 stepsize)) ||x - w||^2 21 | 22 | where TV(x) is the one-dimensional total variation 23 | 24 | Parameters 25 | ---------- 26 | w: array 27 | vector of coefficients 28 | step_size: float 29 | step size (sometimes denoted gamma) in proximal objective function 30 | 31 | References 32 | ---------- 33 | Condat, Laurent. "A direct algorithm for 1D total variation denoising." 
34 | IEEE Signal Processing Letters (2013) 35 | """ 36 | 37 | if w.dtype not in (np.float32, np.float64): 38 | raise ValueError("argument w must be array of floats") 39 | w = w.copy() 40 | output = np.empty_like(w) 41 | _prox_tv1d(step_size, w, output) 42 | return output 43 | 44 | 45 | @utils.njit 46 | def _prox_tv1d(step_size, input, output): 47 | """low level function call, no checks are performed""" 48 | width = input.size + 1 49 | index_low = np.zeros(width, dtype=np.int32) 50 | slope_low = np.zeros(width, dtype=input.dtype) 51 | index_up = np.zeros(width, dtype=np.int32) 52 | slope_up = np.zeros(width, dtype=input.dtype) 53 | index = np.zeros(width, dtype=np.int32) 54 | z = np.zeros(width, dtype=input.dtype) 55 | y_low = np.empty(width, dtype=input.dtype) 56 | y_up = np.empty(width, dtype=input.dtype) 57 | s_low, c_low, s_up, c_up, c = 0, 0, 0, 0, 0 58 | y_low[0] = y_up[0] = 0 59 | y_low[1] = input[0] - step_size 60 | y_up[1] = input[0] + step_size 61 | incr = 1 62 | 63 | for i in range(2, width): 64 | y_low[i] = y_low[i - 1] + input[(i - 1) * incr] 65 | y_up[i] = y_up[i - 1] + input[(i - 1) * incr] 66 | 67 | y_low[width - 1] += step_size 68 | y_up[width - 1] -= step_size 69 | slope_low[0] = np.inf 70 | slope_up[0] = -np.inf 71 | z[0] = y_low[0] 72 | 73 | for i in range(1, width): 74 | c_low += 1 75 | c_up += 1 76 | index_low[c_low] = index_up[c_up] = i 77 | slope_low[c_low] = y_low[i] - y_low[i - 1] 78 | while (c_low > s_low + 1) and ( 79 | slope_low[max(s_low, c_low - 1)] <= slope_low[c_low] 80 | ): 81 | c_low -= 1 82 | index_low[c_low] = i 83 | if c_low > s_low + 1: 84 | slope_low[c_low] = (y_low[i] - y_low[index_low[c_low - 1]]) / ( 85 | i - index_low[c_low - 1] 86 | ) 87 | else: 88 | slope_low[c_low] = (y_low[i] - z[c]) / (i - index[c]) 89 | 90 | slope_up[c_up] = y_up[i] - y_up[i - 1] 91 | while (c_up > s_up + 1) and (slope_up[max(c_up - 1, s_up)] >= slope_up[c_up]): 92 | c_up -= 1 93 | index_up[c_up] = i 94 | if c_up > s_up + 1: 95 | slope_up[c_up] = (y_up[i] - y_up[index_up[c_up - 1]]) / ( 96 | i - index_up[c_up - 1] 97 | ) 98 | else: 99 | slope_up[c_up] = (y_up[i] - z[c]) / (i - index[c]) 100 | 101 | while ( 102 | (c_low == s_low + 1) 103 | and (c_up > s_up + 1) 104 | and (slope_low[c_low] >= slope_up[s_up + 1]) 105 | ): 106 | c += 1 107 | s_up += 1 108 | index[c] = index_up[s_up] 109 | z[c] = y_up[index[c]] 110 | index_low[s_low] = index[c] 111 | slope_low[c_low] = (y_low[i] - z[c]) / (i - index[c]) 112 | while ( 113 | (c_up == s_up + 1) 114 | and (c_low > s_low + 1) 115 | and (slope_up[c_up] <= slope_low[s_low + 1]) 116 | ): 117 | c += 1 118 | s_low += 1 119 | index[c] = index_low[s_low] 120 | z[c] = y_low[index[c]] 121 | index_up[s_up] = index[c] 122 | slope_up[c_up] = (y_up[i] - z[c]) / (i - index[c]) 123 | 124 | for i in range(1, c_low - s_low + 1): 125 | index[c + i] = index_low[s_low + i] 126 | z[c + i] = y_low[index[c + i]] 127 | c = c + c_low - s_low 128 | j, i = 0, 1 129 | while i <= c: 130 | a = (z[i] - z[i - 1]) / (index[i] - index[i - 1]) 131 | while j < index[i]: 132 | output[j * incr] = a 133 | output[j * incr] = a 134 | j += 1 135 | i += 1 136 | return 137 | 138 | 139 | @utils.njit 140 | def prox_tv1d_cols(stepsize, a, n_rows, n_cols): 141 | """apply prox_tv1d along columns of the matri a 142 | """ 143 | A = a.reshape((n_rows, n_cols)) 144 | out = np.empty_like(A) 145 | for i in range(n_cols): 146 | _prox_tv1d(stepsize, A[:, i], out[:, i]) 147 | return out.ravel() 148 | 149 | 150 | @utils.njit 151 | def prox_tv1d_rows(stepsize, a, n_rows, n_cols): 152 | 
"""apply prox_tv1d along rows of the matri a 153 | """ 154 | A = a.reshape((n_rows, n_cols)) 155 | out = np.empty_like(A) 156 | for i in range(n_rows): 157 | _prox_tv1d(stepsize, A[i, :], out[i, :]) 158 | return out.ravel() 159 | 160 | 161 | def c_prox_tv2d(step_size, x, n_rows, n_cols, max_iter, tol): 162 | """ 163 | Proximal Dykstra to minimize a 2-dimensional total variation. 164 | 165 | Reference: Algorithm 7 in https://arxiv.org/abs/1411.0589 166 | """ 167 | n_features = n_rows * n_cols 168 | p = np.zeros(n_features) 169 | q = np.zeros(n_features) 170 | 171 | for it in range(max_iter): 172 | y = x + p 173 | y = prox_tv1d_cols(step_size, y, n_rows, n_cols) 174 | p += x - y 175 | x = y + q 176 | x = prox_tv1d_rows(step_size, x, n_rows, n_cols) 177 | q += y - x 178 | 179 | # check convergence 180 | accuracy = np.max(np.abs(y - x)) 181 | if accuracy < tol: 182 | break 183 | else: 184 | warnings.warn( 185 | "prox_tv2d did not converged to desired accuracy\n" 186 | + "Accuracy reached: %s" % accuracy 187 | ) 188 | return x 189 | 190 | 191 | def prox_tv2d(w, step_size, n_rows, n_cols, max_iter=500, tol=1e-6): 192 | """ 193 | Computes the proximal operator of the 2-dimensional total variation operator. 194 | 195 | This solves a problem of the form 196 | 197 | argmin_x TV(x) + (1/(2 stepsize)) ||x - w||^2 198 | 199 | where TV(x) is the two-dimensional total variation. It does so using the 200 | Douglas-Rachford algorithm [Barbero and Sra, 2014]. 201 | 202 | Parameters 203 | ---------- 204 | w: array 205 | vector of coefficients 206 | 207 | step_size: float 208 | step size (often denoted gamma) in proximal objective function 209 | 210 | max_iter: int 211 | 212 | tol: float 213 | 214 | References 215 | ---------- 216 | Condat, Laurent. "A direct algorithm for 1D total variation denoising." 217 | IEEE Signal Processing Letters (2013) 218 | 219 | Barbero, Alvaro, and Suvrit Sra. "Modular proximal optimization for 220 | multidimensional total-variation regularization." arXiv preprint 221 | arXiv:1411.0589 (2014). 222 | """ 223 | 224 | x = w.copy().astype(np.float64) 225 | return c_prox_tv2d(step_size, x, n_rows, n_cols, max_iter, tol) 226 | 227 | 228 | def tv2d_linear_operator(n_rows, n_cols): 229 | """ 230 | Return the linear operator L such ||L x||_1 is the 2D total variation norm. 
231 | 232 | Parameters 233 | ---------- 234 | n_rows 235 | n_cols 236 | 237 | Returns 238 | ------- 239 | 240 | """ 241 | 242 | L = [] 243 | for i in range(n_rows): 244 | for j in range(n_cols): 245 | if i < n_rows - 1: 246 | tmp1 = np.zeros((n_rows, n_cols)) 247 | tmp1[i, j] = 1 248 | tmp1[i + 1, j] = -1 249 | L.append(tmp1.ravel()) 250 | 251 | if j < n_cols - 1: 252 | tmp2 = np.zeros((n_rows, n_cols)) 253 | tmp2[i, j] = 1 254 | tmp2[i, j + 1] = -1 255 | L.append(tmp2.ravel()) 256 | return np.array(L) 257 | -------------------------------------------------------------------------------- /copt/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy import sparse 3 | from scipy import optimize 4 | from datetime import datetime 5 | from sklearn.utils.extmath import safe_sparse_dot 6 | 7 | try: 8 | from numba import njit, prange 9 | except ImportError: 10 | from functools import wraps 11 | 12 | def njit(*args, **kw): 13 | if len(args) == 1 and len(kw) == 0 and hasattr(args[0], "__call__"): 14 | func = args[0] 15 | 16 | @wraps(func) 17 | def inner_function(*args, **kwargs): 18 | return func(*args, **kwargs) 19 | 20 | return inner_function 21 | else: 22 | 23 | def inner_function(function): 24 | @wraps(function) 25 | def wrapper(*args, **kwargs): 26 | return function(*args, **kwargs) 27 | 28 | return wrapper 29 | 30 | return inner_function 31 | 32 | prange = range 33 | 34 | 35 | def build_func_grad(jac, fun, args, eps): 36 | if not callable(jac): 37 | if bool(jac): 38 | fun = optimize._optimize.MemoizeJac(fun) 39 | jac = fun.derivative 40 | elif jac == "2-point": 41 | jac = None 42 | else: 43 | raise NotImplementedError("jac has unexpected value.") 44 | 45 | if jac is None: 46 | 47 | def func_and_grad(x): 48 | f = fun(x, *args) 49 | g = optimize._approx_fprime_helper(x, fun, eps, args=args, f0=f) 50 | 51 | else: 52 | 53 | def func_and_grad(x): 54 | f = fun(x, *args) 55 | g = jac(x, *args) 56 | return f, g 57 | return func_and_grad 58 | 59 | 60 | def safe_sparse_add(a, b): 61 | if sparse.issparse(a) and sparse.issparse(b): 62 | # both are sparse, keep the result sparse 63 | return a + b 64 | else: 65 | # one of them is non-sparse, convert 66 | # everything to dense. 67 | if sparse.issparse(a): 68 | a = a.toarray() 69 | if a.ndim == 2 and b.ndim == 1: 70 | b.ravel() 71 | elif sparse.issparse(b): 72 | b = b.toarray() 73 | if b.ndim == 2 and a.ndim == 1: 74 | b = b.ravel() 75 | return a + b 76 | 77 | 78 | @njit(parallel=True) 79 | def sample_batches(n_samples, n_batches, batch_size): 80 | idx = np.zeros(n_batches * batch_size, dtype=np.int32) 81 | for k in prange(n_batches): 82 | idx[k * batch_size:(k + 1) * batch_size] = np.random.choice(n_samples, size=batch_size, replace=False) 83 | return idx 84 | 85 | 86 | @njit(nogil=True) 87 | def fast_csr_vm(x, data, indptr, indices, d, idx): 88 | """ 89 | Returns the vector matrix product x * M[idx]. M is described 90 | in the csr format. 
91 | 92 | Returns x * M[idx] 93 | 94 | x: 1-d iterable 95 | data: data field of a scipy.sparse.csr_matrix 96 | indptr: indptr field of a scipy.sparse.csr_matrix 97 | indices: indices field of a scipy.sparse.csr_matrix 98 | d: output dimension 99 | idx: 1-d iterable: index of the sparse.csr_matrix 100 | """ 101 | res = np.zeros(d) 102 | assert x.shape[0] == len(idx) 103 | for k, i in np.ndenumerate(idx): 104 | for j in range(indptr[i], indptr[i+1]): 105 | j_idx = indices[j] 106 | res[j_idx] += x[k] * data[j] 107 | return res 108 | 109 | 110 | @njit(nogil=True) 111 | def fast_csr_mv(data, indptr, indices, x, idx): 112 | """ 113 | Returns the matrix vector product M[idx] * x. M is described 114 | in the csr format. 115 | 116 | data: data field of a scipy.sparse.csr_matrix 117 | indptr: indptr field of a scipy.sparse.csr_matrix 118 | indices: indices field of a scipy.sparse.csr_matrix 119 | x: 1-d iterable 120 | idx: 1-d iterable: index of the sparse.csr_matrix 121 | """ 122 | 123 | res = np.zeros(len(idx)) 124 | for i, row_idx in np.ndenumerate(idx): 125 | for k, j in enumerate(range(indptr[row_idx], indptr[row_idx+1])): 126 | j_idx = indices[j] 127 | res[i] += x[j_idx] * data[j] 128 | return res 129 | 130 | 131 | def parse_step_size(step_size): 132 | if hasattr(step_size, "__len__") and len(step_size) == 2: 133 | return step_size[0], step_size[1] 134 | elif isinstance(step_size, float): 135 | return step_size, "fixed" 136 | elif hasattr(step_size, "__call__") or step_size == "adaptive": 137 | # without other information start with a step-size of one 138 | return 1, step_size 139 | else: 140 | raise ValueError("Could not understand value step_size=%s" % step_size) 141 | 142 | 143 | class Trace: 144 | """Trace callback.""" 145 | def __init__(self, f=None, freq=1): 146 | self.trace_x = [] 147 | self.trace_time = [] 148 | self.trace_fx = [] 149 | self.trace_step_size = [] 150 | self.start = datetime.now() 151 | self._counter = 0 152 | self.freq = int(freq) 153 | self.f = f 154 | 155 | def __call__(self, dl): 156 | if self._counter % self.freq == 0: 157 | if self.f is not None: 158 | self.trace_fx.append(self.f(dl["x"])) 159 | else: 160 | self.trace_x.append(dl["x"].copy()) 161 | delta = (datetime.now() - self.start).total_seconds() 162 | self.trace_time.append(delta) 163 | self.trace_step_size.append(dl["step_size"]) 164 | self._counter += 1 165 | 166 | 167 | def init_lipschitz(f_grad, x0): 168 | L0 = 1e-3 169 | f0, grad0 = f_grad(x0) 170 | if sparse.issparse(grad0) and not sparse.issparse(x0): 171 | x0 = sparse.csc_matrix(x0).T 172 | elif sparse.issparse(x0) and not sparse.issparse(grad0): 173 | grad0 = sparse.csc_matrix(grad0).T 174 | x_tilde = x0 - (1.0 / L0) * grad0 175 | f_tilde = f_grad(x_tilde)[0] 176 | for _ in range(100): 177 | if f_tilde <= f0: 178 | break 179 | L0 *= 10 180 | x_tilde = x0 - (1.0 / L0) * grad0 181 | f_tilde = f_grad(x_tilde)[0] 182 | return L0 183 | 184 | 185 | def get_max_lipschitz(A, loss, alpha=0): 186 | """ 187 | XXX DEPRECATED 188 | 189 | Estimate the max Lipschitz constant (as appears in 190 | many stochastic methods). 
191 | 192 | A : array-like 193 | 194 | loss : {"logloss", "square", "huber"} 195 | """ 196 | from sklearn.utils.extmath import row_norms 197 | 198 | max_squared_sum = row_norms(A, squared=True).max() 199 | 200 | if loss == "logloss": 201 | return 0.25 * max_squared_sum + alpha 202 | elif loss in ("huber", "square"): 203 | raise NotImplementedError 204 | raise NotImplementedError 205 | 206 | 207 | -------------------------------------------------------------------------------- /copt/utils_pytorch.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def make_func_and_grad(loss_func, shape, device, dtype=None): 4 | """Wraps loss_func to take and return numpy 1D arrays, for interfacing PyTorch and copt. 5 | 6 | Args: 7 | loss_func: callable 8 | PyTorch callable, taking a torch.Tensor a input, and returning a scalar 9 | 10 | shape: tuple(*int) 11 | shape of the optimization variable, as input to loss_func 12 | 13 | device: torch.Device 14 | device on which to send the optimization variable 15 | 16 | dtype: dtype 17 | data type for the torch.Tensor holding the optimization variable 18 | 19 | Returns: 20 | f_grad: callable 21 | function taking a 1D numpy array as input and returning (loss_val, grad_val): (float, array). 22 | """ 23 | def func_and_grad(x, return_gradient=True): 24 | x_tensor = torch.tensor(x, dtype=dtype) 25 | x_tensor = x_tensor.view(*shape) 26 | x_tensor = x_tensor.to(device) 27 | x_tensor.requires_grad = True 28 | 29 | loss = loss_func(x_tensor) 30 | loss.backward() 31 | if return_gradient: 32 | return loss.item(), x_tensor.grad.cpu().numpy().flatten() 33 | 34 | return loss.item() 35 | return func_and_grad 36 | 37 | # TODO: write generic function wrapping copt optimizers for taking pytorch input, 38 | # returning pytorch output for use of copt in a PyTorch pipeline -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | -rm -rf $(BUILDDIR)/* 51 | -rm -rf auto_examples/ 52 | -rm -rf generated/* 53 | -rm -rf modules/generated/* 54 | 55 | html: 56 | # These two lines make the build a bit more lengthy, and the 57 | # the embedding of images more robust 58 | rm -rf $(BUILDDIR)/html/_images 59 | #rm -rf _build/doctrees/ 60 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 61 | @echo 62 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 63 | 64 | dirhtml: 65 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 66 | @echo 67 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 68 | 69 | singlehtml: 70 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 71 | @echo 72 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 73 | 74 | pickle: 75 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 76 | @echo 77 | @echo "Build finished; now you can process the pickle files." 78 | 79 | json: 80 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 81 | @echo 82 | @echo "Build finished; now you can process the JSON files." 83 | 84 | htmlhelp: 85 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 86 | @echo 87 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 88 | ".hhp project file in $(BUILDDIR)/htmlhelp." 89 | 90 | qthelp: 91 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 92 | @echo 93 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 94 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 95 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/project-template.qhcp" 96 | @echo "To view the help file:" 97 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/project-template.qhc" 98 | 99 | devhelp: 100 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 101 | @echo 102 | @echo "Build finished." 
103 | @echo "To view the help file:" 104 | @echo "# mkdir -p $$HOME/.local/share/devhelp/project-template" 105 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/project-template" 106 | @echo "# devhelp" 107 | 108 | epub: 109 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 110 | @echo 111 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 112 | 113 | latex: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo 116 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 117 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 118 | "(use \`make latexpdf' here to do that automatically)." 119 | 120 | latexpdf: 121 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 122 | @echo "Running LaTeX files through pdflatex..." 123 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 124 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 125 | 126 | latexpdfja: 127 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 128 | @echo "Running LaTeX files through platex and dvipdfmx..." 129 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 130 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 131 | 132 | text: 133 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 134 | @echo 135 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 136 | 137 | man: 138 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 139 | @echo 140 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 141 | 142 | texinfo: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo 145 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 146 | @echo "Run \`make' in that directory to run these through makeinfo" \ 147 | "(use \`make info' here to do that automatically)." 148 | 149 | info: 150 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 151 | @echo "Running Texinfo files through makeinfo..." 152 | make -C $(BUILDDIR)/texinfo info 153 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 154 | 155 | gettext: 156 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 157 | @echo 158 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 159 | 160 | changes: 161 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 162 | @echo 163 | @echo "The overview file is in $(BUILDDIR)/changes." 164 | 165 | linkcheck: 166 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 167 | @echo 168 | @echo "Link check complete; look for any errors in the above output " \ 169 | "or in $(BUILDDIR)/linkcheck/output.txt." 170 | 171 | doctest: 172 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 173 | @echo "Testing of doctests in the sources finished, look at the " \ 174 | "results in $(BUILDDIR)/doctest/output.txt." 175 | 176 | xml: 177 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 178 | @echo 179 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 180 | 181 | pseudoxml: 182 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 183 | @echo 184 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 
185 | 186 | upload: 187 | cp -r _build/html/* ~/dev/copt_web/ && cd ~/dev/copt_web/ && git add * && git ci -a -m "update doc" && git push origin gh-pages 188 | 189 | apidoc: 190 | sphinx-apidoc -o source/ ../copt 191 | 192 | html-noplot: 193 | $(SPHINXBUILD) -D plot_gallery=0 -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 194 | @echo 195 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 196 | -------------------------------------------------------------------------------- /doc/_static/css/custom.css: -------------------------------------------------------------------------------- 1 | 2 | /* this makes inline code look prettier with a slight border around it */ 3 | pre { 4 | border: 1px solid #CCC; 5 | } 6 | 7 | div.admonition-proximal-gradient, 8 | div.admonition-frank-wolfe, 9 | div.admonition-stochastic-methods, 10 | div.admonition-examples { 11 | background-color: #d9edf7; 12 | border-color: #bce8f1; 13 | } 14 | 15 | div.admonition-proximal-gradient a, 16 | div.admonition-frank-wolfe a, 17 | div.admonition-stochastic-methods a, 18 | div.admonition-examples a { 19 | color: #3E4349; 20 | } -------------------------------------------------------------------------------- /doc/citing.rst: -------------------------------------------------------------------------------- 1 | .. _citing: 2 | 3 | Citing 4 | ====== 5 | 6 | If you use this software in a scientific publication, please consider citing it as 7 | 8 | .. code:: 9 | 10 | @article{copt, 11 | author = {Fabian Pedregosa}, 12 | title = {C-OPT: composite optimization in Python}, 13 | year = 2018, 14 | DOI = {10.5281/zenodo.1283339}, 15 | url={http://openopt.github.io/copt/} 16 | } -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | COPT: a Python library for Constrained OPTimization 2 | =================================================== 3 | 4 | .. image:: https://travis-ci.org/openopt/copt.svg?branch=master 5 | :target: https://travis-ci.org/openopt/copt 6 | .. image:: https://storage.googleapis.com/copt-doc/doc_status.svg 7 | :target: https://storage.googleapis.com/copt-doc/index.html 8 | .. image:: https://coveralls.io/repos/github/openopt/copt/badge.svg?branch=master 9 | :target: https://coveralls.io/github/openopt/copt?branch=master 10 | .. image:: https://storage.googleapis.com/copt-doc/pylint.svg 11 | :target: https://storage.googleapis.com/copt-doc/pylint.txt 12 | .. image:: https://zenodo.org/badge/46262908.svg 13 | :target: citing.html 14 | 15 | 16 | 17 | Life is too short to learn another API 18 | -------------------------------------- 19 | 20 | COPT is an optimization library that does not reinvent the wheel. It packs classical optimization algorithms in an API following that of `scipy.optimize `_. So if you've already used that library, you should feel right at ease. 21 | 22 | It provides: 23 | 24 | * State of the art implementation of classical optimization algorithms such as :ref:`proximal gradient descent ` and :ref:`Frank-Wolfe ` under a consistent API. 25 | * Few dependencies, pure Python library for easy deployment. 26 | * An :ref:`example gallery `. 27 | 28 | 29 | 30 | Contents 31 | ----------------------- 32 | 33 | The methods implements in copt can be categorized as: 34 | 35 | .. admonition:: Proximal-gradient 36 | 37 | These are methods that combine the gradient of a smooth term with the proximal operator of a potentially non-smooth term. 
38 | They can be used to solve problems involving one or several non-smooth terms. :ref:`Read more ...` 39 | 40 | .. admonition:: Frank-Wolfe 41 | 42 | Frank-Wolfe, also known as conditional gradient, are a family of methods to solve constrained optimization problems. Contrary to proximal-gradient methods, they don't require access to the projection onto the constraint set. :ref:`Read more ...` 43 | 44 | 45 | .. admonition:: Stochastic Methods 46 | 47 | Methods that can solve optimization problems with access only to a noisy evaluation of the objective. 48 | :ref:`Read more ...`. 49 | 50 | 51 | Installation 52 | ------------ 53 | 54 | If you already have a working installation of numpy and scipy, 55 | the easiest way to install copt is using ``pip`` :: 56 | 57 | pip install -U copt 58 | 59 | 60 | Alternatively, you can install the latest development from github with the command:: 61 | 62 | pip install git+https://github.com/openopt/copt.git 63 | 64 | 65 | 66 | Where to go from here? 67 | ---------------------- 68 | 69 | To know more about copt, check out our :ref:`example gallery ` or browse through the module reference using the left navigation bar. 70 | 71 | 72 | .. toctree:: 73 | :maxdepth: 2 74 | :hidden: 75 | 76 | solvers 77 | loss_functions 78 | auto_examples/index 79 | utils 80 | citing 81 | 82 | Last change: |today| 83 | -------------------------------------------------------------------------------- /doc/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openopt/copt/c0d5d46ae709f77b7dc1fc692bbe476aa63f029b/doc/logo.png -------------------------------------------------------------------------------- /doc/loss_functions.rst: -------------------------------------------------------------------------------- 1 | 2 | Loss, constraints and regularizers 3 | ================================== 4 | 5 | These are some convenience functions that implement common losses, constraints and regularizers. 6 | 7 | Smooth loss functions: 8 | 9 | 10 | .. autosummary:: 11 | :toctree: generated/ 12 | 13 | copt.loss.LogLoss 14 | copt.loss.SquareLoss 15 | copt.loss.HuberLoss 16 | 17 | Non-smooth terms accessed through their proximal operator 18 | 19 | .. autosummary:: 20 | :toctree: generated/ 21 | 22 | copt.penalty.L1Norm 23 | copt.penalty.GroupL1 24 | copt.penalty.TraceNorm 25 | copt.penalty.FusedLasso 26 | copt.penalty.TotalVariation2D 27 | 28 | Constraints can be incorporated in a similar way through 29 | 30 | 31 | .. autosummary:: 32 | :toctree: generated/ 33 | 34 | copt.constraint.L1Ball 35 | copt.constraint.L2Ball 36 | copt.constraint.LinfBall 37 | copt.constraint.TraceBall 38 | -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. 
singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | goto end 41 | ) 42 | 43 | if "%1" == "clean" ( 44 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 45 | del /q /s %BUILDDIR%\* 46 | goto end 47 | ) 48 | 49 | 50 | %SPHINXBUILD% 2> nul 51 | if errorlevel 9009 ( 52 | echo. 53 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 54 | echo.installed, then set the SPHINXBUILD environment variable to point 55 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 56 | echo.may add the Sphinx directory to PATH. 57 | echo. 58 | echo.If you don't have Sphinx installed, grab it from 59 | echo.http://sphinx-doc.org/ 60 | exit /b 1 61 | ) 62 | 63 | if "%1" == "html" ( 64 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 68 | goto end 69 | ) 70 | 71 | if "%1" == "dirhtml" ( 72 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 76 | goto end 77 | ) 78 | 79 | if "%1" == "singlehtml" ( 80 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 84 | goto end 85 | ) 86 | 87 | if "%1" == "pickle" ( 88 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can process the pickle files. 92 | goto end 93 | ) 94 | 95 | if "%1" == "json" ( 96 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 97 | if errorlevel 1 exit /b 1 98 | echo. 99 | echo.Build finished; now you can process the JSON files. 100 | goto end 101 | ) 102 | 103 | if "%1" == "htmlhelp" ( 104 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 105 | if errorlevel 1 exit /b 1 106 | echo. 107 | echo.Build finished; now you can run HTML Help Workshop with the ^ 108 | .hhp project file in %BUILDDIR%/htmlhelp. 109 | goto end 110 | ) 111 | 112 | if "%1" == "qthelp" ( 113 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 114 | if errorlevel 1 exit /b 1 115 | echo. 
116 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 117 | .qhcp project file in %BUILDDIR%/qthelp, like this: 118 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\project-template.qhcp 119 | echo.To view the help file: 120 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\project-template.ghc 121 | goto end 122 | ) 123 | 124 | if "%1" == "devhelp" ( 125 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished. 129 | goto end 130 | ) 131 | 132 | if "%1" == "epub" ( 133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 137 | goto end 138 | ) 139 | 140 | if "%1" == "latex" ( 141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 145 | goto end 146 | ) 147 | 148 | if "%1" == "latexpdf" ( 149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 150 | cd %BUILDDIR%/latex 151 | make all-pdf 152 | cd %BUILDDIR%/.. 153 | echo. 154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 155 | goto end 156 | ) 157 | 158 | if "%1" == "latexpdfja" ( 159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 160 | cd %BUILDDIR%/latex 161 | make all-pdf-ja 162 | cd %BUILDDIR%/.. 163 | echo. 164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 165 | goto end 166 | ) 167 | 168 | if "%1" == "text" ( 169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 170 | if errorlevel 1 exit /b 1 171 | echo. 172 | echo.Build finished. The text files are in %BUILDDIR%/text. 173 | goto end 174 | ) 175 | 176 | if "%1" == "man" ( 177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 178 | if errorlevel 1 exit /b 1 179 | echo. 180 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 181 | goto end 182 | ) 183 | 184 | if "%1" == "texinfo" ( 185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 186 | if errorlevel 1 exit /b 1 187 | echo. 188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 189 | goto end 190 | ) 191 | 192 | if "%1" == "gettext" ( 193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 194 | if errorlevel 1 exit /b 1 195 | echo. 196 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 197 | goto end 198 | ) 199 | 200 | if "%1" == "changes" ( 201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 202 | if errorlevel 1 exit /b 1 203 | echo. 204 | echo.The overview file is in %BUILDDIR%/changes. 205 | goto end 206 | ) 207 | 208 | if "%1" == "linkcheck" ( 209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 210 | if errorlevel 1 exit /b 1 211 | echo. 212 | echo.Link check complete; look for any errors in the above output ^ 213 | or in %BUILDDIR%/linkcheck/output.txt. 214 | goto end 215 | ) 216 | 217 | if "%1" == "doctest" ( 218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 219 | if errorlevel 1 exit /b 1 220 | echo. 221 | echo.Testing of doctests in the sources finished, look at the ^ 222 | results in %BUILDDIR%/doctest/output.txt. 223 | goto end 224 | ) 225 | 226 | if "%1" == "xml" ( 227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 228 | if errorlevel 1 exit /b 1 229 | echo. 230 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 
231 | goto end 232 | ) 233 | 234 | if "%1" == "pseudoxml" ( 235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 236 | if errorlevel 1 exit /b 1 237 | echo. 238 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 239 | goto end 240 | ) 241 | 242 | :end 243 | -------------------------------------------------------------------------------- /doc/paper/biblio.bib: -------------------------------------------------------------------------------- 1 | @article{virtanen2019scipy, 2 | title={SciPy 1.0--Fundamental Algorithms for Scientific Computing in Python}, 3 | author={Virtanen, Pauli and Gommers, Ralf and Oliphant, Travis E and Haberland, Matt and Reddy, Tyler and Cournapeau, David and Burovski, Evgeni and Peterson, Pearu and Weckesser, Warren and Bright, Jonathan and others}, 4 | journal={arXiv preprint arXiv:1907.10121}, 5 | year={2019} 6 | } 7 | 8 | @article{pedregosa2011scikit, 9 | title={Scikit-learn: Machine learning in Python}, 10 | author={Pedregosa, Fabian and Varoquaux, Ga{\"e}l and Gramfort, Alexandre and Michel, Vincent and Thirion, Bertrand and Grisel, Olivier and Blondel, Mathieu and Prettenhofer, Peter and Weiss, Ron and Dubourg, Vincent and others}, 11 | journal={Journal of machine learning research}, 12 | volume={12}, 13 | number={Oct}, 14 | pages={2825--2830}, 15 | year={2011} 16 | } -------------------------------------------------------------------------------- /doc/paper/index.tex: -------------------------------------------------------------------------------- 1 | \documentclass[twoside,11pt]{article} 2 | 3 | \usepackage[nohyperref, preprint]{jmlr2e} 4 | 5 | \usepackage{amsmath} 6 | % \usepackage{amsthm} 7 | \usepackage{amssymb} 8 | \usepackage{empheq} 9 | \usepackage{xcolor, color, colortbl} 10 | \usepackage{mdframed} 11 | \usepackage{pifont} 12 | \newcommand{\cmark}{\ding{51}}% 13 | \newcommand{\xmark}{\ding{55}}% 14 | \usepackage{enumitem} 15 | % For figures 16 | \usepackage{graphicx} % more modern 17 | 18 | 19 | \newcommand{\blue}{\color{blue}} 20 | 21 | \definecolor{mydarkblue}{rgb}{0,0.08,0.45} 22 | \usepackage[colorlinks=true, 23 | linkcolor=mydarkblue, 24 | citecolor=mydarkblue, 25 | filecolor=mydarkblue, 26 | urlcolor=mydarkblue, 27 | pdfview=FitH]{hyperref} 28 | 29 | \graphicspath{{./figures/}} 30 | 31 | 32 | \jmlrheading{1}{2019}{1-48}{4/00}{10/00}{X}{Authors} 33 | 34 | % Short headings should be running head and authors last names 35 | 36 | \ShortHeadings{C-OPT: Composite Optimization in Python}{Pedregosa} 37 | \firstpageno{1} 38 | 39 | 40 | \begin{document} 41 | 42 | \title{C-OPT: Composite Optimization in Python} 43 | \author{\name Fabian Pedregosa \email pedregosa@google.com \\ 44 | \addr Google Research\\ 45 | } 46 | \editor{} 47 | 48 | 49 | \maketitle 50 | 51 | 52 | \begin{abstract} 53 | \emph{copt} is a Python library integrating a wide range of classical optimization algorithm for medium-scale problems. By packaging a wide array of optimization algorithms into a consistent API, this package focuses on brining optimization algorithms to practitioners. Emphasis is on robustness, doocumentation, performance and API consistency. It has minimal dependencies and is distributed under the Apache-2.0 license, encouraging its use in both academic and commercial settings. 54 | \end{abstract} 55 | 56 | \begin{keywords} 57 | optimization, python 58 | \end{keywords} 59 | 60 | \section{Introduction} 61 | 62 | {\blue Big environment Python for scientific computing. 
\newcommand{\blue}{\color{blue}} 63 | } 64 | 65 | \section{Project Vision} 66 | 67 | \paragraph{Code quality.} 68 | 69 | \paragraph{Bare-bones design and API.} 70 | 71 | \paragraph{Documentation.} 72 | 73 | \paragraph{Apache license.} 74 | 75 | 76 | \section{Underlying Technologies} 77 | 78 | \citep{virtanen2019scipy} 79 | 80 | \citep{pedregosa2011scikit} 81 | 82 | \section{Computational} 83 | 84 | \bibliography{biblio} 85 | 86 | 87 | \end{document} 88 | -------------------------------------------------------------------------------- /doc/solvers.rst: -------------------------------------------------------------------------------- 1 | .. _proximal_gradient: 2 | 3 | Solvers 4 | ======= 5 | 6 | 7 | Proximal-Gradient 8 | ----------------- 9 | 10 | .. autosummary:: 11 | :toctree: generated/ 12 | 13 | copt.minimize_proximal_gradient 14 | 15 | The proximal-gradient method [BT2009]_, [N2013]_ is a method to solve problems of the form 16 | 17 | .. math:: 18 | \argmin_{\bs{x} \in \mathbb{R}^d} f(\bs{x}) + g(\bs{x}) 19 | 20 | 21 | where $f$ is a differentiable function for which we have access to its gradient and $g$ is a potentially non-smooth function for which we have access to its proximal operator. 22 | 23 | 24 | .. admonition:: Examples 25 | 26 | * :ref:`sphx_glr_auto_examples_plot_group_lasso.py` 27 | 28 | 29 | .. topic:: References 30 | 31 | .. [BT2009] Beck, Amir, and Marc Teboulle. `"Gradient-based algorithms with applications to signal recovery." `_ Convex optimization in signal processing and communications (2009) 32 | 33 | .. [N2013] Nesterov, Yu. `"Gradient methods for minimizing composite functions." `_ Mathematical Programming 140.1 (2013): 125-161. 34 | 35 | 36 | Primal-dual hybrid gradient 37 | --------------------------- 38 | 39 | .. autosummary:: 40 | :toctree: generated/ 41 | 42 | copt.minimize_primal_dual 43 | 44 | 45 | The primal-dual hybrid gradient method [C2013]_ [V2013]_ [CP2016]_ is a method to solve problems of the form 46 | 47 | .. math:: 48 | \argmin_{\bs{x} \in \mathbb{R}^d} f(\bs{x}) + g(\bs{x}) + h(\bs{A}\bs{x}) 49 | 50 | where $f$ is a differentiable function for which we have access to its gradient and $g$ and $h$ are potentially non-smooth functions for which we have access to their proximal operator. 51 | 52 | 53 | 54 | .. admonition:: Examples 55 | 56 | * :ref:`sphx_glr_auto_examples_proximal_splitting_plot_tv_deblurring.py` 57 | * :ref:`sphx_glr_auto_examples_proximal_splitting_plot_overlapping_group_lasso.py` 58 | 59 | 60 | .. topic:: References 61 | 62 | .. [C2013] Condat, Laurent. "A primal–dual splitting method for convex optimization involving Lipschitzian, proximable and linear composite terms." Journal of Optimization Theory and Applications 158.2 (2013): 460-479. 63 | 64 | .. [V2013] Vũ, Bằng Công. "A splitting algorithm for dual monotone inclusions involving cocoercive operators." Advances in Computational Mathematics 38.3 (2013) 65 | 66 | .. [CP2016] Chambolle, Antonin, and Thomas Pock. "An introduction to continuous optimization for imaging." Acta Numerica 25 (2016) 67 | 68 | 69 | Three-operator splitting 70 | ------------------------ 71 | 72 | 73 | .. autosummary:: 74 | :toctree: generated/ 75 | 76 | copt.minimize_three_split 77 | 78 | 79 | The three operator splitting [DY2017]_ [PG2018]_ is a method to solve problems of the form 80 | 81 | .. 
math:: 82 | \argmin_{\bs{x} \in \mathbb{R}^d} f(\bs{x}) + g(\bs{x}) + h(\bs{x}) 83 | 84 | where $f$ is a differentiable function for which we have access to its gradient and $g$ and $h$ are potentially non-smooth functions for which we have access to their proximal operator. 85 | 86 | 87 | .. admonition:: Examples 88 | 89 | * :ref:`sphx_glr_auto_examples_proximal_splitting_plot_sparse_nuclear_norm.py` 90 | * :ref:`sphx_glr_auto_examples_proximal_splitting_plot_tv_deblurring.py` 91 | * :ref:`sphx_glr_auto_examples_proximal_splitting_plot_overlapping_group_lasso.py` 92 | 93 | 94 | .. topic:: References 95 | 96 | .. [DY2017] Davis, Damek, and Wotao Yin. `"A three-operator splitting scheme and 97 | its optimization applications." 98 | `_ Set-Valued and Variational 99 | Analysis, 2017. 100 | 101 | .. [PG2018] Pedregosa, Fabian, and Gauthier Gidel. `"Adaptive Three Operator 102 | Splitting." `_ Proceedings of the 35th 103 | International Conference on Machine Learning, 2018. 104 | 105 | 106 | .. _frank_wolfe: 107 | 108 | Frank-Wolfe 109 | ----------- 110 | 111 | .. autosummary:: 112 | :toctree: generated/ 113 | 114 | copt.minimize_frank_wolfe 115 | 116 | 117 | The Frank-Wolfe (FW) or conditional gradient algorithm [J2003]_, [P2018]_, [PANJ2018]_ is a method for constrained optimization. It can solve problems of the form 118 | 119 | .. math:: 120 | \argmin_{\bs{x} \in \mathcal{D}} f(\bs{x}) 121 | 122 | where :math:`f` is a differentiable function for which we have access to its gradient and :math:`\mathcal{D}` is a compact set for which we have access to its linear minimization oracle (LMO). This is a routine that, given a vector :math:`\bs{u}`, returns a solution to 123 | 124 | .. math:: 125 | \argmin_{\bs{x} \in D}\, \langle\bs{u}, \bs{x}\rangle~. 126 | 127 | 128 | Contrary to other constrained optimization algorithms like projected gradient descent, the Frank-Wolfe algorithm does not require access to a projection, which is why it is sometimes referred to as a projection-free algorithm. It instead relies exclusively on the linear minimization oracle described above. 129 | 130 | 131 | .. TODO describe the LMO API in more detail 132 | 133 | 134 | The Frank-Wolfe algorithm is implemented in this library in the method :meth:`copt.minimize_frank_wolfe`. Like most other methods, it takes as argument an objective function to minimize, but unlike most other methods, it requires access to a *linear minimization oracle*, which is a routine that for a given $d$-dimensional vector :math:`\bs{u}` solves the linear problem :math:`\argmin_{\bs{z} \in D}\, \langle \bs{u}, \bs{z}\rangle`. 135 | 136 | 137 | At each iteration, the Frank-Wolfe algorithm uses the linear minimization oracle to identify the vertex :math:`\bs{s}_t` that correlates most with the negative gradient. The next iterate :math:`\bs{x}^+` is then constructed as a convex combination of the current iterate :math:`\bs{x}` and the newly acquired vertex :math:`\bs{s}`: 138 | 139 | 140 | .. math:: 141 | \boldsymbol{x}^+ = (1 - \gamma)\boldsymbol{x} + \gamma \boldsymbol{s} 142 | 143 | 144 | 145 | The step-size :math:`\gamma` can be chosen by different strategies: 146 | 147 | * **Backtracking line-search**. This is the default option and corresponds to the keyword argument :code:`step_size="backtracking"`. This is typically the fastest and simplest method; if unsure, use this option. 148 | 149 | * **Demyanov-Rubinov step-size**. This is a step-size of the form 150 | 151 | ..
math:: 152 | \gamma = \langle -\nabla f(\bs{x}), \bs{s} - \bs{x}\rangle / (L \|\bs{s} - \bs{x}\|^2)~. 153 | 154 | 155 | 156 | This step-size typically performs well but has the drawback that it requires knowledge of the Lipschitz constant of :math:`\nabla f`. This step-size can be used with the keyword argument :code:`step_size="DR"`. In this case the Lipschitz 157 | constant :math:`L` needs to be specified through the keyword argument :code:`lipschitz`. For example, if the Lipschitz constant is 0.1, then the signature should include :code:`step_size="DR", lipschitz=0.1`. 158 | 159 | 160 | * **Oblivious step-size**. This is a very simple step-size of the form 161 | 162 | .. math:: 163 | \gamma = \frac{2}{t+2}~, 164 | 165 | where :math:`t` is the number of iterations. This step-size is oblivious since it doesn't use any information about the objective. It typically performs worse than the alternatives, but is simple to implement and can be competitive in the case of noisy objectives. 166 | 167 | 168 | .. admonition:: Examples 169 | 170 | * :ref:`sphx_glr_auto_examples_frank_wolfe_plot_sparse_benchmark.py` 171 | * :ref:`sphx_glr_auto_examples_frank_wolfe_plot_vertex_overlap.py` 172 | * :ref:`sphx_glr_auto_examples_frank_wolfe_plot_sparse_benchmark_pairwise.py` 173 | 174 | 175 | 176 | .. topic:: References: 177 | 178 | .. [J2003] Jaggi, Martin. `"Revisiting Frank-Wolfe: Projection-Free Sparse Convex Optimization." `_ ICML 2013. 179 | 180 | .. [P2018] Pedregosa, Fabian. `"Notes on the Frank-Wolfe Algorithm" `_, 2018. 181 | 182 | .. [PANJ2018] Pedregosa, Fabian, Armin Askari, Geoffrey Negiar, and Martin Jaggi. `"Step-Size Adaptivity in Projection-Free Optimization." `_ arXiv:1806.05123 (2018). 183 | 184 | 185 | .. [LJ2015] Lacoste-Julien, Simon, and Martin Jaggi. `"On the global linear convergence of Frank-Wolfe optimization variants." `_ Advances in Neural Information Processing Systems. 2015. 186 | 187 | 188 | 189 | 190 | .. _stochastic_methods: 191 | 192 | Stochastic methods 193 | ------------------ 194 | 195 | .. autosummary:: 196 | :toctree: generated/ 197 | 198 | copt.minimize_saga 199 | copt.minimize_svrg 200 | copt.minimize_vrtos 201 | copt.minimize_sfw 202 | 203 | 204 | .. topic:: Examples: 205 | 206 | * :ref:`sphx_glr_auto_examples_plot_saga_vs_svrg.py` 207 | * :ref:`sphx_glr_auto_examples_frank_wolfe_plot_sfw.py` 208 | * :ref:`sphx_glr_auto_examples_frank_wolfe_plot_sfw_real_data.py` 209 | 210 | -------------------------------------------------------------------------------- /doc/sphinx_ext/github_link.py: -------------------------------------------------------------------------------- 1 | from operator import attrgetter 2 | import inspect 3 | import subprocess 4 | import os 5 | import sys 6 | from functools import partial 7 | 8 | REVISION_CMD = 'git rev-parse --short HEAD' 9 | 10 | 11 | def _get_git_revision(): 12 | try: 13 | revision = subprocess.check_output(REVISION_CMD.split()).strip() 14 | except (subprocess.CalledProcessError, OSError): 15 | print('Failed to execute git to get revision') 16 | return None 17 | return revision.decode('utf-8') 18 | 19 | 20 | def _linkcode_resolve(domain, info, package, url_fmt, revision): 21 | """Determine a link to online source for a class/method/function 22 | 23 | This is called by sphinx.ext.linkcode 24 | 25 | An example with a long-untouched module that everyone has 26 | >>> _linkcode_resolve('py', {'module': 'tty', 27 | ... 'fullname': 'setraw'}, 28 | ... package='tty', 29 | ...
url_fmt='http://hg.python.org/cpython/file/' 30 | ... '{revision}/Lib/{package}/{path}#L{lineno}', 31 | ... revision='xxxx') 32 | 'http://hg.python.org/cpython/file/xxxx/Lib/tty/tty.py#L18' 33 | """ 34 | 35 | if revision is None: 36 | return 37 | if domain not in ('py', 'pyx'): 38 | return 39 | if not info.get('module') or not info.get('fullname'): 40 | return 41 | 42 | class_name = info['fullname'].split('.')[0] 43 | if type(class_name) != str: 44 | # Python 2 only 45 | class_name = class_name.encode('utf-8') 46 | module = __import__(info['module'], fromlist=[class_name]) 47 | obj = attrgetter(info['fullname'])(module) 48 | 49 | try: 50 | fn = inspect.getsourcefile(obj) 51 | except Exception: 52 | fn = None 53 | if not fn: 54 | try: 55 | fn = inspect.getsourcefile(sys.modules[obj.__module__]) 56 | except Exception: 57 | fn = None 58 | if not fn: 59 | return 60 | 61 | fn = os.path.relpath(fn, 62 | start=os.path.dirname(__import__(package).__file__)) 63 | try: 64 | lineno = inspect.getsourcelines(obj)[1] 65 | except Exception: 66 | lineno = '' 67 | return url_fmt.format(revision=revision, package=package, 68 | path=fn, lineno=lineno) 69 | 70 | 71 | def make_linkcode_resolve(package, url_fmt): 72 | """Returns a linkcode_resolve function for the given URL format 73 | 74 | revision is a git commit reference (hash or name) 75 | 76 | package is the name of the root module of the package 77 | 78 | url_fmt is along the lines of ('https://github.com/USER/PROJECT/' 79 | 'blob/{revision}/{package}/' 80 | '{path}#L{lineno}') 81 | """ 82 | revision = _get_git_revision() 83 | return partial(_linkcode_resolve, revision=revision, package=package, 84 | url_fmt=url_fmt) -------------------------------------------------------------------------------- /doc/utils.rst: -------------------------------------------------------------------------------- 1 | Utility functions 2 | ================= 3 | 4 | Datasets 5 | -------- 6 | 7 | .. autosummary:: 8 | :toctree: generated/ 9 | 10 | copt.datasets.load_img1 11 | copt.datasets.load_rcv1 12 | copt.datasets.load_url 13 | copt.datasets.load_covtype 14 | copt.datasets.load_gisette 15 | copt.datasets.load_madelon 16 | 17 | Misc 18 | ---- 19 | 20 | .. autosummary:: 21 | :toctree: generated/ 22 | 23 | copt.utils.Trace 24 | -------------------------------------------------------------------------------- /examples/README.txt: -------------------------------------------------------------------------------- 1 | .. _general_examples: 2 | 3 | Example Gallery 4 | =============== 5 | 6 | Miscellaneous examples 7 | ---------------------- 8 | 9 | Miscellaneous and introductory examples for copt. -------------------------------------------------------------------------------- /examples/frank_wolfe/README.txt: -------------------------------------------------------------------------------- 1 | .. _frank_wolfe_examples: 2 | 3 | Frank-Wolfe 4 | ----------- 5 | 6 | Examples based on the Frank-Wolfe algorithm 7 | -------------------------------------------------------------------------------- /examples/frank_wolfe/plot_sfw.py: -------------------------------------------------------------------------------- 1 | """ 2 | Comparison of variants of Stochastic FW 3 | =========================================== 4 | 5 | The problem solved in this case is a L1 constrained logistic regression 6 | (sometimes referred to as sparse logistic regression). 7 | """ 8 | 9 | import copt as cp 10 | import matplotlib.pyplot as plt 11 | import numpy as np 12 | import sklearn 13 | 14 | 15 | # .. 
construct (random) dataset .. 16 | import copt 17 | 18 | n_samples, n_features = 500, 200 19 | np.random.seed(0) 20 | X = np.random.randn(n_samples, n_features) 21 | y = np.random.rand(n_samples) 22 | batch_size = n_samples // 10 23 | n_batches = n_samples // batch_size 24 | max_iter = int(1e3) 25 | freq = max(n_batches, max_iter // 1000) 26 | 27 | # .. objective function and regularizer .. 28 | f = copt.loss.LogLoss(X, y) 29 | alpha = 1. 30 | constraint = copt.constraint.L1Ball(1.) 31 | 32 | x0 = [0] * n_features 33 | x0[0] += alpha 34 | 35 | # .. callbacks to track progress .. 36 | def fw_gap(x): 37 | _, grad = f.f_grad(x) 38 | return constraint.lmo(-grad, x)[0].dot(-grad) 39 | 40 | 41 | class TraceGaps(cp.utils.Trace): 42 | def __init__(self, f=None, freq=1): 43 | super(TraceGaps, self).__init__(f, freq) 44 | self.trace_gaps = [] 45 | 46 | def __call__(self, dl): 47 | if self._counter % self.freq == 0: 48 | self.trace_gaps.append(fw_gap(dl['x'])) 49 | super(TraceGaps, self).__call__(dl) 50 | 51 | 52 | cb_sfw_SAG = TraceGaps(f, freq=freq) 53 | cb_sfw_SAG_pairwise = TraceGaps(f, freq=freq) 54 | cb_sfw_SAGA = TraceGaps(f, freq=freq) 55 | cb_sfw_mokhtari = TraceGaps(f, freq=freq) 56 | cb_sfw_lu_freund = TraceGaps(f, freq=freq) 57 | 58 | # .. run the SFW algorithm .. 59 | print("Running SAGFW Pairwise with DR step size") 60 | result_sfw_SAG_pairwise = cp.minimize_sfw( 61 | f.partial_deriv, 62 | X, 63 | y, 64 | np.zeros(n_features), 65 | constraint.lmo_pairwise, 66 | batch_size=batch_size, 67 | x0_rep=(1., 0), 68 | callback=cb_sfw_SAG_pairwise, 69 | tol=0, 70 | max_iter=max_iter, 71 | variant='SAG', 72 | step_size='DR', 73 | lipschitz=f.max_lipschitz / n_samples, 74 | lmo_variant='pairwise' 75 | ) 76 | 77 | print("Running SAGFW") 78 | result_sfw_SAG = cp.minimize_sfw( 79 | f.partial_deriv, 80 | X, 81 | y, 82 | np.zeros(n_features), 83 | constraint.lmo, 84 | batch_size=batch_size, 85 | callback=cb_sfw_SAG, 86 | tol=0, 87 | max_iter=max_iter, 88 | variant='SAG' 89 | ) 90 | 91 | print("Running SAGAFW") 92 | result_sfw_SAGA = cp.minimize_sfw( 93 | f.partial_deriv, 94 | X, 95 | y, 96 | np.zeros(n_features), 97 | constraint.lmo, 98 | batch_size=batch_size, 99 | callback=cb_sfw_SAGA, 100 | tol=0, 101 | max_iter=max_iter, 102 | variant='SAGA' 103 | ) 104 | 105 | print("Running MHK") 106 | result_sfw_mokhtari = cp.minimize_sfw( 107 | f.partial_deriv, 108 | X, 109 | y, 110 | np.zeros(n_features), 111 | constraint.lmo, 112 | batch_size=batch_size, 113 | callback=cb_sfw_mokhtari, 114 | tol=0, 115 | max_iter=max_iter, 116 | variant='MHK' 117 | ) 118 | 119 | print("Running LF") 120 | result_sfw_lu_freund = cp.minimize_sfw( 121 | f.partial_deriv, 122 | X, 123 | y, 124 | np.zeros(n_features), 125 | constraint.lmo, 126 | batch_size=batch_size, 127 | callback=cb_sfw_lu_freund, 128 | tol=0, 129 | max_iter=max_iter, 130 | variant='LF' 131 | ) 132 | # .. plot the result .. 
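# .. Editor's note: illustrative sketch, not part of the original example. ..
# .. It spells out, with plain NumPy, the quantity traced by ``fw_gap``    ..
# .. above: the Frank-Wolfe gap <-grad, s - x>, where s is the vertex of   ..
# .. the l1 ball returned by the linear minimization oracle. The helper    ..
# .. name below is hypothetical and only meant to clarify what is plotted. ..
import numpy as np


def l1_ball_fw_gap(grad, x, alpha):
    """Frank-Wolfe gap for the l1 ball of radius ``alpha`` (NumPy-only sketch)."""
    i = np.argmax(np.abs(grad))
    s = np.zeros_like(x)
    s[i] = -alpha * np.sign(grad[i])  # vertex most correlated with -grad
    return np.dot(-grad, s - x)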
133 | max_gap = max(cb_sfw_SAG.trace_gaps[0], 134 | cb_sfw_SAG_pairwise.trace_gaps[0], 135 | cb_sfw_mokhtari.trace_gaps[0], 136 | cb_sfw_lu_freund.trace_gaps[0], 137 | cb_sfw_SAGA.trace_gaps[0]) 138 | 139 | max_val = max(cb_sfw_SAG.trace_fx[0], 140 | cb_sfw_SAG_pairwise.trace_fx[0], 141 | cb_sfw_mokhtari.trace_fx[0], 142 | cb_sfw_lu_freund.trace_fx[0], 143 | cb_sfw_SAGA.trace_fx[0]) 144 | 145 | min_val = min(np.min(cb_sfw_SAG.trace_fx), 146 | np.min(cb_sfw_SAG_pairwise.trace_fx), 147 | np.min(cb_sfw_mokhtari.trace_fx), 148 | np.min(cb_sfw_lu_freund.trace_fx), 149 | np.min(cb_sfw_SAGA.trace_fx), 150 | ) 151 | 152 | fig, (ax1, ax2) = plt.subplots(2, sharex=True) 153 | fig.suptitle('Stochastic Frank-Wolfe') 154 | 155 | ax1.plot(freq * batch_size * np.arange(len(cb_sfw_SAG.trace_gaps)), np.array(cb_sfw_SAG.trace_gaps) / max_gap, label="SAG") 156 | ax1.plot(freq * batch_size * np.arange(len(cb_sfw_SAG_pairwise.trace_gaps)), np.array(cb_sfw_SAG_pairwise.trace_gaps) / max_gap, label="SAG Pairwise") 157 | ax1.plot(freq * batch_size * np.arange(len(cb_sfw_SAGA.trace_gaps)), np.array(cb_sfw_SAGA.trace_gaps) / max_gap, label="SAGA") 158 | ax1.plot(freq * batch_size * np.arange(len(cb_sfw_mokhtari.trace_gaps)), np.array(cb_sfw_mokhtari.trace_gaps) / max_gap, label='Mokhtari et al. (2018)') 159 | ax1.plot(freq * batch_size * np.arange(len(cb_sfw_lu_freund.trace_gaps)), np.array(cb_sfw_lu_freund.trace_gaps) / max_gap, label='Lu and Freund (2018)') 160 | ax1.set_ylabel("Relative FW gap", fontweight="bold") 161 | ax1.set_yscale('log') 162 | ax1.set_xscale('log') 163 | ax1.grid(True) 164 | 165 | ax2.plot(freq * batch_size * np.arange(len(cb_sfw_SAG.trace_fx)), (np.array(cb_sfw_SAG.trace_fx) - min_val) / (max_val - min_val), label="SAG") 166 | ax2.plot(freq * batch_size * np.arange(len(cb_sfw_SAG_pairwise.trace_fx)), (np.array(cb_sfw_SAG_pairwise.trace_fx) - min_val) / (max_val - min_val), label="SAG Pairwise") 167 | ax2.plot(freq * batch_size * np.arange(len(cb_sfw_SAGA.trace_fx)), (np.array(cb_sfw_SAGA.trace_fx) - min_val) / (max_val - min_val), label="SAGA") 168 | ax2.plot(freq * batch_size * np.arange(len(cb_sfw_mokhtari.trace_fx)), (np.array(cb_sfw_mokhtari.trace_fx) - min_val) / (max_val - min_val), label='Mokhtari et al. (2018)') 169 | ax2.plot(freq * batch_size * np.arange(len(cb_sfw_lu_freund.trace_fx)), (np.array(cb_sfw_lu_freund.trace_fx) - min_val) / (max_val - min_val), label='Lu and Freund (2018)') 170 | ax2.set_ylabel("Relative suboptimality", fontweight="bold") 171 | ax2.set_xlabel("Number of gradient evaluations", fontweight="bold") 172 | ax2.set_yscale('log') 173 | ax2.set_xscale("log") 174 | ax2.grid(True) 175 | 176 | plt.xlim(1e4, 4e8) 177 | plt.legend() 178 | plt.show() 179 | -------------------------------------------------------------------------------- /examples/frank_wolfe/plot_sfw_real_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Comparison of variants of Stochastic FW on real data 3 | ==================================================== 4 | 5 | The problem solved in this case is a L1 constrained logistic regression 6 | (sometimes referred to as sparse logistic regression). 7 | """ 8 | 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | import copt as cp 12 | 13 | # .. Load dataset .. 
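# .. Editor's note: illustrative sketch, not part of the original example.  ..
# .. The problem mentioned in the docstring is, written out explicitly,     ..
# .. min_{||x||_1 <= alpha} (1/n) sum_i log(1 + exp(-y_i <a_i, x>)), shown  ..
# .. here with labels y_i in {-1, +1}; copt.loss.LogLoss (used below) may   ..
# .. follow a different label convention, so treat this only as intuition.  ..
import numpy as np


def l1_constrained_logloss(x, A, y, alpha):
    """Objective value plus a feasibility flag for the l1 constraint (sketch)."""
    margins = -y * (A @ x)
    value = np.mean(np.logaddexp(0.0, margins))  # log(1 + exp(m)), computed stably
    feasible = np.sum(np.abs(x)) <= alpha
    return value, feasible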
14 | import copt.constraint 15 | import copt.loss 16 | 17 | np.random.seed(0) 18 | X, y = cp.datasets.load_rcv1("train") 19 | dataset_name = "RCV1" 20 | n_samples, n_features = X.shape 21 | batch_size = 500 22 | max_iter = int(1e4) 23 | freq = max(n_samples // (batch_size * 2), 1) 24 | 25 | # .. objective function and regularizer .. 26 | f = copt.loss.LogLoss(X, y) 27 | constraint = copt.constraint.L1Ball(2e3) 28 | 29 | # .. callbacks to track progress .. 30 | def fw_gap(x): 31 | _, grad = f.f_grad(x) 32 | return constraint.lmo(-grad, x)[0].dot(-grad) 33 | 34 | 35 | class TraceGaps(cp.utils.Trace): 36 | def __init__(self, f=None, freq=1): 37 | super(TraceGaps, self).__init__(f, freq) 38 | self.trace_gaps = [] 39 | 40 | def __call__(self, dl): 41 | if self._counter % self.freq == 0: 42 | self.trace_gaps.append(fw_gap(dl['x'])) 43 | super(TraceGaps, self).__call__(dl) 44 | 45 | 46 | cb_SAG = TraceGaps(f, freq=freq) 47 | cb_MHK = TraceGaps(f, freq=freq) 48 | cb_LF = TraceGaps(f, freq=freq) 49 | 50 | 51 | # .. run the SFW algorithm .. 52 | print("Running SAGFW") 53 | result_SAG = cp.minimize_sfw( 54 | f.partial_deriv, 55 | X, 56 | y, 57 | np.zeros(n_features), 58 | constraint.lmo, 59 | batch_size, 60 | callback=cb_SAG, 61 | tol=0, 62 | max_iter=max_iter, 63 | variant='SAG' 64 | ) 65 | 66 | print("Running MHK") 67 | result_MHK = cp.minimize_sfw( 68 | f.partial_deriv, 69 | X, 70 | y, 71 | np.zeros(n_features), 72 | constraint.lmo, 73 | batch_size, 74 | callback=cb_MHK, 75 | tol=0, 76 | max_iter=max_iter, 77 | variant='MHK' 78 | ) 79 | 80 | print("Running LF") 81 | result_LF = cp.minimize_sfw( 82 | f.partial_deriv, 83 | X, 84 | y, 85 | np.zeros(n_features), 86 | constraint.lmo, 87 | batch_size, 88 | callback=cb_LF, 89 | tol=0, 90 | max_iter=max_iter, 91 | variant='LF' 92 | ) 93 | 94 | print("Plotting...") 95 | # .. plot the result .. 96 | max_gap = max(cb_SAG.trace_gaps[0], 97 | cb_MHK.trace_gaps[0], 98 | cb_LF.trace_gaps[0], 99 | ) 100 | 101 | max_val = max(np.max(cb_SAG.trace_fx), 102 | np.max(cb_MHK.trace_fx), 103 | np.max(cb_LF.trace_fx), 104 | ) 105 | 106 | min_val = min(np.min(cb_SAG.trace_fx), 107 | np.min(cb_MHK.trace_fx), 108 | np.min(cb_LF.trace_fx), 109 | ) 110 | 111 | 112 | fig, (ax1, ax2) = plt.subplots(2, sharex=True) 113 | 114 | ax1.set_title("Sparse Logistic Regression -- {}".format(dataset_name), fontweight="bold") 115 | ax1.plot(batch_size * freq * np.arange(len(cb_LF.trace_gaps)), np.array(cb_LF.trace_gaps) / max_gap, label='SFW -- Lu and Freund (2020)') 116 | ax1.plot(batch_size * freq * np.arange(len(cb_MHK.trace_gaps)), np.array(cb_MHK.trace_gaps) / max_gap, label='SFW -- Mokhtari et al. (2020)') 117 | ax1.plot(batch_size * freq * np.arange(len(cb_SAG.trace_gaps)), np.array(cb_SAG.trace_gaps) / max_gap, label="SFW -- Negiar et al. (2020)") 118 | ax1.set_ylabel("Relative FW gap", fontweight="bold") 119 | ax1.set_xscale('log') 120 | ax1.set_yscale('log') 121 | ax1.grid() 122 | 123 | 124 | ax2.plot(batch_size * freq * np.arange(len(cb_LF.trace_fx)), (np.array(cb_LF.trace_fx) - min_val) / (max_val - min_val), label='SFW -- Lu and Freund (2020)') 125 | ax2.plot(batch_size * freq * np.arange(len(cb_MHK.trace_fx)), (np.array(cb_MHK.trace_fx) - min_val) / (max_val - min_val), label='SFW -- Mokhtari et al. (2018)') 126 | ax2.plot(batch_size * freq * np.arange(len(cb_SAG.trace_fx)), (np.array(cb_SAG.trace_fx) - min_val) / (max_val - min_val), label="SFW -- Négiar et al. 
(2020)") 127 | ax2.set_ylabel("Relative suboptimality", fontweight="bold") 128 | ax2.set_xlabel("Number of gradient evaluations", fontweight="bold") 129 | ax2.set_xscale("log") 130 | ax2.set_yscale("log") 131 | ax2.grid() 132 | plt.legend() 133 | plt.show() 134 | print("Done.") -------------------------------------------------------------------------------- /examples/frank_wolfe/plot_sparse_benchmark.py: -------------------------------------------------------------------------------- 1 | # python3 2 | """ 3 | Benchmark of Frank-Wolfe variants for sparse logistic regression 4 | ================================================================ 5 | 6 | Comparison of different Frank-Wolfe variants on various 7 | problems with a logistic regression loss (:meth:`copt.utils.LogLoss`) 8 | and a L1 ball constraint (:meth:`copt.utils.L1Ball`). 9 | """ 10 | import matplotlib.pyplot as plt 11 | import numpy as np 12 | import copt as cp 13 | 14 | # .. datasets and their loading functions .. 15 | import copt.constraint 16 | import copt.loss 17 | 18 | datasets = [ 19 | ("Gisette", cp.datasets.load_gisette, 6e3), 20 | ("RCV1", cp.datasets.load_rcv1, 2e4), 21 | ("Madelon", cp.datasets.load_madelon, 20.0), 22 | ("Covtype", cp.datasets.load_covtype, 200.0), 23 | ] 24 | 25 | 26 | variants_fw = [ 27 | ["backtracking", "adaptive step-size"], 28 | ["DR", "Lipschitz step-size"], 29 | ] 30 | 31 | for dataset_title, load_data, alpha in datasets: 32 | plt.figure() 33 | print("Running on the %s dataset" % dataset_title) 34 | 35 | X, y = load_data() 36 | n_samples, n_features = X.shape 37 | 38 | l1_ball = copt.constraint.L1Ball(alpha) 39 | f = copt.loss.LogLoss(X, y) 40 | x0 = np.zeros(n_features) 41 | 42 | for step, label in variants_fw: 43 | 44 | cb = cp.utils.Trace(f) 45 | sol = cp.minimize_frank_wolfe( 46 | f.f_grad, x0, l1_ball.lmo, callback=cb, step=step, lipschitz=f.lipschitz 47 | ) 48 | 49 | plt.plot(cb.trace_time, cb.trace_fx, label=label, markevery=10) 50 | 51 | print("Sparsity of solution: %s" % np.mean(np.abs(sol.x) > 1e-8)) 52 | plt.legend() 53 | plt.xlabel("Time (in seconds)") 54 | plt.ylabel("Objective function") 55 | plt.title(dataset_title) 56 | plt.tight_layout() # otherwise the right y-label is slightly clipped 57 | plt.xlim((0, 0.7 * cb.trace_time[-1])) # for aesthetics 58 | plt.grid() 59 | plt.show() 60 | -------------------------------------------------------------------------------- /examples/frank_wolfe/plot_sparse_benchmark_pairwise.py: -------------------------------------------------------------------------------- 1 | # python3 2 | """ 3 | Benchmark of Pairwise Frank-Wolfe variants for sparse logistic regression 4 | ========================================================================= 5 | 6 | Speed of convergence of different Frank-Wolfe variants on various 7 | problems with a logistic regression loss (:meth:`copt.utils.LogLoss`) 8 | and a L1 ball constraint (:meth:`copt.utils.L1Ball`). 9 | """ 10 | import matplotlib.pyplot as plt 11 | import numpy as np 12 | import copt as cp 13 | 14 | # .. datasets and their loading functions .. 15 | # .. alpha is the regularization parameter .. 16 | # .. which has been chosen to give 10% feature sparsity .. 
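# .. Editor's note: illustrative sketch, not part of the original benchmark. ..
# .. The "DR" variant benchmarked below uses the Demyanov-Rubinov step-size  ..
# .. gamma = <-grad, d> / (L ||d||^2), clipped to [0, gamma_max], where d is ..
# .. the update direction returned by the (pairwise) LMO and L is the        ..
# .. Lipschitz constant passed via ``lipschitz=``. The helper below is       ..
# .. hypothetical and shown only to make that step-size concrete.            ..
import numpy as np


def demyanov_rubinov_step(grad, direction, lipschitz, gamma_max=1.0):
    """Closed-form step-size minimizing a quadratic upper bound (sketch)."""
    denom = lipschitz * np.dot(direction, direction)
    if denom == 0.0:
        return 0.0
    return float(np.clip(-np.dot(grad, direction) / denom, 0.0, gamma_max))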
17 | import copt.constraint 18 | import copt.loss 19 | 20 | datasets = ( 21 | { 22 | "name": "madelon", 23 | "loader": cp.datasets.load_madelon, 24 | "alpha": 1e4, 25 | "max_iter": 5000, 26 | "f_star": 0.0, 27 | }, 28 | { 29 | "name": "gisette", 30 | "loader": cp.datasets.load_gisette, 31 | "alpha": 1e4, 32 | "max_iter": 5000, 33 | "f_star": 2.293654421822428, 34 | }, 35 | { 36 | "name": "covtype", 37 | "loader": cp.datasets.load_covtype, 38 | "alpha": 1e4, 39 | "max_iter": 5000, 40 | "f_star": 0, 41 | }, 42 | { 43 | "name": "RCV1", 44 | "loader": cp.datasets.load_rcv1, 45 | "alpha": 1e3, 46 | "max_iter": 5000, 47 | "f_star": 0.3114744279728717, 48 | }, 49 | ) 50 | 51 | 52 | variants_fw = [ 53 | ["backtracking", "backtracking line-search"], 54 | ["DR", "Lipschitz step-size"], 55 | ] 56 | 57 | for d in datasets: 58 | plt.figure() 59 | print(f"Running on the {d['name']} dataset.") 60 | 61 | X, y = d["loader"]() 62 | print(X.shape) 63 | n_samples, n_features = X.shape 64 | 65 | l1_ball = copt.constraint.L1Ball(d["alpha"]) 66 | f = copt.loss.LogLoss(X, y) 67 | x0 = np.zeros(n_features) 68 | x0[0] = d["alpha"] # start from a (random) vertex 69 | 70 | for step, label in variants_fw: 71 | 72 | cb = cp.utils.Trace(f) 73 | sol = cp.minimize_frank_wolfe( 74 | f.f_grad, 75 | x0, 76 | l1_ball.lmo_pairwise, 77 | variant='pairwise', 78 | x0_rep=(1., 0), 79 | callback=cb, 80 | step=step, 81 | lipschitz=f.lipschitz, 82 | max_iter=d["max_iter"], 83 | verbose=True, 84 | tol=0, 85 | ) 86 | 87 | plt.plot( 88 | cb.trace_time, 89 | np.array(cb.trace_fx) - d["f_star"], 90 | label=label, 91 | markevery=10, 92 | ) 93 | 94 | print("Sparsity of solution: %s" % np.mean(np.abs(sol.x) > 1e-8)) 95 | print(f(sol.x)) 96 | plt.legend() 97 | plt.xlabel("Time (in seconds)") 98 | plt.ylabel("Objective function") 99 | plt.yscale("log") 100 | plt.title(d["name"]) 101 | plt.tight_layout() # otherwise the right y-label is slightly clipped 102 | plt.grid() 103 | plt.show() 104 | -------------------------------------------------------------------------------- /examples/frank_wolfe/plot_vertex_overlap.py: -------------------------------------------------------------------------------- 1 | # python3 2 | """ 3 | Update Direction Overlap in Frank-Wolfe 4 | ======================================== 5 | 6 | This example quantifies how many times the Frank-Wolfe algorithm selects 7 | the same extremal vertex (which will determine the update direction) twice 8 | in a row. Selecting the same vertex twice in a row is symptomatic of a poor 9 | step-size, as it implies that the last two updates could have been replaced 10 | by a single update with larger step-size. 
11 | """ 12 | import copt as cp 13 | import matplotlib.pyplot as plt 14 | from matplotlib.ticker import MaxNLocator 15 | import numpy as np 16 | 17 | # datasets and their respective loading functions 18 | import copt.constraint 19 | import copt.loss 20 | 21 | datasets = [ 22 | ("Gisette", cp.datasets.load_gisette), 23 | ("RCV1", cp.datasets.load_rcv1), 24 | ("Madelon", cp.datasets.load_madelon), 25 | ("Covtype", cp.datasets.load_covtype), 26 | ] 27 | 28 | 29 | fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 5)) 30 | for ax, (dataset_title, load_data) in zip(axes.ravel(), datasets): 31 | print("Running on the %s dataset" % dataset_title) 32 | 33 | X, y = load_data() 34 | n_samples, n_features = X.shape 35 | 36 | l1_ball = copt.constraint.L1Ball(n_features / 2.0) 37 | f = copt.loss.LogLoss(X, y) 38 | x0 = np.zeros(n_features) 39 | 40 | for i, (step, label) in enumerate( 41 | [["backtracking", "backtracking"], ["DR", "DR step-size"]] 42 | ): 43 | print("Running %s variant" % label) 44 | st_prev = [] 45 | overlap = [] 46 | 47 | def trace(kw): 48 | """Store vertex overlap during execution of the algorithm.""" 49 | s_t = kw["update_direction"] + kw["x"] 50 | if st_prev: 51 | # check if the vertex of this and the previous iterate 52 | # coincide. Since these might be sparse vectors, we use 53 | # sparse.linalg.norm to make the comparison 54 | prev_overlap = overlap[-1] 55 | if np.linalg.norm(st_prev[0] - s_t) == 0: 56 | overlap.append(prev_overlap + 1) 57 | else: 58 | overlap.append(prev_overlap) 59 | st_prev[0] = s_t 60 | else: 61 | overlap.append(0) 62 | st_prev.append(s_t) 63 | 64 | cp.minimize_frank_wolfe( 65 | f.f_grad, 66 | x0, 67 | l1_ball.lmo, 68 | callback=trace, 69 | max_iter=int(1e4), 70 | step=step, 71 | verbose=True, 72 | lipschitz=f.lipschitz, 73 | ) 74 | ax.plot(overlap, label=label) 75 | ax.yaxis.set_major_locator(MaxNLocator(integer=True)) 76 | ax.legend() 77 | ax.set_xlabel("number of iterations") 78 | ax.set_ylabel("LMO overlap") 79 | ax.set_title(dataset_title) 80 | fig.tight_layout() # otherwise the right y-label is slightly clipped 81 | ax.grid() 82 | # plt.legend() 83 | plt.show() 84 | -------------------------------------------------------------------------------- /examples/plot_accelerated.py: -------------------------------------------------------------------------------- 1 | """ 2 | Accelerated gradient descent 3 | ============================ 4 | 5 | Speed of convergence comparison between gradient descent 6 | and Nesterov acceleration on a logistic regression problem. 7 | """ 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | import copt as cp 11 | 12 | # .. construct (random) dataset .. 13 | import copt.loss 14 | 15 | n_samples, n_features = 1000, 200 16 | np.random.seed(0) 17 | X = np.random.randn(n_samples, n_features) 18 | y = np.random.rand(n_samples) 19 | 20 | f = copt.loss.LogLoss(X, y) 21 | step_size = 1.0 / f.lipschitz 22 | 23 | cb_pgd = cp.utils.Trace(f) 24 | result_pgd = cp.minimize_proximal_gradient( 25 | f.f_grad, 26 | np.zeros(n_features), 27 | step=lambda x: step_size, 28 | callback=cb_pgd, 29 | tol=0, 30 | jac=True, 31 | accelerated=False, 32 | ) 33 | 34 | cb_apgd = cp.utils.Trace(f) 35 | result_apgd = cp.minimize_proximal_gradient( 36 | f.f_grad, 37 | np.zeros(n_features), 38 | step=lambda x: step_size, 39 | callback=cb_apgd, 40 | tol=0, 41 | jac=True, 42 | accelerated=True, 43 | ) 44 | 45 | 46 | # .. plot the result .. 
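# .. Editor's note: illustrative sketch, not part of the original example.  ..
# .. A minimal NumPy version of the FISTA-style momentum recursion that the ..
# .. accelerated variant above is based on (no prox, fixed step-size); the  ..
# .. actual copt implementation differs in details such as line search.     ..
import numpy as np


def accelerated_gradient(grad, x0, step_size, max_iter=100):
    x, y, t = x0.copy(), x0.copy(), 1.0
    for _ in range(max_iter):
        x_next = y - step_size * grad(y)                   # gradient step at y
        t_next = 0.5 * (1.0 + np.sqrt(1.0 + 4.0 * t ** 2))
        y = x_next + ((t - 1.0) / t_next) * (x_next - x)   # momentum extrapolation
        x, t = x_next, t_next
    return x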
47 | fmin = min(np.min(cb_pgd.trace_fx), np.min(cb_apgd.trace_fx)) 48 | plt.title("Comparison of full gradient optimizers") 49 | plt.plot(cb_apgd.trace_fx - fmin, lw=4, label="accelerated gradient descent") 50 | plt.plot(cb_pgd.trace_fx - fmin, lw=4, label="gradient descent") 51 | plt.ylabel("Function suboptimality", fontweight="bold") 52 | plt.xlabel("gradient evaluations", fontweight="bold") 53 | plt.yscale("log") 54 | plt.ylim(ymin=1e-16) 55 | plt.xlim((0, 150)) 56 | plt.legend() 57 | plt.grid() 58 | plt.show() 59 | -------------------------------------------------------------------------------- /examples/plot_group_lasso.py: -------------------------------------------------------------------------------- 1 | """ 2 | Group Lasso regularization 3 | ========================== 4 | 5 | This example solves an inverse problem where the ground truth 6 | coefficients (in orange) follow a group structure. In blue are 7 | the recovered coefficients for group lasso with different values 8 | of the regularization parameter. 9 | 10 | 11 | The group lasso regularization enters the optimization through 12 | its proximal operator, which is implemented in copt through the 13 | function prox of object :meth:`copt.utils.GroupL1`. 14 | 15 | """ 16 | import copt as cp 17 | import matplotlib.pyplot as plt 18 | import numpy as np 19 | from scipy import sparse 20 | 21 | import copt.loss 22 | import copt.penalty 23 | 24 | np.random.seed(0) 25 | 26 | # .. generate some data .. 27 | n_samples, n_features = 100, 100 28 | groups = [np.arange(10 * i, 10 * i + 10) for i in range(10)] 29 | 30 | # .. construct a ground truth vector in which .. 31 | # .. group 4 and 5 are nonzero .. 32 | ground_truth = np.zeros(n_features) 33 | ground_truth[groups[4]] = 1 34 | ground_truth[groups[5]] = 0.5 35 | 36 | max_iter = 5000 37 | print("#features", n_features) 38 | 39 | A = sparse.rand(n_samples, n_features, density=0.2) 40 | sigma = 1.0 41 | b = A.dot(ground_truth) + sigma * np.random.randn(n_samples) 42 | 43 | np.random.seed(0) 44 | n_samples = n_features 45 | 46 | # .. compute the step-size .. 47 | f = copt.loss.SquareLoss(A, b) 48 | step_size = 1.0 / f.lipschitz 49 | 50 | # .. run the solver for different values .. 51 | # .. of the regularization parameter beta .. 52 | all_betas = [0, 1e-2, 1e-1, 0.2] 53 | all_trace_ls, all_trace_nols = [], [] 54 | out_img = [] 55 | for i, beta in enumerate(all_betas): 56 | print("beta = %s" % beta) 57 | G1 = copt.penalty.GroupL1(beta, groups) 58 | 59 | def loss(x): 60 | return f(x) + G1(x) 61 | 62 | x0 = np.zeros(n_features) 63 | pgd = cp.minimize_proximal_gradient( 64 | f.f_grad, 65 | x0, 66 | G1.prox, 67 | jac=True, 68 | max_iter=max_iter, 69 | tol=1e-10, 70 | trace_certificate=True, 71 | ) 72 | out_img.append(pgd.x) 73 | 74 | 75 | # .. plot the results .. 
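# .. Editor's note: illustrative sketch, not part of the original example.    ..
# .. The group lasso penalty enters the solver above only through its         ..
# .. proximal operator, which for non-overlapping groups is block             ..
# .. soft-thresholding: each group is shrunk towards zero, and set exactly to ..
# .. zero when its norm falls below beta * step_size. The NumPy helper below  ..
# .. is a conceptual stand-in for copt.penalty.GroupL1.prox, not its actual   ..
# .. implementation.                                                          ..
import numpy as np


def group_soft_threshold(x, groups, beta, step_size):
    out = x.copy()
    for g in groups:
        norm = np.linalg.norm(x[g])
        shrink = max(0.0, 1.0 - beta * step_size / norm) if norm > 0 else 0.0
        out[g] = shrink * x[g]
    return out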
76 | fig, ax = plt.subplots(2, 4, sharey=False) 77 | xlim = [0.02, 0.02, 0.1] 78 | markevery = [1000, 1000, 100, 100] 79 | for i, beta in enumerate(all_betas): 80 | ax[0, i].set_title("regularization=%s" % beta) 81 | ax[0, i].set_title("$regularization=%s" % beta) 82 | ax[0, i].plot(out_img[i]) 83 | ax[0, i].plot(ground_truth) 84 | ax[0, i].set_ylim((-0.5, 1.5)) 85 | ax[0, i].set_xticks(()) 86 | ax[0, i].set_yticks(()) 87 | 88 | plot_tos, = ax[1, i].plot( 89 | pgd.trace_certificate, lw=3, marker="o", markevery=20, markersize=10 90 | ) 91 | 92 | ax[1, i].set_xlabel("Iterations") 93 | ax[1, i].set_yscale("log") 94 | ax[1, i].set_ylim((1e-8, None)) 95 | ax[1, i].grid(True) 96 | 97 | 98 | ax[1, 0].set_ylabel("certificate") 99 | plt.show() 100 | -------------------------------------------------------------------------------- /examples/plot_jax_copt.py: -------------------------------------------------------------------------------- 1 | """ 2 | Combining COPT with JAX 3 | ======================= 4 | 5 | This example shows how `JAX `_ 6 | can be used within COPT to compute the gradients of the 7 | objective function. 8 | """ 9 | import jax 10 | from jax import numpy as np 11 | import numpy as onp 12 | import matplotlib.pyplot as plt 13 | from sklearn import datasets 14 | import copt as cp 15 | 16 | # .. construct (random) dataset .. 17 | import copt.penalty 18 | 19 | X, y = datasets.make_regression() 20 | n_samples, n_features = X.shape 21 | 22 | 23 | def loss(w): 24 | """Squared error loss.""" 25 | z = np.dot(X, w) - y 26 | return np.sum(z * z) / n_samples 27 | 28 | 29 | # .. use JAX to compute the gradient of loss value_and_grad .. 30 | # .. returns both the gradient and the objective, which is .. 31 | # .. the format that COPT accepts .. 32 | f_grad = jax.value_and_grad(loss) 33 | 34 | w0 = onp.zeros(n_features) 35 | 36 | l1_ball = copt.penalty.L1Norm(0.1) 37 | cb = cp.utils.Trace(lambda x: loss(x) + l1_ball(x)) 38 | sol = cp.minimize_proximal_gradient( 39 | f_grad, w0, prox=l1_ball.prox, callback=cb, jac=True 40 | ) 41 | plt.plot(cb.trace_fx, lw=3) 42 | plt.yscale("log") 43 | plt.xlabel("# Iterations") 44 | plt.ylabel("Objective value") 45 | plt.grid() 46 | plt.show() 47 | -------------------------------------------------------------------------------- /examples/plot_saga_vs_svrg.py: -------------------------------------------------------------------------------- 1 | """ 2 | SAGA vs SVRG 3 | =========================================== 4 | 5 | A comparison between two variance-reduced stochastic gradient methods: 6 | SAGA (implemented in :func:`copt.minimize_saga`) and SVRG (implemented in 7 | :func:`copt.minimize_svrg`). The problem solved in this case is the sum of a 8 | logistic regression and an L1 norm (sometimes referred to as sparse logistic) 9 | """ 10 | import copt as cp 11 | import matplotlib.pyplot as plt 12 | import numpy as np 13 | 14 | # .. construct (random) dataset .. 15 | import copt.loss 16 | import copt.penalty 17 | 18 | n_samples, n_features = 1000, 200 19 | np.random.seed(0) 20 | X = np.random.randn(n_samples, n_features) 21 | y = np.random.rand(n_samples) 22 | 23 | # .. objective function and regularizer .. 24 | f = copt.loss.LogLoss(X, y) 25 | g = copt.penalty.L1Norm(1.0 / n_samples) 26 | 27 | # .. callbacks to track progress .. 28 | cb_saga = cp.utils.Trace(lambda x: f(x) + g(x)) 29 | cb_svrg = cp.utils.Trace(lambda x: f(x) + g(x)) 30 | 31 | # .. run the SAGA and SVRG algorithms .. 
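# .. Editor's note: illustrative sketch, not part of the original example.     ..
# .. Both methods replace the full gradient with a cheap unbiased estimate:    ..
# ..   SVRG: g_i(x) - g_i(x_snapshot) + full_grad(x_snapshot)                  ..
# ..   SAGA: g_i(x) - memory[i] + mean(memory), then memory[i] is updated.     ..
# .. The toy least-squares snippet below only illustrates these formulas;      ..
# .. copt's implementations additionally handle the prox, sparsity and         ..
# .. step-sizes.                                                               ..
import numpy as np

rng = np.random.RandomState(0)
A_toy = rng.randn(10, 3)
b_toy = rng.randn(10)
x_toy = np.zeros(3)


def toy_grad_i(x, i):
    """Per-sample gradient of the least-squares loss 0.5 * (a_i x - b_i)^2."""
    return (A_toy[i] @ x - b_toy[i]) * A_toy[i]


memory = np.array([toy_grad_i(x_toy, i) for i in range(10)])   # SAGA gradient table
snapshot, full_grad = x_toy.copy(), memory.mean(axis=0)        # SVRG snapshot
i = rng.randint(10)
svrg_estimate = toy_grad_i(x_toy, i) - toy_grad_i(snapshot, i) + full_grad
saga_estimate = toy_grad_i(x_toy, i) - memory[i] + memory.mean(axis=0)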
32 | step_size = 1.0 / (3 * f.max_lipschitz) 33 | result_saga = cp.minimize_saga( 34 | f.partial_deriv, 35 | X, 36 | y, 37 | np.zeros(n_features), 38 | prox=g.prox_factory(n_features), 39 | step_size=step_size, 40 | callback=cb_saga, 41 | tol=0, 42 | max_iter=100, 43 | ) 44 | 45 | result_svrg = cp.minimize_svrg( 46 | f.partial_deriv, 47 | X, 48 | y, 49 | np.zeros(n_features), 50 | prox=g.prox_factory(n_features), 51 | step_size=step_size, 52 | callback=cb_svrg, 53 | tol=0, 54 | max_iter=100, 55 | ) 56 | 57 | 58 | # .. plot the result .. 59 | fmin = min(np.min(cb_saga.trace_fx), np.min(cb_svrg.trace_fx)) 60 | plt.title("Comparison of full gradient optimizers") 61 | plt.plot(cb_saga.trace_fx - fmin, lw=4, label="SAGA") 62 | # .. for SVRG we multiply the number of iterations by two to .. 63 | # .. account for computation of the snapshot gradient .. 64 | plt.plot( 65 | 2 * np.arange(len(cb_svrg.trace_fx)), cb_svrg.trace_fx - fmin, lw=4, label="SVRG" 66 | ) 67 | plt.ylabel("Function suboptimality", fontweight="bold") 68 | plt.xlabel("number of gradient evaluations", fontweight="bold") 69 | plt.yscale("log") 70 | plt.ylim(ymin=1e-16) 71 | plt.xlim((0, 50)) 72 | plt.legend() 73 | plt.grid() 74 | plt.show() 75 | -------------------------------------------------------------------------------- /examples/proximal_splitting/README.txt: -------------------------------------------------------------------------------- 1 | .. _proximal_splitting_examples: 2 | 3 | Proximal Splitting 4 | ------------------ 5 | 6 | Examples that use proximal splitting methods. -------------------------------------------------------------------------------- /examples/proximal_splitting/data/blur_matrix.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openopt/copt/c0d5d46ae709f77b7dc1fc692bbe476aa63f029b/examples/proximal_splitting/data/blur_matrix.npz -------------------------------------------------------------------------------- /examples/proximal_splitting/plot_overlapping_group_lasso.py: -------------------------------------------------------------------------------- 1 | """ 2 | Group lasso with overlap 3 | ======================== 4 | 5 | Comparison of solvers for a least squares with 6 | overlapping group lasso regularization. 7 | 8 | References 9 | ---------- 10 | This example is modeled after the experiments in `Adaptive Three Operator Splitting `_, Appendix E.3. 11 | """ 12 | import copt as cp 13 | import matplotlib.pyplot as plt 14 | import numpy as np 15 | from sklearn import preprocessing 16 | 17 | import copt.loss 18 | import copt.penalty 19 | 20 | np.random.seed(0) 21 | 22 | n_samples, n_features = 100, 1002 23 | 24 | # .. generate some data .. 25 | # .. the first set of blocks is 26 | groups = [np.arange(8 * i, 8 * i + 10) for i in range(125)] 27 | ground_truth = np.zeros(n_features) 28 | g = np.random.randint(0, len(groups), 10) 29 | for i in g: 30 | ground_truth[groups[i]] = np.random.randn() 31 | 32 | A = np.random.randn(n_samples, n_features) 33 | p = 0.95 # create a matrix with correlations between features 34 | for i in range(1, n_features): 35 | A[:, i] = p * A[:, i] + (1 - p) * A[:, i-1] 36 | A[:, 0] /= np.sqrt(1 - p ** 2) 37 | A = preprocessing.StandardScaler().fit_transform(A) 38 | b = A.dot(ground_truth) + np.random.randn(n_samples) 39 | 40 | # make labels in {0, 1} 41 | b = np.sign(b) 42 | b = (b + 1) // 2 43 | 44 | 45 | # .. compute the step-size .. 46 | max_iter = 5000 47 | f = copt.loss.LogLoss(A, b) 48 | step_size = 1. 
/ f.lipschitz 49 | 50 | # .. run the solver for different values .. 51 | # .. of the regularization parameter beta .. 52 | all_betas = np.logspace(-4, -1, 4) 53 | all_trace_ls, all_trace_nols, all_trace_pdhg_nols, all_trace_pdhg = [], [], [], [] 54 | all_trace_ls_time, all_trace_nols_time, all_trace_pdhg_nols_time, all_trace_pdhg_time = [], [], [], [] 55 | out_img = [] 56 | for i, beta in enumerate(all_betas): 57 | print('beta = %s' % beta) 58 | G1 = copt.penalty.GroupL1(beta, groups[::2]) 59 | G2 = copt.penalty.GroupL1(beta, groups[1::2]) 60 | 61 | def loss(x): 62 | return f(x) + G1(x) + G2(x) 63 | 64 | cb_tosls = cp.utils.Trace() 65 | x0 = np.zeros(n_features) 66 | tos_ls = cp.minimize_three_split( 67 | f.f_grad, x0, G1.prox, G2.prox, step_size=10 * step_size, 68 | max_iter=max_iter, tol=1e-14, verbose=1, 69 | callback=cb_tosls, h_Lipschitz=beta) 70 | trace_ls = np.array([loss(x) for x in cb_tosls.trace_x]) 71 | all_trace_ls.append(trace_ls) 72 | all_trace_ls_time.append(cb_tosls.trace_time) 73 | 74 | cb_tos = cp.utils.Trace() 75 | x0 = np.zeros(n_features) 76 | tos = cp.minimize_three_split( 77 | f.f_grad, x0, G1.prox, G2.prox, 78 | step_size=step_size, 79 | max_iter=max_iter, tol=1e-14, verbose=1, 80 | line_search=True, callback=cb_tos) 81 | trace_nols = np.array([loss(x) for x in cb_tos.trace_x]) 82 | all_trace_nols.append(trace_nols) 83 | all_trace_nols_time.append(cb_tos.trace_time) 84 | out_img.append(tos.x) 85 | 86 | cb_pdhg = cp.utils.Trace() 87 | x0 = np.zeros(n_features) 88 | pdhg = cp.minimize_primal_dual( 89 | f.f_grad, x0, G1.prox, G2.prox, 90 | callback=cb_pdhg, max_iter=max_iter, 91 | step_size=step_size, 92 | step_size2=(1. / step_size) / 2, tol=0, line_search=True) 93 | trace_pdhg = np.array([loss(x) for x in cb_pdhg.trace_x]) 94 | all_trace_pdhg.append(trace_pdhg) 95 | all_trace_pdhg_time.append(cb_pdhg.trace_time) 96 | 97 | cb_pdhg_nols = cp.utils.Trace() 98 | x0 = np.zeros(n_features) 99 | pdhg_nols = cp.minimize_primal_dual( 100 | f.f_grad, x0, G1.prox, G2.prox, 101 | callback=cb_pdhg_nols, max_iter=max_iter, 102 | step_size=step_size, 103 | step_size2=(1. / step_size) / 2, tol=0, line_search=False) 104 | trace_pdhg_nols = np.array([loss(x) for x in cb_pdhg_nols.trace_x]) 105 | all_trace_pdhg_nols.append(trace_pdhg_nols) 106 | all_trace_pdhg_nols_time.append(cb_pdhg_nols.trace_time) 107 | 108 | 109 | # .. plot the results .. 110 | fig, ax = plt.subplots(2, 4, sharey=False) 111 | xlim = [2000, 2000, 1000, 2000] 112 | markevery = [x//5 for x in xlim] 113 | for i, beta in enumerate(all_betas): 114 | ax[0, i].set_title(r'$\lambda=%s$' % beta) 115 | ax[0, i].set_title(r'$\lambda=%s$' % beta) 116 | ax[0, i].plot(out_img[i] / np.max(out_img[i])) 117 | ax[0, i].plot(ground_truth / np.max(ground_truth)) 118 | ax[0, i].set_xticks(()) 119 | ax[0, i].set_yticks(()) 120 | ax[0, i].set_ylim((-0.5, 1.5)) 121 | 122 | fmin = min(np.min(all_trace_ls[i]), np.min(all_trace_nols[i])) 123 | scale = 1. 
# all_trace_ls[i][0] - fmin 124 | plot_tos, = ax[1, i].plot( 125 | (all_trace_ls[i] - fmin) / scale, '--', 126 | lw=2, marker='o', markevery=markevery[i], 127 | markersize=5) 128 | 129 | plot_nols, = ax[1, i].plot( 130 | (all_trace_nols[i] - fmin) / scale, 131 | lw=2, marker='h', markevery=markevery[i], 132 | markersize=5) 133 | 134 | plot_pdhg, = ax[1, i].plot( 135 | (all_trace_pdhg[i] - fmin) / scale, 136 | lw=2, marker='^', markevery=markevery[i], 137 | markersize=5) 138 | 139 | plot_pdhg_nols, = ax[1, i].plot( 140 | (all_trace_pdhg_nols[i] - fmin) / scale, 141 | lw=2, marker='d', markevery=markevery[i], 142 | markersize=5) 143 | 144 | ax[1, i].set_xlabel('Iterations') 145 | ax[1, i].set_yscale('log') 146 | ax[1, i].set_ylim((1e-10, None)) 147 | ax[1, i].set_xlim((0, xlim[i])) 148 | ax[1, i].grid(True) 149 | 150 | 151 | plt.gcf().subplots_adjust(bottom=0.25) 152 | plt.figlegend( 153 | (plot_tos, plot_nols, plot_pdhg, plot_pdhg_nols), 154 | ('TOS with line search', 'TOS without line search', 'PDHG with line search', 'PDHG without line search'), 'lower center', ncol=2, 155 | scatterpoints=1, frameon=False,) 156 | 157 | ax[1, 0].set_ylabel('Objective minus optimum') 158 | plt.show() 159 | -------------------------------------------------------------------------------- /examples/proximal_splitting/plot_sparse_nuclear_norm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Estimating a sparse and low rank matrix 3 | ======================================= 4 | 5 | """ 6 | import copt.loss 7 | import copt.penalty 8 | 9 | print(__doc__) 10 | import numpy as np 11 | from scipy.sparse import linalg as splinalg 12 | import matplotlib.pyplot as plt 13 | import copt as cp 14 | 15 | # .. Generate synthetic data .. 16 | np.random.seed(1) 17 | 18 | sigma_2 = 0.6 19 | N = 100 20 | d = 20 21 | blocks = np.array([2 * d / 10, 1 * d / 10, 1 * d / 10, 3 * d / 10, 3 * d / 10]).astype( 22 | np.int 23 | ) 24 | epsilon = 10 ** (-15) 25 | 26 | mu = np.zeros(d) 27 | Sigma = np.zeros((d, d)) 28 | blck = 0 29 | for k in range(len(blocks)): 30 | v = 2 * np.random.rand(int(blocks[k]), 1) 31 | v = v * (abs(v) > 0.9) 32 | Sigma[blck : blck + blocks[k], blck : blck + blocks[k]] = np.dot(v, v.T) 33 | blck = blck + blocks[k] 34 | X = np.random.multivariate_normal( 35 | mu, Sigma + epsilon * np.eye(d), N 36 | ) + sigma_2 * np.random.randn(N, d) 37 | Sigma_hat = np.cov(X.T) 38 | 39 | threshold = 1e-5 40 | Sigma[np.abs(Sigma) < threshold] = 0 41 | Sigma[np.abs(Sigma) >= threshold] = 1 42 | 43 | # .. generate some data .. 44 | 45 | max_iter = 5000 46 | 47 | n_features = np.multiply(*Sigma.shape) 48 | n_samples = n_features 49 | print("#features", n_features) 50 | A = np.random.randn(n_samples, n_features) 51 | 52 | sigma = 1.0 53 | b = A.dot(Sigma.ravel()) + sigma * np.random.randn(n_samples) 54 | 55 | # .. compute the step-size .. 56 | s = splinalg.svds(A, k=1, return_singular_vectors=False, tol=1e-3, maxiter=500)[0] 57 | f = copt.loss.HuberLoss(A, b) 58 | step_size = 1.0 / f.lipschitz 59 | 60 | # .. run the solver for different values .. 61 | # .. of the regularization parameter beta .. 
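# .. Editor's note: illustrative sketch, not part of the original example.    ..
# .. The trace-norm penalty enters through its proximal operator, singular    ..
# .. value soft-thresholding, while the l1 penalty uses elementwise           ..
# .. soft-thresholding. The helpers below are conceptual stand-ins for        ..
# .. copt.penalty.TraceNorm.prox and copt.penalty.L1Norm.prox, written with   ..
# .. plain NumPy for clarity.                                                 ..
import numpy as np


def svd_soft_threshold(x, threshold, shape):
    """Shrink the singular values of x (given as a flat vector) by threshold."""
    U, s, Vt = np.linalg.svd(x.reshape(shape), full_matrices=False)
    return (U @ np.diag(np.maximum(s - threshold, 0.0)) @ Vt).ravel()


def soft_threshold(x, threshold):
    """Elementwise soft-thresholding, the proximal operator of the l1 norm."""
    return np.sign(x) * np.maximum(np.abs(x) - threshold, 0.0)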
62 | all_betas = [0, 1e-3, 1e-2, 1e-1] 63 | all_trace_ls, all_trace_nols, all_trace_pdhg_nols, all_trace_pdhg = [], [], [], [] 64 | all_trace_ls_time, all_trace_nols_time, all_trace_pdhg_nols_time, all_trace_pdhg_time = ( 65 | [], 66 | [], 67 | [], 68 | [], 69 | ) 70 | out_img = [] 71 | for i, beta in enumerate(all_betas): 72 | print("beta = %s" % beta) 73 | G1 = copt.penalty.TraceNorm(beta, Sigma.shape) 74 | G2 = copt.penalty.L1Norm(beta) 75 | 76 | def loss(x): 77 | return f(x) + G1(x) + G2(x) 78 | 79 | cb_tosls = cp.utils.Trace() 80 | x0 = np.zeros(n_features) 81 | tos_ls = cp.minimize_three_split( 82 | f.f_grad, 83 | x0, 84 | G2.prox, 85 | G1.prox, 86 | step_size=5 * step_size, 87 | max_iter=max_iter, 88 | tol=1e-14, 89 | verbose=1, 90 | callback=cb_tosls, 91 | h_Lipschitz=beta, 92 | ) 93 | trace_ls = np.array([loss(x) for x in cb_tosls.trace_x]) 94 | all_trace_ls.append(trace_ls) 95 | all_trace_ls_time.append(cb_tosls.trace_time) 96 | 97 | cb_tos = cp.utils.Trace() 98 | x0 = np.zeros(n_features) 99 | tos = cp.minimize_three_split( 100 | f.f_grad, 101 | x0, 102 | G1.prox, 103 | G2.prox, 104 | step_size=step_size, 105 | max_iter=max_iter, 106 | tol=1e-14, 107 | verbose=1, 108 | line_search=False, 109 | callback=cb_tos, 110 | ) 111 | trace_nols = np.array([loss(x) for x in cb_tos.trace_x]) 112 | all_trace_nols.append(trace_nols) 113 | all_trace_nols_time.append(cb_tos.trace_time) 114 | out_img.append(tos.x) 115 | 116 | # .. plot the results .. 117 | f, ax = plt.subplots(2, 4, sharey=False) 118 | xlim = [0.02, 0.02, 0.1] 119 | for i, beta in enumerate(all_betas): 120 | ax[0, i].set_title(r"$\lambda=%s$" % beta) 121 | ax[0, i].set_title(r"$\lambda=%s$" % beta) 122 | ax[0, i].imshow( 123 | out_img[i].reshape(Sigma.shape), interpolation="nearest", cmap=plt.cm.gray_r 124 | ) 125 | ax[0, i].set_xticks(()) 126 | ax[0, i].set_yticks(()) 127 | 128 | fmin = min(np.min(all_trace_ls[i]), np.min(all_trace_nols[i])) 129 | plot_tos, = ax[1, i].plot( 130 | all_trace_ls[i] - fmin, lw=4, marker="o", markevery=100, markersize=10 131 | ) 132 | 133 | plot_nols, = ax[1, i].plot( 134 | all_trace_nols[i] - fmin, lw=4, marker="h", markevery=100, markersize=10 135 | ) 136 | 137 | ax[1, i].set_xlabel("Iterations") 138 | ax[1, i].set_yscale("log") 139 | ax[1, i].set_ylim((1e-15, None)) 140 | ax[1, i].set_xlim((0, 2000)) 141 | ax[1, i].grid(True) 142 | 143 | 144 | plt.gcf().subplots_adjust(bottom=0.15) 145 | plt.figlegend( 146 | (plot_tos, plot_nols), 147 | ("TOS with line search", "TOS without line search"), 148 | ncol=5, 149 | scatterpoints=1, 150 | loc=(-0.00, -0.0), 151 | frameon=False, 152 | bbox_to_anchor=[0.05, 0.01], 153 | ) 154 | 155 | ax[1, 0].set_ylabel("Objective minus optimum") 156 | plt.show() 157 | -------------------------------------------------------------------------------- /examples/proximal_splitting/plot_tv_deblurring.py: -------------------------------------------------------------------------------- 1 | # python3 2 | """ 3 | Total variation regularization 4 | ============================== 5 | 6 | Comparison of solvers with total variation regularization. 
7 | """ 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | from PIL import Image 11 | from scipy import misc 12 | from scipy import sparse 13 | 14 | import copt as cp 15 | 16 | np.random.seed(0) 17 | 18 | img = misc.face(gray=True).astype(float) 19 | # resize 20 | img = np.array(Image.fromarray(img).resize((153, 115))) 21 | img = img.astype(float) / img.max() 22 | 23 | n_rows, n_cols = img.shape 24 | n_features = n_rows * n_cols 25 | n_samples = n_features 26 | max_iter = 2000 27 | 28 | # .. compute blurred and noisy image .. 29 | A = sparse.load_npz("data/blur_matrix.npz") 30 | b = A.dot(img.ravel()) 31 | 32 | np.random.seed(0) 33 | 34 | # .. compute the step-size .. 35 | f = cp.loss.SquareLoss(A, b) 36 | step_size = 1.0 / f.lipschitz 37 | 38 | 39 | def loss(x, pen): 40 | x_mat = x.reshape((n_rows, n_cols)) 41 | tmp1 = np.abs(np.diff(x_mat, axis=0)) 42 | tmp2 = np.abs(np.diff(x_mat, axis=1)) 43 | return f(x) + pen * (tmp1.sum() + tmp2.sum()) 44 | 45 | 46 | # .. run the solver for different values .. 47 | # .. of the regularization parameter beta .. 48 | all_betas = [0, 1e-7, 1e-6] 49 | all_trace_ls, all_trace_nols, all_trace_pdhg, out_img = [], [], [], [] 50 | all_trace_ls_time, all_trace_nols_time, all_trace_pdhg_time = [], [], [] 51 | for i, beta in enumerate(all_betas): 52 | print("Iteration %s, beta %s" % (i, beta)) 53 | 54 | def g_prox(x, gamma, pen=beta): 55 | return cp.tv_prox.prox_tv1d_cols(gamma * pen, x, n_rows, n_cols) 56 | 57 | def h_prox(x, gamma, pen=beta): 58 | return cp.tv_prox.prox_tv1d_rows(gamma * pen, x, n_rows, n_cols) 59 | 60 | cb_adatos = cp.utils.Trace() 61 | adatos = cp.minimize_three_split( 62 | f.f_grad, 63 | np.zeros(n_features), 64 | g_prox, 65 | h_prox, 66 | step_size=step_size, 67 | max_iter=max_iter, 68 | tol=0, 69 | callback=cb_adatos, 70 | h_Lipschitz=beta, 71 | ) 72 | trace_ls = [loss(x, beta) for x in cb_adatos.trace_x] 73 | all_trace_ls.append(trace_ls) 74 | all_trace_ls_time.append(cb_adatos.trace_time) 75 | out_img.append(adatos.x.reshape(img.shape)) 76 | 77 | cb_tos = cp.utils.Trace() 78 | cp.minimize_three_split( 79 | f.f_grad, 80 | np.zeros(n_features), 81 | g_prox, 82 | h_prox, 83 | step_size=step_size, 84 | max_iter=max_iter, 85 | tol=0, 86 | callback=cb_tos, 87 | line_search=False, 88 | ) 89 | trace_nols = [loss(x, beta) for x in cb_tos.trace_x] 90 | all_trace_nols.append(trace_nols) 91 | all_trace_nols_time.append(cb_tos.trace_time) 92 | 93 | cb_pdhg = cp.utils.Trace() 94 | cp.minimize_primal_dual( 95 | f.f_grad, 96 | np.zeros(n_features), 97 | g_prox, 98 | h_prox, 99 | callback=cb_pdhg, 100 | max_iter=max_iter, 101 | step_size=step_size, 102 | step_size2=(1. / step_size) / 2, 103 | line_search=False, 104 | ) 105 | trace_pdhg = np.array([loss(x, beta) for x in cb_pdhg.trace_x]) 106 | all_trace_pdhg.append(trace_pdhg) 107 | all_trace_pdhg_time.append(cb_pdhg.trace_time) 108 | 109 | # .. plot the results .. 
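# .. Editor's note: illustrative sketch, not part of the original example.    ..
# .. minimize_three_split, used above with the row- and column-wise TV        ..
# .. proxes, is based on a Davis-Yin-type scheme for f + g + h. One common    ..
# .. textbook form of that iteration is sketched below with plain NumPy;      ..
# .. copt's implementation adds line search, certificates and other           ..
# .. refinements, so treat this only as a conceptual outline.                 ..
import numpy as np


def three_operator_splitting(grad_f, prox_g, prox_h, z0, step_size, max_iter=100):
    z = z0.copy()
    x_g = z0.copy()
    for _ in range(max_iter):
        x_g = prox_g(z, step_size)
        x_h = prox_h(2 * x_g - z - step_size * grad_f(x_g), step_size)
        z = z + x_h - x_g
    return x_g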
110 | f, ax = plt.subplots(2, 3, sharey=False) 111 | xlim = [0.02, 0.02, 0.1] 112 | for i, beta in enumerate(all_betas): 113 | ax[0, i].set_title(r"$\lambda=%s$" % beta) 114 | ax[0, i].imshow(out_img[i], interpolation="nearest", cmap=plt.cm.gray) 115 | ax[0, i].set_xticks(()) 116 | ax[0, i].set_yticks(()) 117 | 118 | fmin = min(np.min(all_trace_ls[i]), np.min(all_trace_pdhg[i])) 119 | scale = all_trace_ls[i][0] - fmin 120 | plot_tos, = ax[1, i].plot( 121 | (all_trace_ls[i] - fmin) / scale, 122 | "--", 123 | lw=2, 124 | marker="o", 125 | markevery=400, 126 | markersize=7, 127 | ) 128 | 129 | plot_tos_nols, = ax[1, i].plot( 130 | (all_trace_nols[i] - fmin) / scale, 131 | lw=2, 132 | marker="<", 133 | markevery=400, 134 | markersize=7, 135 | ) 136 | 137 | plot_pdhg, = ax[1, i].plot( 138 | (all_trace_pdhg[i] - fmin) / scale, 139 | "--", 140 | lw=2, 141 | marker="^", 142 | markevery=400, 143 | markersize=7, 144 | ) 145 | 146 | ax[1, i].set_xlabel("Iterations") 147 | ax[1, i].set_yscale("log") 148 | ax[1, i].set_ylim((1e-14, None)) 149 | ax[1, i].set_xlim((0, 1500)) 150 | ax[1, i].grid(True) 151 | 152 | 153 | plt.gcf().subplots_adjust(bottom=0.25) 154 | plt.figlegend( 155 | (plot_tos, plot_tos_nols, plot_pdhg), 156 | ( 157 | "Adaptive three operator splitting", 158 | "Three operator splitting", 159 | "Primal-dual hybrid gradient", 160 | ), 161 | "lower center", 162 | ncol=2, 163 | scatterpoints=1, 164 | frameon=False, 165 | ) 166 | 167 | ax[1, 0].set_ylabel("Objective minus optimum") 168 | plt.show() 169 | -------------------------------------------------------------------------------- /examples/pytorch/README.txt: -------------------------------------------------------------------------------- 1 | .. _pytorch_examples: 2 | 3 | PyTorch integration 4 | ------------------ 5 | 6 | Examples that optimize PyTorch functions. -------------------------------------------------------------------------------- /examples/pytorch/adversarial_example.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import copt 4 | from copt.utils_pytorch import make_func_and_grad 5 | 6 | from robustbench.data import load_cifar10 7 | from robustbench.utils import load_model 8 | 9 | import matplotlib.pyplot as plt 10 | 11 | n_examples = 20 12 | data_batch, target_batch = load_cifar10(n_examples=n_examples, data_dir='~/datasets') 13 | 14 | model = load_model("Standard") 15 | criterion = torch.nn.CrossEntropyLoss() 16 | 17 | # Define the constraint set + initial point 18 | alpha = 10. 19 | constraint = copt.constraint.L1Ball(alpha) 20 | 21 | for data, target in zip(data_batch, target_batch): 22 | data, target = data.unsqueeze(0), target.unsqueeze(0) 23 | 24 | # Define the loss function to be minimized, using Pytorch 25 | def loss_fun(delta): 26 | adv_input = data + delta 27 | return -criterion(model(adv_input), target) 28 | 29 | # Change the function to f_grad: returns loss_val, grad in flattened, numpy array 30 | f_grad = make_func_and_grad(loss_fun, data.shape, data.device, dtype=data.dtype) 31 | 32 | img_np = data.cpu().numpy().squeeze().flatten() 33 | 34 | def image_constraint_prox(delta, step_size=None): 35 | """Projects perturbation delta so that x + delta is in the set of images, 36 | i.e. 
the (0, 1) range.""" 37 | adv_img_np = img_np + delta 38 | delta = adv_img_np.clip(0, 1) - img_np 39 | return delta 40 | 41 | callback = copt.utils.Trace(lambda delta: f_grad(delta)[0]) 42 | 43 | delta0 = np.zeros(data.shape, dtype=float).flatten() 44 | 45 | sol = copt.minimize_three_split(f_grad, delta0, constraint.prox, 46 | image_constraint_prox, callback=callback, 47 | max_iter=50 48 | ) 49 | 50 | fig, ax = plt.subplots() 51 | ax.plot([-loss_val for loss_val in callback.trace_fx], lw=3) 52 | ax.set_yscale("log") 53 | ax.set_xlabel("# Iterations") 54 | ax.set_ylabel("Objective value") 55 | ax.grid() 56 | 57 | plt.show() 58 | 59 | classes = ('plane', 'car', 'bird', 'cat', 60 | 'deer', 'dog', 'frog', 'horse', 61 | 'ship', 'truck') 62 | 63 | img = data.cpu().numpy().squeeze() 64 | perturbation = sol.x.reshape(img.shape) 65 | adv_img = img + perturbation 66 | 67 | img = img.transpose(1, 2, 0) 68 | perturbation = perturbation.transpose(1, 2, 0) 69 | adv_img = adv_img.transpose(1, 2, 0) 70 | 71 | # Project back so that 0 <= x + delta <= 1 72 | adv_img = np.clip(adv_img, 0, 1) 73 | perturbation = adv_img - img 74 | 75 | fig, axes = plt.subplots(ncols=3) 76 | img_ax, pert_ax, adv_img_ax = axes 77 | 78 | output = torch.nn.functional.softmax(model(data), dim=-1) 79 | label = torch.argmax(output) 80 | 81 | pert_tensor = torch.tensor(sol.x, dtype=data.dtype).to(data.device) 82 | pert_tensor = pert_tensor.reshape(data.shape) 83 | adv_output = torch.nn.functional.softmax(model(torch.clamp(data + pert_tensor, 0., 1.)), dim=-1) 84 | adv_label = torch.argmax(adv_output) 85 | 86 | img_ax.set_title(f"Original image: {classes[label]}, p={output[:, label].item():.2f}") 87 | img_ax.imshow(img) 88 | 89 | pert_ax.set_title("Perturbation") 90 | pert_ax.imshow(abs(perturbation)) 91 | 92 | adv_img_ax.set_title(f"Perturbed image: {classes[adv_label]}, p={adv_output[:, adv_label].item():.2f}") 93 | adv_img_ax.imshow(adv_img) 94 | plt.tight_layout() 95 | 96 | plt.show() 97 | -------------------------------------------------------------------------------- /examples/pytorch/adversarial_example_accuracies.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | import numpy as np 3 | import torch 4 | 5 | from tqdm import tqdm 6 | 7 | import copt 8 | from copt.utils_pytorch import make_func_and_grad 9 | 10 | from robustbench.data import load_cifar10 11 | from robustbench.utils import load_model 12 | 13 | 14 | n_examples = 10000 15 | data_batch, target_batch = load_cifar10(n_examples=n_examples, data_dir='~/datasets') 16 | 17 | model_name = "Engstrom2019Robustness" 18 | model = load_model(model_name) 19 | criterion = torch.nn.CrossEntropyLoss() 20 | 21 | # Define the constraint set 22 | alpha = 0.5 23 | constraint = copt.constraint.L2Ball(alpha) 24 | 25 | n_correct = 0 26 | n_correct_adv = 0 27 | 28 | 29 | # Define the loss function to be minimized, using Pytorch 30 | def loss_fun(delta, data): 31 | adv_input = data + delta 32 | return -criterion(model(adv_input), target) 33 | 34 | 35 | print(f"Evaluating model {model_name}, on L{constraint.p}Ball({alpha}).") 36 | 37 | for k, (data, target) in tqdm(enumerate(zip(data_batch, target_batch))): 38 | data, target = data.unsqueeze(0), target.unsqueeze(0) 39 | 40 | loss_fun_data = partial(loss_fun, data=data) 41 | # Change the function to f_grad: returns loss_val, grad in flattened, numpy array 42 | f_grad = make_func_and_grad(loss_fun_data, data.shape, data.device, dtype=data.dtype) 43 | 44 | img_np = 
data.cpu().numpy().squeeze().flatten() 45 | 46 | def image_constraint_prox(delta, step_size=None): 47 | """Projects perturbation delta so that x + delta is in the set of images, 48 | i.e. the (0, 1) range.""" 49 | adv_img_np = img_np + delta 50 | delta = adv_img_np.clip(0, 1) - img_np 51 | return delta 52 | 53 | delta0 = np.zeros(data.shape, dtype=float).flatten() 54 | 55 | callback = copt.utils.Trace(lambda delta: f_grad(delta)[0]) 56 | 57 | sol = copt.minimize_three_split(f_grad, delta0, constraint.prox, 58 | image_constraint_prox, callback=callback, 59 | max_iter=25 60 | ) 61 | label = torch.argmax(model(data), dim=-1) 62 | 63 | pert_tensor = torch.tensor(sol.x, dtype=data.dtype).to(data.device) 64 | pert_tensor = pert_tensor.reshape(data.shape) 65 | adv_label = torch.argmax(model(torch.clamp(data + pert_tensor, 0., 1.)), dim=-1) 66 | 67 | n_correct += (label == target).item() 68 | n_correct_adv += (adv_label == target).item() 69 | 70 | accuracy = n_correct / n_examples 71 | accuracy_adv = n_correct_adv / n_examples 72 | 73 | print(f"Accuracy: {accuracy:.3f}") 74 | print(f"Robust accuracy: {accuracy_adv:.3f}") 75 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel", "numpy", "scipy", "tqdm", "scikit-learn", "six"] # PEP 518 specifications. 3 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | filterwarnings = 3 | ignore::PendingDeprecationWarning 4 | ignore::RuntimeWarning 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy 3 | pytest >= 3.8.0 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | import io 3 | import setuptools 4 | 5 | CLASSIFIERS = """\ 6 | Development Status :: 5 - Production/Stable 7 | Intended Audience :: Science/Research 8 | Intended Audience :: Developers 9 | License :: OSI Approved 10 | Programming Language :: Python 11 | Programming Language :: Python :: 2 12 | Programming Language :: Python :: 3 13 | Topic :: Software Development 14 | Operating System :: POSIX 15 | Operating System :: Unix 16 | 17 | """ 18 | 19 | setup( 20 | name="copt", 21 | description="Library for composite optimization in Python", 22 | long_description=io.open("README.rst", encoding="utf-8").read(), 23 | version="0.9.1", 24 | author="Fabian Pedregosa", 25 | author_email="f@bianp.net", 26 | url="http://pypi.python.org/pypi/copt", 27 | packages=["copt"], 28 | install_requires=["numpy", "scipy", "tqdm", "scikit-learn", "six"], 29 | classifiers=[_f for _f in CLASSIFIERS.split("\n") if _f], 30 | package_data={"copt": ["data/img1.csv"]}, 31 | license="New BSD License", 32 | ) 33 | -------------------------------------------------------------------------------- /tests/test_frank_wolfe.py: -------------------------------------------------------------------------------- 1 | """Tests for the Frank-Wolfe algorithm.""" 2 | import numpy as np 3 | import pytest 4 | from scipy import optimize 5 | import copt as cp 6 | 7 | np.random.seed(0) 8 | n_samples, n_features = 20, 16 9 | 
A = np.random.randn(n_samples, n_features) 10 | w = np.random.randn(n_features) 11 | b = A.dot(w) + np.random.randn(n_samples) 12 | 13 | # we will use a logistic loss, which can't have values 14 | # greater than 1 15 | b = np.abs(b / np.max(np.abs(b))) 16 | 17 | LOSS_FUNCS = [cp.loss.LogLoss, cp.loss.SquareLoss] 18 | 19 | 20 | def test_fw_api(): 21 | """Check that FW takes the right arguments and raises the right exceptions.""" 22 | 23 | # test that the algorithm does not fail if x0 24 | # is a list 25 | f = cp.loss.LogLoss(A, b, 1.0 / n_samples) 26 | cb = cp.utils.Trace(f) 27 | alpha = 1.0 28 | l1ball = cp.constraint.L1Ball(alpha) 29 | cp.minimize_frank_wolfe( 30 | f.f_grad, 31 | [0] * n_features, 32 | l1ball.lmo, 33 | tol=0, 34 | lipschitz=f.lipschitz, 35 | callback=cb, 36 | ) 37 | 38 | # check that we raise an exception when the DR step-size is used 39 | # but no lipschitz constant is given 40 | with pytest.raises(ValueError): 41 | cp.minimize_frank_wolfe(f.f_grad, [0] * n_features, l1ball.lmo, step="DR") 42 | 43 | 44 | @pytest.mark.parametrize("alpha", [0.1, 1.0, 10.0, 100.0]) 45 | @pytest.mark.parametrize("loss_grad", LOSS_FUNCS) 46 | def test_fw_l1(loss_grad, alpha): 47 | """Test result of FW algorithm with L1 constraint.""" 48 | f = loss_grad(A, b, 1.0 / n_samples) 49 | cb = cp.utils.Trace(f) 50 | l1ball = cp.constraint.L1Ball(alpha) 51 | opt = cp.minimize_frank_wolfe( 52 | f.f_grad, 53 | np.zeros(n_features), 54 | l1ball.lmo, 55 | tol=1e-3, 56 | lipschitz=f.lipschitz, 57 | callback=cb, 58 | ) 59 | assert np.isfinite(opt.x).sum() == n_features 60 | 61 | ss = 1 / f.lipschitz 62 | grad = f.f_grad(opt.x)[1] 63 | grad_map = (opt.x - l1ball.prox(opt.x - ss * grad, ss)) / ss 64 | assert np.linalg.norm(grad_map) < 0.3 65 | 66 | 67 | def test_callback(): 68 | """Make sure that the algorithm exits when the callback returns False.""" 69 | 70 | def cb(_): 71 | return False 72 | 73 | l1ball = cp.constraint.L1Ball(1) 74 | f = cp.loss.SquareLoss(A, b) 75 | opt = cp.minimize_frank_wolfe(f.f_grad, np.zeros(n_features), l1ball.lmo, callback=cb) 76 | assert opt.nit < 2 77 | 78 | 79 | def exact_line_search(kw): 80 | 81 | def f_on_line(gamma): 82 | return kw["func_and_grad"](kw["x"] + gamma * kw["update_direction"])[0] 83 | 84 | line_sol = optimize.minimize_scalar(f_on_line, method='bounded', bounds=[0, kw["max_step_size"]]) 85 | return line_sol.x 86 | 87 | 88 | @pytest.mark.parametrize("alpha", [0.1, 1.0, 10.0, 100.0]) 89 | @pytest.mark.parametrize("obj", LOSS_FUNCS) 90 | @pytest.mark.parametrize("step", ["DR", "backtracking", "sublinear", exact_line_search]) 91 | def test_fw_backtrack(obj, step, alpha): 92 | """Test FW with different options of the line-search strategy.""" 93 | f = obj(A, b, 1.0 / n_samples) 94 | traceball = cp.constraint.TraceBall(alpha, (4, 4)) 95 | opt = cp.minimize_frank_wolfe( 96 | f.f_grad, 97 | np.zeros(n_features), 98 | traceball.lmo, 99 | tol=0, 100 | lipschitz=f.lipschitz, 101 | step=step, 102 | max_iter=1000, 103 | ) 104 | assert np.isfinite(opt.x).sum() == n_features 105 | 106 | ss = 1 / f.lipschitz 107 | grad = f.f_grad(opt.x)[1] 108 | # this is the proximal mapping, zero at optimum 109 | grad_map = (opt.x - traceball.prox(opt.x - ss * grad, ss)) / ss 110 | assert np.linalg.norm(grad_map) < 0.4 111 | 112 | 113 | @pytest.mark.parametrize("alpha", [0.1, 1.0, 10.0, 100.0]) 114 | @pytest.mark.parametrize("obj", LOSS_FUNCS) 115 | @pytest.mark.parametrize("step", ["DR", "backtracking", exact_line_search]) 116 | def test_pairwise_fw(obj, step, alpha): 117 | """Test the
Pairwise FW method.""" 118 | f = obj(A, b, 1.0 / n_samples) 119 | 120 | l1ball = cp.constraint.L1Ball(alpha) 121 | x0 = np.zeros(A.shape[1]) 122 | x0[0] = alpha 123 | cb = cp.utils.Trace(f) 124 | opt = cp.minimize_frank_wolfe( 125 | f.f_grad, x0, l1ball.lmo_pairwise, x0_rep=(1., 0), 126 | step=step, lipschitz=f.lipschitz, callback=cb, 127 | variant='pairwise' 128 | ) 129 | assert np.isfinite(opt.x).sum() == n_features 130 | 131 | ss = 1 / f.lipschitz 132 | grad = f.f_grad(opt.x)[1] 133 | # this is the proximal mapping, zero at optimum 134 | grad_map = (opt.x - l1ball.prox(opt.x - ss * grad, ss)) / ss 135 | 136 | assert np.linalg.norm(grad_map) < 0.2 137 | -------------------------------------------------------------------------------- /tests/test_loss.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copt as cp 3 | from scipy import optimize 4 | from scipy import sparse 5 | 6 | import copt.loss 7 | 8 | n_samples, n_features = 100, 10 9 | A_dense = np.random.randn(n_samples, n_features) 10 | b = np.random.uniform(0, 1, size=n_samples) 11 | A_sparse = sparse.rand(n_samples, n_features, density=0.5, format="csr") 12 | 13 | 14 | def test_loss_grad(): 15 | for A in (A_dense, A_sparse): 16 | for loss in [copt.loss.LogLoss, copt.loss.SquareLoss, copt.loss.HuberLoss]: 17 | f = loss(A, b) 18 | err = optimize.check_grad( 19 | f, lambda x: f.f_grad(x)[1], np.random.randn(n_features) 20 | ) 21 | assert err < 1e-6 22 | 23 | 24 | def test_log_hess(): 25 | for A in (A_dense, A_sparse): 26 | f = copt.loss.LogLoss(A, b) 27 | x = np.random.randn(n_features) 28 | Hs = f.hessian_mv(x) 29 | 30 | def obj(x): 31 | return f.f_grad(x)[1][0] 32 | 33 | def grad(x): 34 | return f.hessian_mv(x)(np.eye(x.size)[0]) 35 | 36 | err = optimize.check_grad(obj, grad, np.random.randn(n_features)) 37 | assert err < 1e-6 38 | -------------------------------------------------------------------------------- /tests/test_matmul_speedup.py: -------------------------------------------------------------------------------- 1 | import scipy.sparse as sparse 2 | import numpy as np 3 | import copt as cp 4 | 5 | n_samples, n_features = 1000, 100 6 | n_subset = 5 7 | A_sparse = sparse.rand(n_samples, n_features, density=0.5, format="csr") 8 | x = np.random.rand(n_features) 9 | u = np.random.rand(n_subset) 10 | idx = np.random.choice(n_samples, n_subset) 11 | 12 | 13 | def test_fast_csr_vm(): 14 | res = cp.utils.fast_csr_vm(u, 15 | A_sparse.data, A_sparse.indptr, A_sparse.indices, 16 | n_features, idx) 17 | assert np.allclose(res, cp.utils.safe_sparse_dot(u, A_sparse[idx])) 18 | 19 | 20 | def test_fast_csr_mv(): 21 | res = cp.utils.fast_csr_mv(A_sparse.data, A_sparse.indptr, A_sparse.indices, 22 | x, idx) 23 | assert np.allclose(res, cp.utils.safe_sparse_dot(A_sparse[idx], x)) 24 | -------------------------------------------------------------------------------- /tests/test_penalties.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copt as cp 3 | import copt.constraint 4 | import copt.penalty 5 | from copt import tv_prox 6 | from numpy import testing 7 | import pytest 8 | 9 | proximal_penalties = [ 10 | copt.penalty.L1Norm(1.0), 11 | copt.penalty.GroupL1(1.0, np.array_split(np.arange(16), 5)), 12 | copt.penalty.TraceNorm(1.0, (4, 4)), 13 | copt.constraint.TraceBall(1.0, (4, 4)), 14 | copt.penalty.TotalVariation2D(1.0, (4, 4)), 15 | copt.penalty.FusedLasso(1.0), 16 | ] 17 | 18 | 19 | def test_GroupL1(): 20 | 
groups = [(0, 1), (2, 3)] 21 | g1 = copt.penalty.GroupL1(1.0, groups) 22 | _, B = g1.prox_factory(5) 23 | assert np.all( 24 | B.toarray() 25 | == np.array( 26 | [ 27 | [1.0, 1.0, 0.0, 0.0, 0.0], 28 | [0.0, 0.0, 1.0, 1.0, 0.0], 29 | [0.0, 0.0, 0.0, 0.0, -1.0], 30 | ] 31 | ) 32 | ) 33 | 34 | groups = [(0, 1), (3, 4)] 35 | g2 = copt.penalty.GroupL1(1.0, groups) 36 | _, B = g2.prox_factory(5) 37 | assert np.all( 38 | B.toarray() 39 | == np.array( 40 | [ 41 | [1.0, 1.0, 0.0, 0.0, 0.0], 42 | [0.0, 0.0, -1.0, 0.0, 0.0], 43 | [0.0, 0.0, 0.0, 1.0, 1.0], 44 | ] 45 | ) 46 | ) 47 | 48 | 49 | # 50 | # for blocks in [[(0, 1), (2, 3)], ]: 51 | # pen = cp.utils.GroupL1(1., blocks) 52 | # counter = 0 53 | # for g in pen.groups: 54 | # for j in g: 55 | # counter += 1 56 | # assert counter == blocks.size 57 | # assert pen.groups 58 | # for g in pen.groups: 59 | # assert np.unique(blocks[g]).size == 1 60 | 61 | 62 | def test_tv1_prox(): 63 | """ 64 | Use the properties of strongly convex functions to test the implementation 65 | of the TV1D proximal operator. In particular, we use the following inequality 66 | applied to the proximal objective function: if f is mu-strongly convex then 67 | 68 | f(x) - f(x^*) >= (mu / 2) ||x - x^*||^2 69 | 70 | where x^* is the optimum of f (here the proximal objective is 1-strongly convex). 71 | """ 72 | n_features = 10 73 | gamma = np.random.rand() 74 | epsilon = 1e-10 # account for some numerical errors 75 | 76 | tv_norm = lambda x: np.sum(np.abs(np.diff(x))) 77 | for _ in range(1000): 78 | x = np.random.randn(n_features) 79 | x_next = tv_prox.prox_tv1d(x, gamma) 80 | diff_obj = tv_norm(x) - tv_norm(x_next) 81 | testing.assert_array_less( 82 | ((x - x_next) ** 2).sum() / gamma, (1 + epsilon) * diff_obj 83 | ) 84 | 85 | 86 | def test_tv2_prox(): 87 | """ 88 | Similar test, but for the 2D total variation penalty.
89 | """ 90 | np.random.seed(0) 91 | n_rows, n_cols = 6, 8 92 | n_features = n_rows * n_cols 93 | gamma = np.random.rand() 94 | epsilon = 0.1 # account for some numerical errors 95 | 96 | def tv_norm(x, n_rows, n_cols): 97 | X = x.reshape((n_rows, n_cols)) 98 | return np.sum(np.abs(np.diff(X, 0))) + np.sum(np.abs(np.diff(X, 1))) 99 | 100 | for nrun in range(20): 101 | x = np.random.randn(n_features) 102 | x_next = tv_prox.prox_tv2d(x, gamma, n_rows, n_cols, tol=1e-10, max_iter=10000) 103 | diff_obj = tv_norm(x, n_rows, n_cols) - tv_norm(x_next, n_rows, n_cols) 104 | testing.assert_array_less( 105 | ((x - x_next) ** 2).sum() / gamma, (1 + epsilon) * diff_obj 106 | ) 107 | 108 | 109 | def test_tv2d_linear_operator(): 110 | n_rows, n_cols = 20, 10 111 | 112 | def TV(w): 113 | img = w.reshape((n_rows, n_cols)) 114 | tmp1 = np.abs(np.diff(img, axis=0)) 115 | tmp2 = np.abs(np.diff(img, axis=1)) 116 | return tmp1.sum() + tmp2.sum() 117 | 118 | L = tv_prox.tv2d_linear_operator(n_rows, n_cols) 119 | x = np.random.randn(n_rows * n_cols) 120 | testing.assert_almost_equal(np.abs(L.dot(x)).sum(), TV(x)) 121 | 122 | 123 | @pytest.mark.parametrize("pen", proximal_penalties) 124 | def test_three_inequality(pen): 125 | """Test the L1 prox using the three point inequality 126 | 127 | The three-point inequality is described e.g., in Lemma 1.4 128 | in "Gradient-Based Algorithms with Applications to Signal 129 | Recovery Problems", Amir Beck and Marc Teboulle 130 | """ 131 | n_features = 16 132 | 133 | for _ in range(10): 134 | z = np.random.randn(n_features) 135 | u = np.random.randn(n_features) 136 | xi = pen.prox(z, 1.0) 137 | 138 | lhs = 2 * (pen(xi) - pen(u)) 139 | rhs = ( 140 | np.linalg.norm(u - z) ** 2 141 | - np.linalg.norm(u - xi) ** 2 142 | - np.linalg.norm(xi - z) ** 2 143 | ) 144 | assert lhs <= rhs, pen 145 | -------------------------------------------------------------------------------- /tests/test_proximal_gradient.py: -------------------------------------------------------------------------------- 1 | """Tests for gradient-based methods.""" 2 | import copt as cp 3 | import numpy as np 4 | import pytest 5 | from scipy import optimize 6 | 7 | import copt.loss 8 | import copt.penalty 9 | 10 | np.random.seed(0) 11 | n_samples, n_features = 20, 10 12 | A = np.random.randn(n_samples, n_features) 13 | w = np.random.randn(n_features) 14 | b = A.dot(w) + np.random.randn(n_samples) 15 | 16 | # we will use a logistic loss, which can't have values 17 | # greater than 1 18 | b = np.abs(b / np.max(np.abs(b))) 19 | 20 | 21 | # the accelerated variant, to pass it as a method parameter 22 | def minimize_accelerated(*args, **kw): 23 | kw["accelerated"] = True 24 | return cp.minimize_proximal_gradient(*args, **kw) 25 | 26 | 27 | loss_funcs = [copt.loss.LogLoss, copt.loss.SquareLoss, copt.loss.HuberLoss] 28 | penalty_funcs = [None, copt.penalty.L1Norm] 29 | 30 | 31 | def test_gradient(): 32 | for _ in range(20): 33 | A = np.random.randn(10, 5) 34 | b = np.random.rand(10) 35 | for loss in loss_funcs: 36 | f_grad = loss(A, b).f_grad 37 | f = lambda x: f_grad(x)[0] 38 | grad = lambda x: f_grad(x)[1] 39 | eps = optimize.check_grad(f, grad, np.random.randn(5)) 40 | assert eps < 0.001 41 | 42 | 43 | def certificate(x, grad_x, prox): 44 | if prox is None: 45 | 46 | def prox_(x, _): 47 | return x 48 | 49 | else: 50 | prox_ = prox 51 | 52 | return np.linalg.norm(x - prox_(x - grad_x, 1)) 53 | 54 | 55 | @pytest.mark.parametrize("accelerated", [True, False]) 56 | @pytest.mark.parametrize("loss", loss_funcs) 57 | 
@pytest.mark.parametrize("penalty", penalty_funcs) 58 | def test_optimize(accelerated, loss, penalty): 59 | """Test a method on both the line_search and fixed step size strategy.""" 60 | max_iter = 200 61 | for alpha in np.logspace(-1, 3, 3): 62 | obj = loss(A, b, alpha) 63 | if penalty is not None: 64 | prox = penalty(1e-3).prox 65 | else: 66 | prox = None 67 | opt = cp.minimize_proximal_gradient( 68 | obj.f_grad, 69 | np.zeros(n_features), 70 | prox=prox, 71 | jac=True, 72 | step="backtracking", 73 | max_iter=max_iter, 74 | accelerated=accelerated, 75 | ) 76 | grad_x = obj.f_grad(opt.x)[1] 77 | assert certificate(opt.x, grad_x, prox) < 1e-5 78 | 79 | opt_2 = cp.minimize_proximal_gradient( 80 | obj.f_grad, 81 | np.zeros(n_features), 82 | prox=prox, 83 | jac=True, 84 | max_iter=max_iter, 85 | step=lambda x: 1 / obj.lipschitz, 86 | accelerated=accelerated, 87 | ) 88 | grad_2x = obj.f_grad(opt_2.x)[1] 89 | assert certificate(opt_2.x, grad_2x, prox) < 1e-5 90 | 91 | 92 | @pytest.mark.parametrize( 93 | "solver", 94 | [ 95 | cp.minimize_proximal_gradient, 96 | cp.minimize_three_split, 97 | cp.minimize_primal_dual, 98 | minimize_accelerated, 99 | ], 100 | ) 101 | def test_callback(solver): 102 | """Make sure that the algorithm exits when the callback returns False.""" 103 | 104 | def cb(_): 105 | return False 106 | 107 | f = copt.loss.SquareLoss(A, b) 108 | opt = solver(f.f_grad, np.zeros(n_features), callback=cb) 109 | assert opt.nit < 2 110 | 111 | 112 | @pytest.mark.parametrize( 113 | "solver", [cp.minimize_proximal_gradient, minimize_accelerated] 114 | ) 115 | def test_line_search(solver): 116 | """Test the custom line search option.""" 117 | 118 | def ls_wrong(_): 119 | return -10 120 | 121 | ls_loss = copt.loss.SquareLoss(A, b) 122 | 123 | # define a function with unused arguments for the API 124 | def f_grad(x, r1, r2): 125 | return ls_loss.f_grad(x) 126 | 127 | opt = solver( 128 | f_grad, np.zeros(n_features), step=ls_wrong, args=(None, None), jac=True 129 | ) 130 | assert not opt.success 131 | 132 | # Define an exact line search strategy 133 | def exact_ls(kw): 134 | def f_ls(gamma): 135 | x_next = kw["prox"](kw["x"] - gamma * kw["grad_fk"], gamma) 136 | return kw["func_and_grad"](x_next)[0] 137 | 138 | ls_sol = optimize.minimize_scalar(f_ls, bounds=[0, 1], method="bounded") 139 | return ls_sol.x 140 | 141 | opt = solver( 142 | f_grad, np.zeros(n_features), step=exact_ls, args=(None, None), jac=True 143 | ) 144 | assert opt.success 145 | -------------------------------------------------------------------------------- /tests/test_randomized.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy import sparse 3 | import copt as cp 4 | import copt.loss 5 | import copt.penalty 6 | from copt import randomized 7 | import pytest 8 | 9 | np.random.seed(0) 10 | n_samples, n_features = 20, 10 11 | density = 0.5 12 | A = sparse.random(n_samples, n_features, density=density) 13 | A2 = sparse.random(n_samples, n_features + 1, density=density) 14 | w = np.random.randn(n_features) 15 | b = A.dot(w) + np.random.randn(n_samples) 16 | 17 | # we will use a logistic loss, which can't have values 18 | # greater than 1 19 | b = np.abs(b / np.max(np.abs(b))) 20 | 21 | all_solvers_unconstrained = ( 22 | ["SAGA", cp.minimize_saga, 1e-3], 23 | ["SVRG", cp.minimize_svrg, 1e-3], 24 | ["VRTOS", cp.minimize_vrtos, 1e-3], 25 | ) 26 | 27 | 28 | @pytest.mark.parametrize("name_solver, solver, tol", all_solvers_unconstrained) 29 | def
test_optimize(name_solver, solver, tol): 30 | f = copt.loss.LogLoss(A, b) 31 | for alpha in np.logspace(-3, 3, 3): 32 | L = cp.utils.get_max_lipschitz(A, "logloss") + alpha / density 33 | opt = solver( 34 | f.partial_deriv, 35 | A, 36 | b, 37 | np.zeros(n_features), 38 | 1 / (3 * L), 39 | alpha=alpha, 40 | max_iter=200, 41 | tol=1e-10, 42 | ) 43 | grad = copt.loss.LogLoss(A, b, alpha).f_grad(opt.x)[1] 44 | assert np.linalg.norm(grad) < tol, name_solver 45 | 46 | 47 | def test_saga_l1(): 48 | alpha = 1.0 / n_samples 49 | f = copt.loss.LogLoss(A, b, alpha) 50 | for beta in np.logspace(-3, 3, 3): 51 | pen = copt.penalty.L1Norm(beta) 52 | L = cp.utils.get_max_lipschitz(A, "logloss") + alpha / density 53 | 54 | for solver in [cp.minimize_saga, cp.minimize_svrg]: 55 | opt = solver( 56 | f.partial_deriv, 57 | A, 58 | b, 59 | np.zeros(n_features), 60 | 1 / (3 * L), 61 | alpha=alpha, 62 | max_iter=500, 63 | tol=1e-8, 64 | prox=pen.prox_factory(n_features), 65 | ) 66 | grad = copt.loss.LogLoss(A, b, alpha).f_grad(opt.x)[1] 67 | x = opt.x 68 | ss = 1.0 / L 69 | # check that the gradient mapping vanishes 70 | grad_map = (x - pen.prox(x - ss * grad, ss)) / ss 71 | assert np.linalg.norm(grad_map) < 1e-6 72 | 73 | 74 | def test_vrtos(): 75 | """Test VRTOS with no penalty.""" 76 | alpha = 1.0 / n_samples 77 | f = copt.loss.LogLoss(A, b) 78 | for beta in np.logspace(-3, 3, 3): 79 | L = cp.utils.get_max_lipschitz(A, "logloss") + alpha / density 80 | 81 | opt = cp.minimize_vrtos( 82 | f.partial_deriv, 83 | A, 84 | b, 85 | np.zeros(n_features), 86 | 1 / (3 * L), 87 | alpha=alpha, 88 | max_iter=200, 89 | ) 90 | 91 | grad = copt.loss.LogLoss(A, b, alpha).f_grad(opt.x)[1] 92 | assert np.linalg.norm(grad) < 1e-6 93 | 94 | 95 | def test_vrtos_l1(): 96 | alpha = 1.0 / n_samples 97 | f = copt.loss.LogLoss(A, b, alpha) 98 | for beta in np.logspace(-3, 3, 3): 99 | p_1 = copt.penalty.L1Norm(beta) 100 | L = cp.utils.get_max_lipschitz(A, "logloss") + alpha / density 101 | 102 | # blocks = np.arange(n_features) 103 | opt_1 = cp.minimize_vrtos( 104 | f.partial_deriv, 105 | A, 106 | b, 107 | np.zeros(n_features), 108 | 1 / (3 * L), 109 | alpha=alpha, 110 | max_iter=200, 111 | prox_1=p_1.prox_factory(n_features), 112 | ) 113 | 114 | opt_2 = cp.minimize_vrtos( 115 | f.partial_deriv, 116 | A, 117 | b, 118 | np.zeros(n_features), 119 | 1 / (3 * L), 120 | alpha=alpha, 121 | max_iter=200, 122 | prox_2=p_1.prox_factory(n_features), 123 | ) 124 | 125 | for x in [opt_1.x, opt_2.x]: 126 | full_prox = copt.penalty.L1Norm(beta) 127 | grad = f.f_grad(x)[1] 128 | ss = 1.0 / L 129 | # check that the gradient mapping vanishes 130 | grad_map = (x - full_prox.prox(x - ss * grad, ss)) / ss 131 | assert np.linalg.norm(grad_map) < 1e-6 132 | 133 | 134 | all_groups = [ 135 | [np.arange(5)], 136 | np.arange(5).reshape((-1, 1)), 137 | [np.arange(5), [5], [6], [7], [8], [9]], 138 | [np.arange(5), np.arange(5, 10)], 139 | ] 140 | 141 | 142 | @pytest.mark.parametrize("groups", all_groups) 143 | def test_gl(groups): 144 | alpha = 1.0 / n_samples 145 | f = copt.loss.LogLoss(A, b, alpha) 146 | for beta in np.logspace(-3, 3, 3): 147 | p_1 = copt.penalty.GroupL1(beta, groups) 148 | L = cp.utils.get_max_lipschitz(A, "logloss") + alpha / density 149 | 150 | opt_1 = cp.minimize_vrtos( 151 | f.partial_deriv, 152 | A, 153 | b, 154 | np.zeros(n_features), 155 | 1 / (3 * L), 156 | alpha=alpha, 157 | max_iter=200, 158 | prox_1=p_1.prox_factory(n_features), 159 | ) 160 | 161 | opt_2 = cp.minimize_vrtos( 162 | f.partial_deriv, 163 | A, 164 | b, 165 | 
np.zeros(n_features), 166 | 1 / (3 * L), 167 | alpha=alpha, 168 | max_iter=200, 169 | prox_2=p_1.prox_factory(n_features), 170 | ) 171 | 172 | opt_3 = cp.minimize_saga( 173 | f.partial_deriv, 174 | A, 175 | b, 176 | np.zeros(n_features), 177 | 1 / (3 * L), 178 | alpha=alpha, 179 | max_iter=200, 180 | prox=p_1.prox_factory(n_features), 181 | ) 182 | 183 | for x in [opt_1.x, opt_2.x, opt_3.x]: 184 | grad = copt.loss.LogLoss(A, b, alpha).f_grad(x)[1] 185 | ss = 1.0 / L 186 | # check that the gradient mapping vanishes 187 | grad_map = (x - p_1.prox(x - ss * grad, ss)) / ss 188 | assert np.linalg.norm(grad_map) < 1e-6 189 | 190 | 191 | def test_vrtos_ogl(): 192 | """Test on overlapping group lasso""" 193 | alpha = 1.0 / n_samples 194 | groups_1 = [np.arange(8)] 195 | groups_2 = [np.arange(5, 10)] 196 | f = copt.loss.LogLoss(A, b, alpha) 197 | for beta in np.logspace(-3, 3, 3): 198 | p_1 = copt.penalty.GroupL1(beta, groups_1) 199 | p_2 = copt.penalty.GroupL1(beta, groups_2) 200 | L = cp.utils.get_max_lipschitz(A, "logloss") + alpha / density 201 | 202 | opt_vrtos = cp.minimize_vrtos( 203 | f.partial_deriv, 204 | A, 205 | b, 206 | np.zeros(n_features), 207 | 1 / (3 * L), 208 | alpha=alpha, 209 | max_iter=200, 210 | prox_1=p_1.prox_factory(n_features), 211 | prox_2=p_2.prox_factory(n_features), 212 | ) 213 | 214 | opt_tos = cp.minimize_three_split( 215 | f.f_grad, np.zeros(n_features), prox_1=p_1.prox, prox_2=p_2.prox 216 | ) 217 | 218 | norm = np.linalg.norm(opt_tos.x) 219 | if norm < 1e-10: 220 | norm = 1 221 | assert np.linalg.norm(opt_vrtos.x - opt_tos.x) / norm < 1e-4 222 | 223 | 224 | @pytest.mark.parametrize("A_data", [A, A2]) 225 | def test_vrtos_fl(A_data): 226 | """Test on the fused lasso penalty""" 227 | n_samples, n_features = A_data.shape 228 | alpha = 1.0 / n_samples 229 | f = copt.loss.LogLoss(A_data, b, alpha) 230 | for beta in np.logspace(-3, 3, 3): 231 | pen = copt.penalty.FusedLasso(beta) 232 | L = cp.utils.get_max_lipschitz(A_data, "logloss") + alpha / density 233 | 234 | opt_vrtos = cp.minimize_vrtos( 235 | f.partial_deriv, 236 | A_data, 237 | b, 238 | np.zeros(n_features), 239 | 1 / (3 * L), 240 | alpha=alpha, 241 | max_iter=2000, 242 | prox_1=pen.prox_1_factory(n_features), 243 | prox_2=pen.prox_2_factory(n_features), 244 | tol=0, 245 | ) 246 | 247 | opt_pgd = cp.minimize_proximal_gradient( 248 | f.f_grad, np.zeros(n_features), prox=pen.prox, max_iter=2000, tol=0 249 | ) 250 | 251 | norm = np.linalg.norm(opt_pgd.x) 252 | if norm < 1e-10: 253 | norm = 1 254 | assert np.linalg.norm(opt_vrtos.x - opt_pgd.x) / norm < 1e-4 255 | 256 | # check also the gradient mapping 257 | ss = 1.0 / L 258 | grad = f.f_grad(opt_vrtos.x)[1] 259 | grad_map = (opt_vrtos.x - pen.prox(opt_vrtos.x - ss * grad, ss)) / ss 260 | assert np.linalg.norm(grad_map) < 1e-6 261 | -------------------------------------------------------------------------------- /tests/test_splitting.py: -------------------------------------------------------------------------------- 1 | """Tests for gradient-based methods.""" 2 | import copt as cp 3 | import numpy as np 4 | import pytest 5 | 6 | import copt.loss 7 | import copt.penalty 8 | 9 | np.random.seed(0) 10 | n_samples, n_features = 20, 10 11 | A = np.random.randn(n_samples, n_features) 12 | w = np.random.randn(n_features) 13 | b = A.dot(w) + np.random.randn(n_samples) 14 | 15 | # we will use a logistic loss, which can't have values 16 | # greater than 1 17 | b = np.abs(b / np.max(np.abs(b))) 18 | 19 | all_solvers = ( 20 | ["TOS", cp.minimize_three_split, 1e-12], 21 |
["PDHG", cp.minimize_primal_dual, 1e-5], 22 | ) 23 | 24 | loss_funcs = [copt.loss.LogLoss, copt.loss.SquareLoss, copt.loss.HuberLoss] 25 | penalty_funcs = [(None, None), (copt.penalty.L1Norm, None), (None, copt.penalty.L1Norm)] 26 | 27 | 28 | def _get_prox(penalty): 29 | if penalty is not None: 30 | prox = penalty(1e-3).prox 31 | else: 32 | prox = None 33 | return prox 34 | 35 | 36 | @pytest.mark.parametrize("name_solver, solver, tol", all_solvers) 37 | @pytest.mark.parametrize("loss", loss_funcs) 38 | @pytest.mark.parametrize("penalty", penalty_funcs) 39 | def test_primal_dual_certificate(name_solver, solver, tol, loss, penalty): 40 | """Test a method on both the backtracking and fixed step size strategy.""" 41 | max_iter = 1000 42 | for alpha in np.logspace(-1, 3, 3): 43 | obj = loss(A, b, alpha) 44 | prox_1 = _get_prox(penalty[0]) 45 | prox_2 = _get_prox(penalty[1]) 46 | trace = cp.utils.Trace(obj) 47 | opt = solver( 48 | obj.f_grad, 49 | np.zeros(n_features), 50 | prox_1=prox_1, 51 | prox_2=prox_2, 52 | tol=1e-12, 53 | max_iter=max_iter, 54 | callback=trace, 55 | ) 56 | assert opt.certificate < tol, name_solver 57 | 58 | opt_2 = solver( 59 | obj.f_grad, 60 | np.zeros(n_features), 61 | prox_1=prox_1, 62 | prox_2=prox_2, 63 | max_iter=max_iter, 64 | tol=1e-12, 65 | line_search=False, 66 | step_size=1.0 / obj.lipschitz, 67 | ) 68 | assert opt.certificate < tol, name_solver 69 | assert opt_2.certificate < tol, name_solver 70 | 71 | 72 | @pytest.mark.parametrize("line_search", [False, True]) 73 | def test_PDHG_Lasso(line_search): 74 | # test the PDHG on a 1d-TV problem where we also 75 | loss = copt.loss.SquareLoss(A, b) 76 | alpha = 0.1 77 | L = np.eye(A.shape[1]) # (np.diag(np.ones(A.shape[1]), k=0))[:-1] 78 | opt1 = copt.minimize_primal_dual( 79 | loss.f_grad, 80 | np.zeros(n_features), 81 | prox_1=None, 82 | prox_2=copt.penalty.L1Norm(alpha).prox, 83 | L=L, 84 | tol=1e-14, 85 | line_search=line_search, 86 | step_size=0.4, 87 | ) 88 | 89 | opt2 = copt.minimize_proximal_gradient( 90 | loss.f_grad, 91 | np.zeros(n_features), 92 | prox=copt.penalty.L1Norm(alpha).prox, 93 | tol=1e-12, 94 | ) 95 | 96 | assert np.linalg.norm(opt1.x - opt2.x) / np.linalg.norm(opt1.x) < 1e-3 97 | 98 | 99 | @pytest.mark.parametrize("line_search", [False, True]) 100 | def test_PDHG_FusedLasso(line_search): 101 | """PDHG on a 1d-TV problem (aka FusedLasso).""" 102 | loss = copt.loss.SquareLoss(A, b) 103 | alpha = 0.1 104 | L = (np.diag(np.ones(A.shape[1]), k=0) - np.diag(np.ones(A.shape[1] - 1), k=1))[:-1] 105 | opt1 = copt.minimize_primal_dual( 106 | loss.f_grad, 107 | np.zeros(n_features), 108 | prox_1=None, 109 | prox_2=copt.penalty.L1Norm(alpha).prox, 110 | L=L, 111 | tol=1e-14, 112 | line_search=line_search, 113 | step_size=0.4, 114 | ) 115 | 116 | opt2 = copt.minimize_proximal_gradient( 117 | loss.f_grad, 118 | np.zeros(n_features), 119 | prox=copt.penalty.FusedLasso(alpha).prox, 120 | tol=1e-12, 121 | ) 122 | 123 | assert np.linalg.norm(opt1.x - opt2.x) / np.linalg.norm(opt1.x) < 1e-3 124 | 125 | 126 | @pytest.mark.parametrize("regularization", np.logspace(-5, 1, 4)) 127 | @pytest.mark.parametrize("line_search", [False, True]) 128 | def test_PDHG_TV2D(regularization, line_search): 129 | """PDHG on a 2d-TV problem.""" 130 | 131 | img = np.random.randn(10, 10) 132 | n_rows, n_cols = img.shape 133 | n_feat = n_rows * n_cols 134 | loss = copt.loss.SquareLoss(np.eye(n_feat), img.ravel()) 135 | 136 | def g_prox(x, gamma, pen=regularization): 137 | return cp.tv_prox.prox_tv1d_cols(gamma * pen, x, n_rows, n_cols) 
138 | 139 | def h_prox(x, gamma, pen=regularization): 140 | return cp.tv_prox.prox_tv1d_rows(gamma * pen, x, n_rows, n_cols) 141 | 142 | opt1 = copt.minimize_primal_dual( 143 | loss.f_grad, 144 | np.zeros(n_feat), 145 | prox_1=g_prox, 146 | prox_2=h_prox, 147 | tol=1e-14, 148 | line_search=line_search, 149 | #step_size=0.4, 150 | ) 151 | 152 | opt2 = copt.minimize_three_split( 153 | loss.f_grad, 154 | np.zeros(n_feat), 155 | prox_1=g_prox, 156 | prox_2=h_prox, 157 | tol=1e-12, 158 | ) 159 | 160 | assert np.linalg.norm(opt1.x - opt2.x) / np.linalg.norm(opt1.x) < 1e-2 161 | -------------------------------------------------------------------------------- /tests/test_stochastic_fw.py: -------------------------------------------------------------------------------- 1 | """Tests for the Stochastic Frank-Wolfe algorithms.""" 2 | import numpy as np 3 | import pytest 4 | from scipy import optimize, sparse 5 | import copt as cp 6 | import copt.constraint 7 | import copt.loss 8 | 9 | np.random.seed(0) 10 | n_samples, n_features = 20, 16 11 | A = np.random.randn(n_samples, n_features) 12 | w = np.random.randn(n_features) 13 | b = A.dot(w) + np.random.randn(n_samples) 14 | 15 | # we will use a logistic loss, which can't have values 16 | # greater than 1 17 | b = np.abs(b / np.max(np.abs(b))) 18 | 19 | LOSS_FUNCS = [copt.loss.LogLoss] 20 | VARIANTS = ['SAGA', 'SAG', 'MHK', 'LF'] 21 | BATCH_SIZES = [1, 10, n_samples] 22 | 23 | 24 | @pytest.mark.parametrize("variant", VARIANTS) 25 | @pytest.mark.parametrize("batch_size", BATCH_SIZES) 26 | def test_fw_api(variant, batch_size): 27 | """Check that SFW algorithms take the right arguments and raise the right exceptions.""" 28 | 29 | # test that the algorithm does not fail if x0 30 | # is a list 31 | f = copt.loss.LogLoss(A, b, 1.0 / n_samples) 32 | cb = cp.utils.Trace(f) 33 | alpha = 1.0 34 | l1ball = copt.constraint.L1Ball(alpha) 35 | cp.randomized.minimize_sfw( 36 | f.partial_deriv, 37 | A, 38 | b, 39 | [0] * n_features, 40 | l1ball.lmo, 41 | batch_size=batch_size, 42 | tol=0, 43 | callback=cb, 44 | variant=variant 45 | ) 46 | 47 | 48 | @pytest.mark.parametrize("variant", VARIANTS) 49 | @pytest.mark.parametrize("alpha", [0.1, 1.0, 10.0, 100.0]) 50 | @pytest.mark.parametrize("loss_grad", LOSS_FUNCS) 51 | def test_sfw_l1(variant, loss_grad, alpha): 52 | """Test SFW algorithms with L1 constraint.""" 53 | f = loss_grad(A, b, 1.0 / n_samples) 54 | cb = cp.utils.Trace(f) 55 | l1ball = copt.constraint.L1Ball(alpha) 56 | opt = cp.randomized.minimize_sfw( 57 | f.partial_deriv, 58 | A, 59 | b, 60 | np.zeros(n_features), 61 | l1ball.lmo, 62 | tol=1e-3, 63 | callback=cb, 64 | variant=variant 65 | ) 66 | 67 | 68 | @pytest.mark.parametrize("variant", VARIANTS) 69 | @pytest.mark.parametrize("alpha", [0.1, 1.0, 10.0, 100.0]) 70 | @pytest.mark.parametrize("loss_grad", LOSS_FUNCS) 71 | def test_sfw_gap_traceback(variant, loss_grad, alpha): 72 | """Test outputting the FW gap for SFW algorithms.""" 73 | f = loss_grad(A, b, 1.0 / n_samples) 74 | l1ball = copt.constraint.L1Ball(alpha) 75 | 76 | def fw_gap(x): 77 | _, grad = f.f_grad(x) 78 | return l1ball.lmo(-grad, x)[0].dot(-grad) 79 | 80 | class TraceGaps(cp.utils.Trace): 81 | def __init__(self, f=None, freq=1): 82 | super(TraceGaps, self).__init__(f, freq) 83 | self.trace_gaps = [] 84 | 85 | def __call__(self, dl): 86 | self.trace_gaps.append(fw_gap(dl['x'])) 87 | super(TraceGaps, self).__call__(dl) 88 | 89 | cb = TraceGaps(f) 90 | 91 | opt = cp.randomized.minimize_sfw( 92 | f.partial_deriv, 93 | A, 94 | b, 95 |
np.zeros(n_features), 96 | l1ball.lmo, 97 | tol=1e-3, 98 | callback=cb, 99 | variant=variant 100 | ) 101 | 102 | 103 | @pytest.mark.parametrize("variant", VARIANTS) 104 | @pytest.mark.parametrize("A", [sparse.random(n_samples, n_features, 0.1, 105 | fmt) 106 | for fmt in ['coo', 'csr', 'csc', 'lil']]) 107 | def test_sfw_sparse(variant, A): 108 | """Check that SFW algorithms run on sparse data matrices.""" 109 | 110 | f = copt.loss.LogLoss(A, b, 1.0 / n_samples) 111 | cb = cp.utils.Trace(f) 112 | alpha = 1.0 113 | l1ball = copt.constraint.L1Ball(alpha) 114 | cp.randomized.minimize_sfw( 115 | f.partial_deriv, 116 | A, 117 | b, 118 | np.zeros(n_features), 119 | l1ball.lmo, 120 | tol=0, 121 | callback=cb, 122 | variant=variant 123 | ) 124 | 125 | --------------------------------------------------------------------------------