├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── fusedmax.png
└── pytorch
    ├── MANIFEST.in
    ├── setup.py
    └── torchsparseattn
        ├── __init__.py
        ├── _fused.pyx
        ├── _fused_jv.pyx
        ├── _isotonic.pyx
        ├── base.py
        ├── fused.py
        ├── isotonic.py
        ├── oscar.py
        ├── sparsemax.py
        ├── test_attention.py
        ├── test_fused.py
        ├── test_oscar.py
        └── test_sparsemax.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | #  Usually these files are written by a python script from a template
29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .ropeproject
90 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | sudo: false
 2 | language: python
 3 | dist: xenial
 4 | 
 5 | python:
 6 |   - "2.7"
 7 |   - "3.7"
 8 | 
 9 | env:
10 |   - TORCH_VERSION=1.0.1
11 |   - TORCH_VERSION=0.4.1
12 | 
13 | cache:
14 |   apt: true
15 |   directories:
16 |     - $HOME/.cache/pip
17 | 
18 | install:
19 | 
20 |   - wget http://repo.continuum.io/miniconda/Miniconda-3.6.0-Linux-x86_64.sh -O miniconda.sh
21 |   - bash miniconda.sh -b -p $HOME/miniconda
22 |   - export PATH="$HOME/miniconda/bin:$PATH"
23 |   - conda config --set always_yes yes --set changeps1 no
24 |   - conda update -q conda
25 |   - hash -r
26 |   - conda info -a
27 |   - conda create -q -n testenv python=$TRAVIS_PYTHON_VERSION
28 |   - source activate testenv
29 |   - conda install -c pytorch pytorch-cpu=$TORCH_VERSION numpy scipy pytest cython
30 | 
31 |   # install package
32 |   - cd pytorch
33 |   - pip install .
34 | 
35 | script:
36 |   - mkdir empty_dir
37 |   - pytest pytest -vs --pyargs torchsparseattn
38 |   - cd .. 
39 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2017, Vlad Niculae
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | * Neither the name of the copyright holder nor the names of its
17 |   contributors may be used to endorse or promote products derived from
18 |   this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Sparse and structured attention mechanisms
 2 | [![Build Status](https://travis-ci.org/vene/sparse-structured-attention.svg?branch=master)](https://travis-ci.org/vene/sparse-structured-attention)
 3 | [![PyPI version](https://badge.fury.io/py/torchsparseattn.svg)](https://badge.fury.io/py/torchsparseattn)
 4 | 
 5 | <p align="center"><img src="fusedmax.png" /></p>
 6 | 
 7 | --------------------------------------------------------------------------------
 8 | 
 9 | Efficient implementation of structured sparsity inducing
10 | attention mechanisms: fusedmax, oscarmax and sparsemax.
11 | 
12 | **Note**: If you are just looking for sparsemax, I recommend the implementation in the [entmax](https://github.com/deep-spin/entmax).
13 | 
14 | Currently available for pytorch >= 0.4.1. (For older versions, use a previous
15 | release of this package.) Requires python >= 2.7, cython, numpy, scipy.
16 | 
17 | Usage example:
18 | 
19 | ```python
20 | 
21 | In [1]: import torch
22 | In [2]: import torchsparseattn
23 | In [3]: a = torch.tensor([1, 2.1, 1.9], dtype=torch.double)
24 | In [4]: lengths = torch.tensor([3])
25 | In [5]: fusedmax = torchsparseattn.Fusedmax(alpha=.1)
26 | In [6]: fusedmax(a, lengths)
27 | Out[6]: tensor([0.0000, 0.5000, 0.5000], dtype=torch.float64)
28 | ```
29 | 
30 | For details, check out our paper:
31 | 
32 | > Vlad Niculae and Mathieu Blondel
33 | > A Regularized Framework for Sparse and Structured Neural Attention
34 | > In: Proceedings of NIPS, 2017. 
35 | > https://arxiv.org/abs/1705.07704 
36 | 
37 | See also:
38 | 
39 | > André F. T. Martins and Ramón Fernandez Astudillo
40 | > From Softmax to Sparsemax: A Sparse Model of Attention and Multi-Label Classification
41 | > In: Proceedings of ICML, 2016
42 | > https://arxiv.org/abs/1602.02068
43 | 
44 | > X. Zeng and M. Figueiredo,
45 | > The ordered weighted L1 norm: Atomic formulation, dual norm, and projections.
46 | > eprint http://arxiv.org/abs/1409.4271
47 | 
48 | 


--------------------------------------------------------------------------------
/fusedmax.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vene/sparse-structured-attention/7003b3deaa513c034c3317a44d5b601f95e9e2c6/fusedmax.png


--------------------------------------------------------------------------------
/pytorch/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include MANIFEST.in
2 | recursive-include torchsparseattn *.c *.h *.cpp *.pyx *.pxd
3 | 


--------------------------------------------------------------------------------
/pytorch/setup.py:
--------------------------------------------------------------------------------
 1 | import numpy
 2 | from setuptools import setup, find_packages, Extension
 3 | 
 4 | from Cython.Build import cythonize
 5 | 
 6 | extensions = [
 7 |     Extension('torchsparseattn._isotonic',
 8 |               ["torchsparseattn/_isotonic.pyx"],
 9 |               include_dirs=[numpy.get_include()]),
10 |     Extension('torchsparseattn._fused',
11 |               ["torchsparseattn/_fused.pyx"],
12 |               include_dirs=[numpy.get_include()]),
13 |     Extension('torchsparseattn._fused_jv',
14 |               ["torchsparseattn/_fused_jv.pyx"]),
15 | ]
16 | 
17 | extensions = cythonize(extensions)
18 | 
19 | 
20 | setup(name="torchsparseattn",
21 |       version="0.3.dev0",
22 |       description="Sparse structured attention mechanisms for pytorch",
23 |       author="Vlad Niculae",
24 |       author_email="vlad@vene.ro",
25 |       license="BSD 3-clause",
26 |       packages=find_packages(),
27 |       ext_modules=extensions,
28 |       install_requires=['numpy'],
29 |       zip_safe=False,
30 |       classifiers=[
31 |           'Intended Audience :: Science/Research',
32 |           'Intended Audience :: Developers', 'License :: OSI Approved',
33 |           'Programming Language :: C', 'Programming Language :: Python',
34 |           'Topic :: Software Development',
35 |           'Topic :: Scientific/Engineering',
36 |           'Operating System :: Microsoft :: Windows',
37 |           'Operating System :: POSIX', 'Operating System :: Unix',
38 |           'Operating System :: MacOS']
39 | )
40 | 


--------------------------------------------------------------------------------
/pytorch/torchsparseattn/__init__.py:
--------------------------------------------------------------------------------
1 | from .fused import Fusedmax, FusedProxFunction
2 | from .oscar import Oscarmax, OscarProxFunction
3 | from .sparsemax import Sparsemax, SparsemaxFunction
4 | 
5 | __version__ = __VERSION__ = '0.3.dev0'
6 | 


--------------------------------------------------------------------------------
/pytorch/torchsparseattn/_fused.pyx:
--------------------------------------------------------------------------------
  1 | # encoding: utf-8
  2 | # cython: cdivision=True
  3 | # cython: boundscheck=False
  4 | # cython: wraparound=False
  5 | #
  6 | # Authors: Fabian Pedregosa
  7 | # Bundled file from lightning library
  8 | 
  9 | """
 10 | These are some helper functions to compute the proximal operator of some common penalties
 11 | """
 12 | 
 13 | cimport numpy as np
 14 | from cython cimport floating
 15 | 
 16 | cpdef prox_tv1d(np.ndarray[ndim=1, dtype=floating] w, floating stepsize):
 17 |     """
 18 |     Computes the proximal operator of the 1-dimensional total variation operator.
 19 | 
 20 |     This solves a problem of the form
 21 | 
 22 |          argmin_x TV(x) + (1/(2 stepsize)) ||x - w||^2
 23 | 
 24 |     where TV(x) is the one-dimensional total variation
 25 | 
 26 |     Parameters
 27 |     ----------
 28 |     w: array
 29 |         vector of coefficieents
 30 |     stepsize: float
 31 |         step size (sometimes denoted gamma) in proximal objective function
 32 | 
 33 |     References
 34 |     ----------
 35 |     Condat, Laurent. "A direct algorithm for 1D total variation denoising."
 36 |     IEEE Signal Processing Letters (2013)
 37 |     """
 38 |     cdef long width, k, k0, kplus, kminus
 39 |     cdef floating umin, umax, vmin, vmax, twolambda, minlambda
 40 |     width = w.size
 41 | 
 42 |     # /to avoid invalid memory access to input[0] and invalid lambda values
 43 |     if width > 0 and stepsize >= 0:
 44 |         k, k0 = 0, 0			# k: current sample location, k0: beginning of current segment
 45 |         umin = stepsize  # u is the dual variable
 46 |         umax = - stepsize
 47 |         vmin = w[0] - stepsize
 48 |         vmax = w[0] + stepsize	  # bounds for the segment's value
 49 |         kplus = 0
 50 |         kminus = 0 	# last positions where umax=-lambda, umin=lambda, respectively
 51 |         twolambda = 2.0 * stepsize  # auxiliary variable
 52 |         minlambda = -stepsize		# auxiliary variable
 53 |         while True:				# simple loop, the exit test is inside
 54 |             while k >= width-1: 	# we use the right boundary condition
 55 |                 if umin < 0.0:			# vmin is too high -> negative jump necessary
 56 |                     while True:
 57 |                         w[k0] = vmin
 58 |                         k0 += 1
 59 |                         if k0 > kminus:
 60 |                             break
 61 |                     k = k0
 62 |                     kminus = k
 63 |                     vmin = w[kminus]
 64 |                     umin = stepsize
 65 |                     umax = vmin + umin - vmax
 66 |                 elif umax > 0.0:    # vmax is too low -> positive jump necessary
 67 |                     while True:
 68 |                         w[k0] = vmax
 69 |                         k0 += 1
 70 |                         if k0 > kplus:
 71 |                             break
 72 |                     k = k0
 73 |                     kplus = k
 74 |                     vmax = w[kplus]
 75 |                     umax = minlambda
 76 |                     umin = vmax + umax -vmin
 77 |                 else:
 78 |                     vmin += umin / (k-k0+1)
 79 |                     while True:
 80 |                         w[k0] = vmin
 81 |                         k0 += 1
 82 |                         if k0 > k:
 83 |                             break
 84 |                     return
 85 |             umin += w[k + 1] - vmin
 86 |             if umin < minlambda:       # negative jump necessary
 87 |                 while True:
 88 |                     w[k0] = vmin
 89 |                     k0 += 1
 90 |                     if k0 > kminus:
 91 |                         break
 92 |                 k = k0
 93 |                 kminus = k
 94 |                 kplus = kminus
 95 |                 vmin = w[kplus]
 96 |                 vmax = vmin + twolambda
 97 |                 umin = stepsize
 98 |                 umax = minlambda
 99 |             else:
100 |                 umax += w[k + 1] - vmax
101 |                 if umax > stepsize:
102 |                     while True:
103 |                         w[k0] = vmax
104 |                         k0 += 1
105 |                         if k0 > kplus:
106 |                             break
107 |                     k = k0
108 |                     kminus = k
109 |                     kplus = kminus
110 |                     vmax = w[kplus]
111 |                     vmin = vmax - twolambda
112 |                     umin = stepsize
113 |                     umax = minlambda
114 |                 else:                   # no jump necessary, we continue
115 |                     k += 1
116 |                     if umin >= stepsize:		# update of vmin
117 |                         kminus = k
118 |                         vmin += (umin - stepsize) / (kminus - k0 + 1)
119 |                         umin = stepsize
120 |                     if umax <= minlambda:	    # update of vmax
121 |                         kplus = k
122 |                         vmax += (umax + stepsize) / (kplus - k0 + 1)
123 |                         umax = minlambda
124 | 


--------------------------------------------------------------------------------
/pytorch/torchsparseattn/_fused_jv.pyx:
--------------------------------------------------------------------------------
 1 | cimport cython
 2 | from cython cimport floating
 3 | 
 4 | 
 5 | @cython.boundscheck(False)
 6 | @cython.wraparound(False)
 7 | @cython.cdivision(True)
 8 | def _inplace_fused_prox_jv(floating[::1] y_hat, floating[::1] dout):
 9 |     cdef Py_ssize_t n_features = dout.shape[0]
10 |     cdef Py_ssize_t i, last_ix
11 |     cdef unsigned int n
12 |     cdef floating acc
13 |     for i in range(n_features + 1):
14 |         if i in (0, n_features) or y_hat[i] != y_hat[i - 1]:
15 |             if i > 0:
16 |                 dout[last_ix:i] = acc / n
17 | 
18 |             if i < n_features:
19 |                 last_ix = i
20 |                 acc = dout[i]
21 |                 n = 1
22 | 
23 |         else:
24 |             acc += dout[i]
25 |             n += 1
26 |     return dout
27 | 
28 | 
29 | 
30 | 


--------------------------------------------------------------------------------
/pytorch/torchsparseattn/_isotonic.pyx:
--------------------------------------------------------------------------------
  1 | # Author: Nelle Varoquaux, Andrew Tulloch, Antony Lee
  2 | 
  3 | # Uses the pool adjacent violators algorithm (PAVA), with the
  4 | # enhancement of searching for the longest decreasing subsequence to
  5 | # pool at each step.
  6 | 
  7 | import numpy as np
  8 | cimport numpy as np
  9 | cimport cython
 10 | from cython cimport floating
 11 | 
 12 | 
 13 | @cython.boundscheck(False)
 14 | @cython.wraparound(False)
 15 | @cython.cdivision(True)
 16 | def _inplace_contiguous_isotonic_regression(floating[::1] y, floating[::1] w):
 17 |     cdef:
 18 |         Py_ssize_t n = y.shape[0], i, k
 19 |         floating prev_y, sum_wy, sum_w
 20 |         Py_ssize_t[::1] target = np.arange(n, dtype=np.intp)
 21 | 
 22 |     # target describes a list of blocks.  At any time, if [i..j] (inclusive) is
 23 |     # an active block, then target[i] := j and target[j] := i.
 24 | 
 25 |     # For "active" indices (block starts):
 26 |     # w[i] := sum{w_orig[j], j=[i..target[i]]}
 27 |     # y[i] := sum{y_orig[j]*w_orig[j], j=[i..target[i]]} / w[i]
 28 | 
 29 |     with nogil:
 30 |         i = 0
 31 |         while i < n:
 32 |             k = target[i] + 1
 33 |             if k == n:
 34 |                 break
 35 |             if y[i] < y[k]:
 36 |                 i = k
 37 |                 continue
 38 |             sum_wy = w[i] * y[i]
 39 |             sum_w = w[i]
 40 |             while True:
 41 |                 # We are within a decreasing subsequence.
 42 |                 prev_y = y[k]
 43 |                 sum_wy += w[k] * y[k]
 44 |                 sum_w += w[k]
 45 |                 k = target[k] + 1
 46 |                 if k == n or prev_y < y[k]:
 47 |                     # Non-singleton decreasing subsequence is finished,
 48 |                     # update first entry.
 49 |                     y[i] = sum_wy / sum_w
 50 |                     w[i] = sum_w
 51 |                     target[i] = k - 1
 52 |                     target[k - 1] = i
 53 |                     if i > 0:
 54 |                         # Backtrack if we can.  This makes the algorithm
 55 |                         # single-pass and ensures O(n) complexity.
 56 |                         i = target[i - 1]
 57 |                     # Otherwise, restart from the same point.
 58 |                     break
 59 |         # Reconstruct the solution.
 60 |         i = 0
 61 |         while i < n:
 62 |             k = target[i] + 1
 63 |             y[i + 1 : k] = y[i]
 64 |             i = k
 65 | 
 66 | 
 67 | @cython.boundscheck(False)
 68 | @cython.wraparound(False)
 69 | @cython.cdivision(True)
 70 | def _make_unique(np.ndarray[dtype=floating] X,
 71 |                  np.ndarray[dtype=floating] y,
 72 |                  np.ndarray[dtype=floating] sample_weights):
 73 |     """Average targets for duplicate X, drop duplicates.
 74 | 
 75 |     Aggregates duplicate X values into a single X value where
 76 |     the target y is a (sample_weighted) average of the individual
 77 |     targets.
 78 | 
 79 |     Assumes that X is ordered, so that all duplicates follow each other.
 80 |     """
 81 |     unique_values = len(np.unique(X))
 82 |     if unique_values == len(X):
 83 |         return X, y, sample_weights
 84 |     cdef np.ndarray[dtype=floating] y_out = np.empty(unique_values)
 85 |     cdef np.ndarray[dtype=floating] x_out = np.empty(unique_values)
 86 |     cdef np.ndarray[dtype=floating] weights_out = np.empty(unique_values)
 87 | 
 88 |     cdef floating current_x = X[0]
 89 |     cdef floating current_y = 0
 90 |     cdef floating current_weight = 0
 91 |     cdef floating y_old = 0
 92 |     cdef int i = 0
 93 |     cdef int current_count = 0
 94 |     cdef int j
 95 |     cdef floating x
 96 |     cdef int n_samples = len(X)
 97 |     for j in range(n_samples):
 98 |         x = X[j]
 99 |         if x != current_x:
100 |             # next unique value
101 |             x_out[i] = current_x
102 |             weights_out[i] = current_weight / current_count
103 |             y_out[i] = current_y / current_weight
104 |             i += 1
105 |             current_x = x
106 |             current_weight = sample_weights[j]
107 |             current_y = y[j] * sample_weights[j]
108 |             current_count = 1
109 |         else:
110 |             current_weight += sample_weights[j]
111 |             current_y += y[j] * sample_weights[j]
112 |             current_count += 1
113 | 
114 |     x_out[i] = current_x
115 |     weights_out[i] = current_weight / current_count
116 |     y_out[i] = current_y / current_weight
117 |     return x_out, y_out, weights_out
118 | 


--------------------------------------------------------------------------------
/pytorch/torchsparseattn/base.py:
--------------------------------------------------------------------------------
 1 | from torch import nn
 2 | from torch import autograd as ta
 3 | 
 4 | class _BaseBatchProjection(ta.Function):
 5 |     """Applies a sample-wise normalizing projection over a batch."""
 6 | 
 7 |     def forward(self, x, lengths=None):
 8 | 
 9 |         requires_squeeze = False
10 |         if x.dim() == 1:
11 |             x = x.unsqueeze(0)
12 |             requires_squeeze = True
13 | 
14 |         n_samples, max_dim = x.size()
15 | 
16 |         has_lengths = True
17 |         if lengths is None:
18 |             has_lengths = False
19 |             lengths = [max_dim] * n_samples
20 | 
21 |         y_star = x.new()
22 |         y_star.resize_as_(x)
23 |         y_star.zero_()
24 | 
25 |         for i in range(n_samples):
26 |             y_star[i, :lengths[i]] = self.project(x[i, :lengths[i]])
27 | 
28 |         if requires_squeeze:
29 |             y_star = y_star.squeeze()
30 | 
31 |         self.mark_non_differentiable(y_star)
32 |         if has_lengths:
33 |             self.mark_non_differentiable(lengths)
34 |             self.save_for_backward(y_star, lengths)
35 |         else:
36 |             self.save_for_backward(y_star)
37 | 
38 |         return y_star
39 | 
40 |     def backward(self, dout):
41 | 
42 |         if not self.needs_input_grad[0]:
43 |             return None
44 | 
45 |         if len(self.needs_input_grad) > 1 and self.needs_input_grad[1]:
46 |             raise ValueError("Cannot differentiate {} w.r.t. the "
47 |                              "sequence lengths".format(self.__name__))
48 | 
49 |         saved = self.saved_tensors
50 |         if len(saved) == 2:
51 |             y_star, lengths = saved
52 |         else:
53 |             y_star, = saved
54 |             lengths = None
55 | 
56 |         requires_squeeze = False
57 |         if y_star.dim() == 1:
58 |             y_star = y_star.unsqueeze(0)
59 |             dout = dout.unsqueeze(0)
60 |             requires_squeeze = True
61 | 
62 |         n_samples, max_dim = y_star.size()
63 |         din = dout.new()
64 |         din.resize_as_(y_star)
65 |         din.zero_()
66 | 
67 |         if lengths is None:
68 |             lengths = [max_dim] * n_samples
69 | 
70 |         for i in range(n_samples):
71 |             din[i, :lengths[i]] = self.project_jv(dout[i, :lengths[i]],
72 |                                                   y_star[i, :lengths[i]])
73 | 
74 |         if requires_squeeze:
75 |             din = din.squeeze()
76 | 
77 |         return din, None
78 | 


--------------------------------------------------------------------------------
/pytorch/torchsparseattn/fused.py:
--------------------------------------------------------------------------------
  1 | """Fusedmax attention
  2 | 
  3 | Clusters neighboring attention weights into groups with equal weight.
  4 | 
  5 | A Regularized Framework for Sparse and Structured Neural Attention
  6 | Vlad Niculae, Mathieu Blondel
  7 | https://arxiv.org/abs/1705.07704
  8 | """
  9 | 
 10 | from __future__ import division
 11 | 
 12 | import torch
 13 | from torch import nn
 14 | from torch import autograd as ta
 15 | import warnings
 16 | 
 17 | from .base import _BaseBatchProjection
 18 | from .sparsemax import SparsemaxFunction
 19 | from ._fused import prox_tv1d
 20 | 
 21 | 
 22 | def _inplace_fused_prox_jv_slow(y_hat, dout):
 23 |     """not efficient in python for long seqs, but template for a cython impl"""
 24 | 
 25 |     n_features = len(dout)
 26 | 
 27 |     for i in range(n_features + 1):
 28 |         if i in (0, n_features) or y_hat[i] != y_hat[i - 1]:
 29 |             if i > 0:
 30 |                 dout[last_ix:i] = acc / n
 31 | 
 32 |             if i < n_features:
 33 |                 last_ix = i
 34 |                 acc = dout[i]
 35 |                 n = 1
 36 |         else:
 37 |             acc += dout[i]
 38 |             n += 1
 39 |     return dout
 40 | 
 41 | 
 42 | try:
 43 |     from ._fused_jv import _inplace_fused_prox_jv
 44 | except ImportError:
 45 |     warnings.warn("Could not import cython implementation of fused backward "
 46 |                   "pass. Slow implementation used instead.")
 47 |     _inplace_fused_prox_jv = _inplace_fused_prox_jv_slow
 48 | 
 49 | 
 50 | def fused_prox_jv_slow(y_hat, dout):
 51 |     dout = dout.clone()
 52 |     _inplace_fused_prox_jv_slow(y_hat, dout)
 53 |     return dout
 54 | 
 55 | 
 56 | def fused_prox_jv_fast(y_hat, dout):
 57 |     dout = dout.clone()
 58 |     _inplace_fused_prox_jv(y_hat.detach().numpy(), dout.numpy())
 59 |     return dout
 60 | 
 61 | 
 62 | class FusedProxFunction(_BaseBatchProjection):
 63 | 
 64 |     def __init__(self, alpha=1):
 65 |         self.alpha = alpha
 66 | 
 67 |     def project(self, x):
 68 |         x_np = x.detach().numpy().copy()
 69 |         prox_tv1d(x_np, self.alpha)
 70 |         y_hat = torch.from_numpy(x_np)
 71 |         return y_hat
 72 | 
 73 |     def project_jv(self, dout, y_hat):
 74 |         dout = dout.clone()
 75 |         _inplace_fused_prox_jv(y_hat.detach().numpy(), dout.numpy())
 76 |         return dout
 77 | 
 78 | 
 79 | class Fusedmax(nn.Module):
 80 |     def __init__(self, alpha=1):
 81 |         self.alpha = alpha
 82 |         super(Fusedmax, self).__init__()
 83 | 
 84 |     def forward(self, x, lengths=None):
 85 |         fused_prox = FusedProxFunction(self.alpha)
 86 |         sparsemax = SparsemaxFunction()
 87 |         return sparsemax(fused_prox(x, lengths), lengths)
 88 | 
 89 | 
 90 | if __name__ == '__main__':
 91 |     from timeit import timeit
 92 |     torch.manual_seed(1)
 93 | 
 94 |     for dim in (5, 10, 50, 100, 500, 1000):
 95 | 
 96 |         x = torch.randn(dim)
 97 |         x_var = ta.Variable(x, requires_grad=True)
 98 |         y_hat = FusedProxFunction()(x_var).data
 99 |         dout = torch.arange(0, dim)
100 |         print("dimension={}".format(dim))
101 |         print("slow", timeit("fused_prox_jv_slow(y_hat, dout)",
102 |                              globals=globals(),
103 |                              number=10000))
104 |         print("fast", timeit("fused_prox_jv_fast(y_hat, dout)",
105 |                              globals=globals(),
106 |                              number=10000))
107 | 


--------------------------------------------------------------------------------
/pytorch/torchsparseattn/isotonic.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Isotonic Regression that preserves 32bit inputs.
 3 | 
 4 | backported from scikit-learn pull request
 5 | https://github.com/scikit-learn/scikit-learn/pull/9106"""
 6 | 
 7 | import numpy as np
 8 | 
 9 | from ._isotonic import _inplace_contiguous_isotonic_regression
10 | 
11 | 
12 | def isotonic_regression(y, sample_weight=None, y_min=None, y_max=None,
13 |                         increasing=True):
14 |     """Solve the isotonic regression model::
15 | 
16 |         min sum w[i] (y[i] - y_[i]) ** 2
17 | 
18 |         subject to y_min = y_[1] <= y_[2] ... <= y_[n] = y_max
19 | 
20 |     where:
21 |         - y[i] are inputs (real numbers)
22 |         - y_[i] are fitted
23 |         - w[i] are optional strictly positive weights (default to 1.0)
24 | 
25 |     Read more in the :ref:`User Guide <isotonic>`.
26 | 
27 |     Parameters
28 |     ----------
29 |     y : iterable of floating-point values
30 |         The data.
31 | 
32 |     sample_weight : iterable of floating-point values, optional, default: None
33 |         Weights on each point of the regression.
34 |         If None, weight is set to 1 (equal weights).
35 | 
36 |     y_min : optional, default: None
37 |         If not None, set the lowest value of the fit to y_min.
38 | 
39 |     y_max : optional, default: None
40 |         If not None, set the highest value of the fit to y_max.
41 | 
42 |     increasing : boolean, optional, default: True
43 |         Whether to compute ``y_`` is increasing (if set to True) or decreasing
44 |         (if set to False)
45 | 
46 |     Returns
47 |     -------
48 |     y_ : list of floating-point values
49 |         Isotonic fit of y.
50 | 
51 |     References
52 |     ----------
53 |     "Active set algorithms for isotonic regression; A unifying framework"
54 |     by Michael J. Best and Nilotpal Chakravarti, section 3.
55 |     """
56 |     order = np.s_[:] if increasing else np.s_[::-1]
57 |     # y = as_float_array(y)  # avoid sklearn dependency; we always pass arrays
58 |     y = np.array(y[order], dtype=y.dtype)
59 |     if sample_weight is None:
60 |         sample_weight = np.ones(len(y), dtype=y.dtype)
61 |     else:
62 |         sample_weight = np.array(sample_weight[order], dtype=y.dtype)
63 | 
64 |     _inplace_contiguous_isotonic_regression(y, sample_weight)
65 |     if y_min is not None or y_max is not None:
66 |         # Older versions of np.clip don't accept None as a bound, so use np.inf
67 |         if y_min is None:
68 |             y_min = -np.inf
69 |         if y_max is None:
70 |             y_max = np.inf
71 |         np.clip(y, y_min, y_max, y)
72 |     return y[order]
73 | 


--------------------------------------------------------------------------------
/pytorch/torchsparseattn/oscar.py:
--------------------------------------------------------------------------------
  1 | """Oscarmax attention
  2 | 
  3 | Clusters attention weights into groups with equal weight, regardless of index.
  4 | 
  5 | A Regularized Framework for Sparse and Structured Neural Attention
  6 | Vlad Niculae, Mathieu Blondel
  7 | https://arxiv.org/abs/1705.07704
  8 | """
  9 | 
 10 | import numpy as np
 11 | import torch
 12 | from torch import nn
 13 | from torch import autograd as ta
 14 | 
 15 | from .isotonic import isotonic_regression
 16 | from .base import _BaseBatchProjection
 17 | from .sparsemax import SparsemaxFunction
 18 | 
 19 | 
 20 | def oscar_prox_jv(y_hat, dout):
 21 |     y_hat = y_hat.detach().numpy()
 22 |     din = dout.clone().zero_()
 23 |     dout = dout.numpy()
 24 |     din_np = din.numpy()
 25 | 
 26 |     sign = np.sign(y_hat)
 27 |     y_hat = np.abs(y_hat)
 28 | 
 29 |     uniq, inv, counts = np.unique(y_hat, return_inverse=True,
 30 |                                   return_counts=True)
 31 |     n_unique = len(uniq)
 32 |     tmp = np.zeros((n_unique,), dtype=y_hat.dtype)
 33 |     np.add.at(tmp, inv, dout * sign)
 34 |     tmp /= counts
 35 |     tmp.take(inv, mode='clip', out=din_np)
 36 |     din_np *= sign
 37 |     return din
 38 | 
 39 | 
 40 | def prox_owl(v, w):
 41 |     """Proximal operator of the OWL norm dot(w, reversed(sort(v)))
 42 | 
 43 |     Follows description and notation from:
 44 |     X. Zeng, M. Figueiredo,
 45 |     The ordered weighted L1 norm: Atomic formulation, dual norm,
 46 |     and projections.
 47 |     eprint http://arxiv.org/abs/1409.4271
 48 |     """
 49 | 
 50 |     # wlog operate on absolute values
 51 |     v_abs = np.abs(v)
 52 |     ix = np.argsort(v_abs)[::-1]
 53 |     v_abs = v_abs[ix]
 54 |     # project to K+ (monotone non-negative decreasing cone)
 55 |     v_abs = isotonic_regression(v_abs - w, y_min=0, increasing=False)
 56 | 
 57 |     # undo the sorting
 58 |     inv_ix = np.zeros_like(ix)
 59 |     inv_ix[ix] = np.arange(len(v))
 60 |     v_abs = v_abs[inv_ix]
 61 | 
 62 |     return np.sign(v) * v_abs
 63 | 
 64 | 
 65 | def _oscar_weights(alpha, beta, size):
 66 |     w = np.arange(size - 1, -1, -1, dtype=np.float32)
 67 |     w *= beta
 68 |     w += alpha
 69 |     return w
 70 | 
 71 | 
 72 | class OscarProxFunction(_BaseBatchProjection):
 73 |     """Proximal operator of the OSCAR regularizer.
 74 | 
 75 |     ||w||_oscar = alpha ||w||_1 + beta * sum_i<j max { |w_i|, |w_j| }
 76 | 
 77 |     Implemented via the OWL norm with appropriate choice of weights, as
 78 |     described in:
 79 | 
 80 |     X. Zeng, M. Figueiredo,
 81 |     The ordered weighted L1 norm: Atomic formulation, dual norm,
 82 |     and projections.
 83 |     eprint http://arxiv.org/abs/1409.4271
 84 | 
 85 |     Backward pass is described in:
 86 |     V. Niculae, M. Blondel,
 87 |     A Regularized Framework for Sparse and Structured Neural Attention.
 88 |     eprint https://arxiv.org/abs/1705.07704
 89 |     """
 90 | 
 91 |     def __init__(self, alpha=0, beta=1):
 92 |         self.alpha = alpha
 93 |         self.beta = beta
 94 | 
 95 |     def project(self, x):
 96 |         x_np = x.detach().numpy().copy()
 97 |         weights = _oscar_weights(self.alpha, self.beta, x_np.shape[0])
 98 |         y_hat_np = prox_owl(x_np, weights)
 99 |         y_hat = torch.from_numpy(y_hat_np)
100 |         return y_hat
101 | 
102 |     def project_jv(self, dout, y_hat):
103 |         return oscar_prox_jv(y_hat, dout)
104 | 
105 | 
106 | class Oscarmax(nn.Module):
107 |     def __init__(self, beta=1):
108 |         self.beta = beta
109 |         super(Oscarmax, self).__init__()
110 | 
111 |     def forward(self, x, lengths=None):
112 |         oscar_prox = OscarProxFunction(beta=self.beta)
113 |         sparsemax = SparsemaxFunction()
114 |         return sparsemax(oscar_prox(x, lengths), lengths)
115 | 
116 | 
117 | if __name__ == '__main__':
118 |     from timeit import timeit
119 |     torch.manual_seed(1)
120 | 
121 |     for dim in (5, 10, 50, 100, 500, 1000):
122 | 
123 |         x = torch.randn(dim)
124 |         x_var = ta.Variable(x, requires_grad=True)
125 | 
126 |         def _run_backward(x):
127 |             y_hat = OscarProxFunction(beta=0.1)(x)
128 |             val = y_hat.mean()
129 |             val.backward()
130 | 
131 |         print("dimension={}".format(dim))
132 |         print("la", timeit("_run_backward(x_var)",
133 |                            globals=globals(),
134 |                            number=10000))
135 | 


--------------------------------------------------------------------------------
/pytorch/torchsparseattn/sparsemax.py:
--------------------------------------------------------------------------------
 1 | # encoding: utf8
 2 | 
 3 | """
 4 | From Softmax to Sparsemax: A Sparse Model of Attention and Multi-Label
 5 | Classification. André F. T. Martins, Ramón Fernandez Astudillo
 6 | In: Proc. of ICML 2016, https://arxiv.org/abs/1602.02068
 7 | """
 8 | 
 9 | from __future__ import division
10 | 
11 | import numpy as np
12 | import torch
13 | from torch import nn
14 | from .base import _BaseBatchProjection
15 | 
16 | 
17 | def project_simplex(v, z=1):
18 |     v_sorted, _ = torch.sort(v, dim=0, descending=True)
19 |     cssv = torch.cumsum(v_sorted, dim=0) - z
20 |     ind = torch.arange(1, 1 + len(v)).to(dtype=v.dtype)
21 |     cond = v_sorted - cssv / ind > 0
22 |     rho = ind.masked_select(cond)[-1]
23 |     tau = cssv.masked_select(cond)[-1] / rho
24 |     w = torch.clamp(v - tau, min=0)
25 |     return w
26 | 
27 | 
28 | def sparsemax_grad(dout, w_star):
29 |     supp = w_star > 0
30 |     masked = dout.masked_select(supp)
31 |     nnz = supp.to(dtype=dout.dtype).sum()
32 |     masked -= masked.sum() / nnz
33 |     out = dout.new(dout.size()).zero_()
34 |     out[supp] = masked
35 |     return(out)
36 | 
37 | 
38 | class SparsemaxFunction(_BaseBatchProjection):
39 | 
40 |     def project(self, x):
41 |         return project_simplex(x)
42 | 
43 |     def project_jv(self, dout, y_star):
44 |         return sparsemax_grad(dout, y_star)
45 | 
46 | 
47 | class Sparsemax(nn.Module):
48 | 
49 |     def forward(self, x, lengths=None):
50 |         sparsemax = SparsemaxFunction()
51 |         return sparsemax(x, lengths)
52 | 
53 | 


--------------------------------------------------------------------------------
/pytorch/torchsparseattn/test_attention.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | import torch
 4 | from torch import nn
 5 | from torch.autograd import Variable
 6 | 
 7 | from . import Sparsemax, Fusedmax, Oscarmax
 8 | 
 9 | 
10 | class AttentionRegressor(nn.Module):
11 | 
12 |     def __init__(self, projection, n_features=100):
13 |         super(AttentionRegressor, self).__init__()
14 |         self.projection = projection
15 |         self.attn_template = nn.Parameter(torch.Tensor(n_features))
16 |         self.attn_template.data.uniform_(-0.1, 0.1)
17 | 
18 |     def forward(self, X, lengths):
19 | 
20 |         # compute scores for each input word
21 |         scores = torch.matmul(X, self.attn_template)
22 |         weights = self.projection(scores, lengths)
23 |         weighted_avg = torch.bmm(X.transpose(1, 2),
24 |                                  weights.unsqueeze(-1)).squeeze(-1)
25 |         pred = weighted_avg.sum(dim=1)  # very simple prediction rule
26 |         return pred
27 | 
28 | 
29 | @pytest.mark.parametrize('projection', [Sparsemax(),
30 |                                         Fusedmax(0.1),
31 |                                         Oscarmax(0.01)])
32 | def test_attention(projection):
33 |     n_samples = 20
34 |     max_len = 10
35 |     torch.manual_seed(1)
36 |     n_features = 50
37 | 
38 |     X = torch.zeros(n_samples, max_len, n_features)
39 | 
40 |     # generate lengths in [1, max_len]
41 |     lengths = 1 + (torch.rand(n_samples) * max_len).long()
42 | 
43 |     for i in range(n_samples):
44 |         X[i, :lengths[i], :] = torch.randn(lengths[i], n_features)
45 | 
46 |     X = Variable(X)
47 |     lengths = Variable(lengths)
48 |     targets = Variable(torch.randn(n_samples))
49 | 
50 |     regr = AttentionRegressor(projection, n_features=n_features)
51 |     loss_func = nn.MSELoss()
52 |     optim = torch.optim.SGD(regr.parameters(), lr=0.0001)
53 | 
54 |     pred = regr(X, lengths)
55 | 
56 |     init_obj = loss_func(pred, targets)
57 | 
58 |     for it in range(50):
59 |         optim.zero_grad()
60 |         pred = regr(X, lengths)
61 |         obj = loss_func(pred, targets)
62 |         obj.backward()
63 |         optim.step()
64 | 
65 |     final_obj = obj
66 |     assert final_obj < init_obj
67 |     assert regr.attn_template.grad.size() == (n_features,)
68 | 


--------------------------------------------------------------------------------
/pytorch/torchsparseattn/test_fused.py:
--------------------------------------------------------------------------------
 1 | from __future__ import division
 2 | 
 3 | import pytest
 4 | from numpy.testing import assert_allclose
 5 | import torch
 6 | from torch.autograd import gradcheck, Variable
 7 | 
 8 | from .fused import fused_prox_jv_slow, fused_prox_jv_fast
 9 | from .fused import FusedProxFunction
10 | 
11 | 
12 | def _fused_prox_jacobian(y_hat, dout=None):
13 |     """reference naive implementation: construct the jacobian"""
14 |     dim = y_hat.shape[0]
15 |     groups = torch.zeros(dim)
16 |     J = torch.zeros(dim, dim)
17 |     current_group = 0
18 | 
19 |     for i in range(1, dim):
20 |         if y_hat[i] == y_hat[i - 1]:
21 |             groups[i] = groups[i - 1]
22 |         else:
23 |             current_group += 1
24 |             groups[i] = current_group
25 | 
26 |     for i in range(dim):
27 |         for j in range(dim):
28 |             if groups[i] == groups[j]:
29 |                 n_fused = (groups == groups[i]).sum()
30 |                 J[i, j] = 1 / n_fused.to(y_hat.dtype)
31 | 
32 |     if dout is not None:
33 |         return torch.mv(J, dout)
34 |     else:
35 |         return J
36 | 
37 | 
38 | @pytest.mark.parametrize('alpha', [0.001, 0.01, 0.1, 1])
39 | def test_jv(alpha):
40 | 
41 |     torch.manual_seed(1)
42 |     torch.set_default_tensor_type('torch.DoubleTensor')
43 | 
44 |     for _ in range(30):
45 |         x = Variable(torch.randn(15))
46 |         dout = torch.randn(15)
47 | 
48 |         y_hat = FusedProxFunction(alpha=alpha)(x).data
49 | 
50 | 
51 |         ref = _fused_prox_jacobian(y_hat, dout)
52 |         din_slow = fused_prox_jv_slow(y_hat, dout)
53 |         din_fast = fused_prox_jv_fast(y_hat, dout)
54 |         assert_allclose(ref.numpy(), din_slow.numpy(), atol=1e-5)
55 |         assert_allclose(ref.numpy(), din_fast.numpy(), atol=1e-5)
56 | 
57 | 
58 | @pytest.mark.parametrize('alpha', [0.001, 0.01, 0.1, 1])
59 | def test_finite_diff(alpha):
60 |     torch.manual_seed(1)
61 |     torch.set_default_tensor_type('torch.DoubleTensor')
62 | 
63 |     for _ in range(30):
64 |         x = Variable(torch.randn(20), requires_grad=True)
65 |         func = FusedProxFunction(alpha=alpha)
66 |         assert gradcheck(func, (x,), eps=1e-4, atol=1e-3)
67 | 


--------------------------------------------------------------------------------
/pytorch/torchsparseattn/test_oscar.py:
--------------------------------------------------------------------------------
 1 | from __future__ import division
 2 | 
 3 | import pytest
 4 | from numpy.testing import assert_allclose
 5 | import numpy as np
 6 | import torch
 7 | from torch.autograd import gradcheck, Variable
 8 | 
 9 | from .oscar import OscarProxFunction, oscar_prox_jv
10 | 
11 | 
12 | def _oscar_prox_jacobian(y_star, dout=None):
13 |     y_star = y_star.numpy()
14 |     dim = y_star.shape[0]
15 |     J = torch.zeros(dim, dim)
16 | 
17 |     _, inv, counts = np.unique(np.abs(y_star),
18 |                                return_inverse=True,
19 |                                return_counts=True)
20 | 
21 |     for i in range(dim):
22 |         for j in range(dim):
23 |             if (inv[i] == inv[j] and
24 |                     y_star[i] != 0):
25 |                 J[i, j] = (np.sign(y_star[i]) * np.sign(y_star[j])
26 |                            / counts[inv[i]])
27 |     if dout is not None:
28 |         return torch.mv(J, dout)
29 |     else:
30 |         return J
31 | 
32 | 
33 | @pytest.mark.parametrize('alpha', [0.001, 0.01, 0.1, 1])
34 | @pytest.mark.parametrize('beta', [0.001, 0.01, 0.1, 1])
35 | def test_jv(alpha, beta):
36 | 
37 |     torch.manual_seed(1)
38 |     torch.set_default_tensor_type('torch.DoubleTensor')
39 | 
40 |     for _ in range(30):
41 |         x = Variable(torch.randn(15))
42 |         dout = torch.randn(15)
43 |         y_hat = OscarProxFunction(alpha=alpha, beta=beta)(x).data
44 | 
45 |         ref = _oscar_prox_jacobian(y_hat, dout)
46 |         din = oscar_prox_jv(y_hat, dout)
47 |         assert_allclose(ref.numpy(), din.numpy(), atol=1e-5)
48 | 
49 | 
50 | @pytest.mark.parametrize('alpha', [0.001, 0.01, 0.1, 1])
51 | @pytest.mark.parametrize('beta', [0.001, 0.01, 0.1, 1])
52 | def test_finite_diff(alpha, beta):
53 |     torch.manual_seed(1)
54 |     torch.set_default_tensor_type('torch.DoubleTensor')
55 | 
56 |     for _ in range(30):
57 |         x = Variable(torch.randn(20), requires_grad=True)
58 |         func = OscarProxFunction(alpha, beta=beta)
59 |         assert gradcheck(func, (x,), eps=1e-5, atol=1e-3)
60 | 


--------------------------------------------------------------------------------
/pytorch/torchsparseattn/test_sparsemax.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from torch.autograd import gradcheck, Variable
 3 | from .sparsemax import SparsemaxFunction
 4 | 
 5 | 
 6 | def test_sparsemax():
 7 | 
 8 |     torch.manual_seed(1)
 9 |     torch.set_default_tensor_type('torch.DoubleTensor')
10 | 
11 |     for _ in range(30):
12 |         func = SparsemaxFunction()
13 |         x = Variable(torch.randn(20), requires_grad=True)
14 |         assert gradcheck(func, (x,), eps=1e-4, atol=1e-3)
15 | 


--------------------------------------------------------------------------------