├── .Rbuildignore ├── .github ├── .gitignore └── workflows │ ├── R-CMD-check.yaml │ ├── pkgdown.yaml │ ├── pr-commands.yaml │ └── test-coverage.yaml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── DESCRIPTION ├── NAMESPACE ├── NEWS.md ├── R ├── adabelief.R ├── adabound.R ├── adahessian.R ├── adamw.R ├── madgrad.R ├── nadam.R ├── qhadam.R ├── radam.R ├── swats.R ├── torchopt-package.R ├── utils-state.R ├── utils-testopt.R └── yogi.R ├── README.Rmd ├── README.md ├── codecov.yml ├── inst └── WORDLIST ├── man ├── figures │ ├── README-chunk-label-.gif │ ├── README-chunk-label-1.jpeg │ ├── README-chunk-label-10.jpeg │ ├── README-chunk-label-2.jpeg │ ├── README-chunk-label-3.jpeg │ ├── README-chunk-label-4.jpeg │ ├── README-chunk-label-5.jpeg │ ├── README-chunk-label-6.jpeg │ ├── README-chunk-label-7.jpeg │ ├── README-chunk-label-8.jpeg │ ├── README-chunk-label-9.jpeg │ ├── README-gif_opt-.gif │ ├── README-opt_fun-1.png │ ├── README-pressure-1.png │ ├── README-test_adabelief-.gif │ ├── README-test_adabound-.gif │ ├── README-test_adahessian-.gif │ ├── README-test_adamw-.gif │ ├── README-test_madgrad-.gif │ ├── README-test_nadam-.gif │ ├── README-test_qhadam-.gif │ ├── README-test_radam-.gif │ ├── README-test_swats-.gif │ └── README-test_yogi-.gif ├── optim_adabelief.Rd ├── optim_adabound.Rd ├── optim_adahessian.Rd ├── optim_adamw.Rd ├── optim_madgrad.Rd ├── optim_nadam.Rd ├── optim_qhadam.Rd ├── optim_radam.Rd ├── optim_swats.Rd ├── optim_yogi.Rd ├── state-set.Rd ├── state.Rd ├── test_optim.Rd └── torchopt-package.Rd ├── tests ├── testthat.R └── testthat │ ├── test-optimizers.R │ └── test-utils-testopt.R └── torchopt.Rproj /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^README\.md$ 2 | ^README\.Rmd$ 3 | ^README_cache$ 4 | ^.*\.Rproj$ 5 | ^\.Rproj\.user$ 6 | ^codecov\.yml$ 7 | ^\.github$ 8 | ^CODE_OF_CONDUCT\.md$ 9 | ^contributing.md$ 10 | ^README_cache$ 11 | ^LICENSE\.md$ 12 | ^README_cache$ 13 | ^man/figures/*$ 14 | ^\.Rprofile$ 15 | ^\.RData$ 16 | ^\.tmp$ 17 | ^CRAN-SUBMISSION$ 18 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/master/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | # 4 | # NOTE: This workflow is overkill for most R packages and 5 | # check-standard.yaml is likely a better choice. 6 | # usethis::use_github_action("check-standard") will install it. 
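# TORCH_TEST=1 and TORCH_INSTALL=1 in the env section below are read by the
# 'torch' R package: TORCH_INSTALL lets it fetch its LibTorch backend
# non-interactively on the CI runner, and TORCH_TEST enables the tests that
# need that backend (assumed behaviour, based on common torch CI setups).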
7 | on: 8 | push: 9 | branches: [main, master] 10 | pull_request: 11 | branches: [main, master] 12 | 13 | name: R-CMD-check 14 | 15 | jobs: 16 | R-CMD-check: 17 | runs-on: ${{ matrix.config.os }} 18 | 19 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 20 | 21 | strategy: 22 | fail-fast: false 23 | matrix: 24 | config: 25 | - {os: macOS-latest, r: 'release'} 26 | - {os: windows-latest, r: 'release'} 27 | 28 | # Use older ubuntu to maximise backward compatibility 29 | - {os: ubuntu-18.04, r: 'devel', http-user-agent: 'release'} 30 | - {os: ubuntu-18.04, r: 'release'} 31 | 32 | 33 | env: 34 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 35 | R_KEEP_PKG_SOURCE: yes 36 | TORCH_TEST: 1 37 | TORCH_INSTALL: 1 38 | 39 | steps: 40 | - uses: actions/checkout@v2 41 | 42 | - uses: r-lib/actions/setup-pandoc@v1 43 | 44 | - uses: r-lib/actions/setup-r@v2 45 | with: 46 | r-version: ${{ matrix.config.r }} 47 | 48 | - uses: r-lib/actions/setup-r-dependencies@v2 49 | with: 50 | extra-packages: any::rcmdcheck 51 | needs: check 52 | 53 | - uses: r-lib/actions/check-r-package@v2 54 | with: 55 | args: 'c("--no-multiarch", "--no-manual")' 56 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/master/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | release: 7 | types: [published] 8 | workflow_dispatch: 9 | 10 | name: pkgdown 11 | 12 | jobs: 13 | pkgdown: 14 | runs-on: ubuntu-latest 15 | env: 16 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 17 | TORCH_INSTALL: 1 18 | steps: 19 | - uses: actions/checkout@v2 20 | 21 | - uses: r-lib/actions/setup-pandoc@v2 22 | 23 | - uses: r-lib/actions/setup-r@v2 24 | with: 25 | use-public-rspm: true 26 | 27 | - uses: r-lib/actions/setup-r-dependencies@v2 28 | with: 29 | extra-packages: any::pkgdown 30 | needs: website 31 | 32 | - name: Deploy package 33 | run: | 34 | git config --local user.name "$GITHUB_ACTOR" 35 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" 36 | Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)' 37 | -------------------------------------------------------------------------------- /.github/workflows/pr-commands.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/master/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | issue_comment: 5 | types: [created] 6 | 7 | name: Commands 8 | 9 | jobs: 10 | document: 11 | if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/document') }} 12 | name: document 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | steps: 17 | - uses: actions/checkout@v2 18 | 19 | - uses: r-lib/actions/pr-fetch@v1 20 | with: 21 | repo-token: ${{ secrets.GITHUB_TOKEN }} 22 | 23 | - uses: r-lib/actions/setup-r@v1 24 | with: 25 | use-public-rspm: true 26 | 27 | - uses: r-lib/actions/setup-r-dependencies@v1 28 | with: 29 | extra-packages: roxygen2 30 | 31 | - name: Document 32 | run: Rscript -e 'roxygen2::roxygenise()' 33 | 34 | - name: commit 35 | run: | 36 | git config --local user.name "$GITHUB_ACTOR" 37 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" 38 | git add man/\* NAMESPACE 39 | git commit -m 'Document' 40 | 41 | - uses: r-lib/actions/pr-push@v1 42 | with: 43 | repo-token: ${{ secrets.GITHUB_TOKEN }} 44 | 45 | style: 46 | if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/style') }} 47 | name: style 48 | runs-on: ubuntu-latest 49 | env: 50 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 51 | steps: 52 | - uses: actions/checkout@v2 53 | 54 | - uses: r-lib/actions/pr-fetch@v1 55 | with: 56 | repo-token: ${{ secrets.GITHUB_TOKEN }} 57 | 58 | - uses: r-lib/actions/setup-r@v1 59 | 60 | - name: Install dependencies 61 | run: Rscript -e 'install.packages("styler")' 62 | 63 | - name: Style 64 | run: Rscript -e 'styler::style_pkg()' 65 | 66 | - name: commit 67 | run: | 68 | git config --local user.name "$GITHUB_ACTOR" 69 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" 70 | git add \*.R 71 | git commit -m 'Style' 72 | 73 | - uses: r-lib/actions/pr-push@v1 74 | with: 75 | repo-token: ${{ secrets.GITHUB_TOKEN }} 76 | -------------------------------------------------------------------------------- /.github/workflows/test-coverage.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/master/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: test-coverage 10 | 11 | jobs: 12 | test-coverage: 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | TORCH_INSTALL: 1 17 | TORCH_TEST: 1 18 | 19 | steps: 20 | - uses: actions/checkout@v2 21 | 22 | - uses: r-lib/actions/setup-r@v2 23 | 24 | - uses: r-lib/actions/setup-r-dependencies@v2 25 | with: 26 | extra-packages: any::covr 27 | 28 | - name: Test coverage 29 | run: covr::codecov() 30 | shell: Rscript {0} 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Session Data files 6 | .RData 7 | 8 | # User-specific files 9 | .Ruserdata 10 | 11 | # Example code in package build process 12 | *-Ex.R 13 | 14 | # Output files from R CMD build 15 | /*.tar.gz 16 | 17 | # Output files from R CMD check 18 | /*.Rcheck/ 19 | 20 | # RStudio files 21 | .Rproj.user/ 22 | .Rprofile 23 | 24 | # produced vignettes 25 | vignettes/*.html 26 | vignettes/*.pdf 27 | 28 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 29 | .httr-oauth 30 | 31 | # knitr and R markdown default cache directories 32 | *_cache/ 33 | /cache/ 34 | README_cache/* 35 | README_files/* 36 | 37 | # Temporary files created by R markdown 38 | *.utf8.md 39 | *.knit.md 40 | 41 | # R Environment Variables 42 | .Renviron 43 | 44 | # CRAN comments 45 | cran-comments.md 46 | CRAN-SUBMISSION 47 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity and 10 | orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the overall 26 | community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards 42 | of acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies 54 | when an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail 56 | address, posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at rolf.simoes@inpe.br. 63 | All complaints will be reviewed and investigated promptly and fairly. 64 | 65 | All community leaders are obligated to respect the privacy and security of the 66 | reporter of any incident. 67 | 68 | ## Enforcement Guidelines 69 | 70 | Community leaders will follow these Community Impact Guidelines in determining 71 | the consequences for any action they deem in violation of this Code of Conduct: 72 | 73 | ### 1. Correction 74 | 75 | **Community Impact**: Use of inappropriate language or other behavior deemed 76 | unprofessional or unwelcome in the community. 77 | 78 | **Consequence**: A private, written warning from community leaders, providing 79 | clarity around the nature of the violation and an explanation of why the 80 | behavior was inappropriate. A public apology may be requested. 81 | 82 | ### 2. Warning 83 | 84 | **Community Impact**: A violation through a single incident or series of 85 | actions. 86 | 87 | **Consequence**: A warning with consequences for continued behavior. No 88 | interaction with the people involved, including unsolicited interaction with 89 | those enforcing the Code of Conduct, for a specified period of time. This 90 | includes avoiding interactions in community spaces as well as external channels 91 | like social media. 
Violating these terms may lead to a temporary or permanent 92 | ban. 93 | 94 | ### 3. Temporary Ban 95 | 96 | **Community Impact**: A serious violation of community standards, including 97 | sustained inappropriate behavior. 98 | 99 | **Consequence**: A temporary ban from any sort of interaction or public 100 | communication with the community for a specified period of time. No public or 101 | private interaction with the people involved, including unsolicited interaction 102 | with those enforcing the Code of Conduct, is allowed during this period. 103 | Violating these terms may lead to a permanent ban. 104 | 105 | ### 4. Permanent Ban 106 | 107 | **Community Impact**: Demonstrating a pattern of violation of community 108 | standards, including sustained inappropriate behavior, harassment of an 109 | individual, or aggression toward or disparagement of classes of individuals. 110 | 111 | **Consequence**: A permanent ban from any sort of public interaction within the 112 | community. 113 | 114 | ## Attribution 115 | 116 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 117 | version 2.0, 118 | available at . 119 | 120 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 121 | enforcement ladder](https://github.com/mozilla/diversity). 122 | 123 | [homepage]: https://www.contributor-covenant.org 124 | 125 | For answers to common questions about this code of conduct, see the FAQ at 126 | . Translations are available at . 127 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Type: Package 2 | Package: torchopt 3 | Title: Advanced Optimizers for Torch 4 | Version: 0.1.4 5 | Authors@R: c( 6 | person("Gilberto", "Camara", , "gilberto.camara.inpe@gmail.com", role = c("aut", "cre")), 7 | person("Rolf", "Simoes", , "rolf.simoes@inpe.br", role = "aut"), 8 | person("Daniel", "Falbel", , "daniel.falbel@gmail.com", role = "aut"), 9 | person("Felipe", "Souza", , "felipe.carvalho@inpe.br", role = "aut") 10 | ) 11 | Maintainer: Gilberto Camara 12 | Description: Optimizers for 'torch' deep learning library. These 13 | functions include recent results published in the literature and are 14 | not part of the optimizers offered in 'torch'. Prospective users 15 | should test these optimizers with their data, since performance 16 | depends on the specific problem being solved. The packages includes 17 | the following optimizers: (a) 'adabelief' by Zhuang et al (2020), 18 | ; (b) 'adabound' by Luo et al.(2019), 19 | ; (c) 'adahessian' by Yao et al.(2021) 20 | ; (d) 'adamw' by Loshchilov & Hutter (2019), 21 | ; (e) 'madgrad' by Defazio and Jelassi (2021), 22 | ; (f) 'nadam' by Dozat (2019), 23 | ; (g) 'qhadam' by 24 | Ma and Yarats(2019), ; (h) 'radam' by Liu et al. 25 | (2019), ; (i) 'swats' by Shekar and Sochee (2018), 26 | ; (j) 'yogi' by Zaheer et al.(2019), 27 | . 
28 | License: Apache License (>= 2) 29 | URL: https://github.com/e-sensing/torchopt/ 30 | Depends: 31 | R (>= 4.0.0) 32 | Imports: 33 | graphics, 34 | grDevices, 35 | stats, 36 | torch 37 | Suggests: 38 | testthat 39 | ByteCompile: true 40 | Encoding: UTF-8 41 | Language: en-US 42 | Roxygen: list(markdown = TRUE) 43 | RoxygenNote: 7.2.0 44 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(optim_adabelief) 4 | export(optim_adabound) 5 | export(optim_adahessian) 6 | export(optim_adamw) 7 | export(optim_madgrad) 8 | export(optim_nadam) 9 | export(optim_qhadam) 10 | export(optim_radam) 11 | export(optim_swats) 12 | export(optim_yogi) 13 | export(test_optim) 14 | importFrom(grDevices,hcl.colors) 15 | importFrom(graphics,contour) 16 | importFrom(graphics,image) 17 | importFrom(graphics,lines) 18 | importFrom(graphics,points) 19 | importFrom(stats,runif) 20 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # R Implementation of Advanced Optimizers for Torch 2 | 3 | ### What's new in torchopt version 0.1.4 4 | 5 | * fixed errors in DESCRIPTION 6 | 7 | ### What's new in torchopt version 0.1.2 8 | 9 | * adahessian optimizer 10 | * nadam optimizer 11 | * radam optimizer 12 | * qhadam optimizer 13 | * swats optimizer 14 | 15 | ### What's new in torchopt version 0.1.1 16 | 17 | * adabelief optimizer 18 | * adabound optimizer 19 | * adamw optimizer 20 | * madgrad optimizer 21 | * yogi optimizer 22 | 23 | -------------------------------------------------------------------------------- /R/adabelief.R: -------------------------------------------------------------------------------- 1 | #' @title Adabelief optimizer 2 | #' 3 | #' @name optim_adabelief 4 | #' 5 | #' @author Gilberto Camara, \email{gilberto.camara@@inpe.br} 6 | #' @author Rolf Simoes, \email{rolf.simoes@@inpe.br} 7 | #' @author Felipe Souza, \email{lipecaso@@gmail.com} 8 | #' @author Alber Sanchez, \email{alber.ipia@@inpe.br} 9 | #' 10 | #' @description 11 | #' R implementation of the adabelief optimizer proposed 12 | #' by Zhuang et al (2020). We used the pytorch implementation 13 | #' developed by the authors which is available at 14 | #' https://github.com/jettify/pytorch-optimizer. 15 | #' Thanks to Nikolay Novik of his work on python optimizers. 16 | #' 17 | #' The original implementation is licensed using the Apache-2.0 software license. 18 | #' This implementation is also licensed using Apache-2.0 license. 19 | #' 20 | #' From the abstract by the paper by Zhuang et al (2021): 21 | #' We propose Adabelief to simultaneously achieve three goals: 22 | #' fast convergence as in adaptive methods, good generalization as in SGD, 23 | #' and training stability. The intuition for AdaBelief is to adapt 24 | #' the stepsize according to the "belief" in the current gradient direction. 25 | #' Viewing the exponential moving average of the noisy gradient 26 | #' as the prediction of the gradient at the next time step, 27 | #' if the observed gradient greatly deviates from the prediction, 28 | #' we distrust the current observation and take a small step; 29 | #' if the observed gradient is close to the prediction, 30 | #' we trust it and take a large step. 
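#'
#' In terms of the update implemented below, the second-moment estimate is an
#' exponential moving average of the squared prediction error (g_t - m_t)^2
#' rather than of g_t^2, and the bias-corrected first moment m_t is divided by
#' the square root of that estimate (plus eps); when `rectify = TRUE`, the
#' RAdam-style rectified step size is applied on top of this.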
31 | 32 | #' @references 33 | #' Juntang Zhuang, Tommy Tang, Yifan Ding, Sekhar Tatikonda, 34 | #' Nicha Dvornek, Xenophon Papademetris, James S. Duncan. 35 | #' "Adabelief Optimizer: Adapting Stepsizes by the Belief in Observed Gradients", 36 | #' 34th Conference on Neural Information Processing Systems (NeurIPS 2020), 37 | #' Vancouver, Canada. 38 | #' https://arxiv.org/abs/2010.07468 39 | #' 40 | #' @param params List of parameters to optimize. 41 | #' @param lr Learning rate (default: 1e-3) 42 | #' @param betas Coefficients for computing running averages 43 | #' of gradient and its square (default: (0.9, 0.999)) 44 | #' @param eps Term added to the denominator to improve numerical 45 | #' stability (default: 1e-16) 46 | #' @param weight_decay Weight decay (L2 penalty) (default: 0) 47 | #' @param weight_decouple Use decoupled weight decay as is done in AdamW? 48 | #' @param fixed_decay This is used when weight_decouple is set as True. 49 | #' When fixed_decay == True, weight decay is 50 | #' W_new = W_old - W_old * decay. 51 | #' When fixed_decay == False, the weight decay is 52 | #' W_new = W_old - W_old * decay * learning_rate. 53 | #' In this case, weight decay decreases with learning rate. 54 | #' @param rectify Perform the rectified update similar to RAdam? 55 | #' 56 | #' @returns 57 | #' A torch optimizer object implementing the `step` method. 58 | #' 59 | #' @examples 60 | #' if (torch::torch_is_installed()) { 61 | 62 | #' # function to demonstrate optimization 63 | #' beale <- function(x, y) { 64 | #' log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 65 | #' } 66 | #' # define optimizer 67 | #' optim <- torchopt::optim_adabelief 68 | #' # define hyperparams 69 | #' opt_hparams <- list(lr = 0.01) 70 | #' 71 | #' # starting point 72 | #' x0 <- 3 73 | #' y0 <- 3 74 | #' # create tensor 75 | #' x <- torch::torch_tensor(x0, requires_grad = TRUE) 76 | #' y <- torch::torch_tensor(y0, requires_grad = TRUE) 77 | #' # instantiate optimizer 78 | #' optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 79 | #' # run optimizer 80 | #' steps <- 400 81 | #' x_steps <- numeric(steps) 82 | #' y_steps <- numeric(steps) 83 | #' for (i in seq_len(steps)) { 84 | #' x_steps[i] <- as.numeric(x) 85 | #' y_steps[i] <- as.numeric(y) 86 | #' optim$zero_grad() 87 | #' z <- beale(x, y) 88 | #' z$backward() 89 | #' optim$step() 90 | #' } 91 | #' print(paste0("starting value = ", beale(x0, y0))) 92 | #' print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 93 | #' } 94 | #' @export 95 | optim_adabelief <- torch::optimizer( 96 | "optim_adabelief", 97 | initialize = function(params, 98 | lr = 0.001, 99 | betas = c(0.9, 0.999), 100 | eps = 1.0e-08, 101 | weight_decay = 1.0e-06, 102 | weight_decouple = TRUE, 103 | fixed_decay = FALSE, 104 | rectify = TRUE) { 105 | if (lr <= 0.0) 106 | stop("Learning rate must be positive.", call. = FALSE) 107 | if (eps < 0.0) 108 | stop("eps must be non-negative.", call. = FALSE) 109 | if (betas[1] > 1.0 | betas[1] <= 0.0) 110 | stop("Invalid beta parameter.", call. = FALSE) 111 | if (betas[2] > 1.0 | betas[1] <= 0.0) 112 | stop("Invalid beta parameter.", call. = FALSE) 113 | if (weight_decay < 0) 114 | stop("Invalid weight_decay value.", call. 
= FALSE) 115 | 116 | 117 | defaults = list( 118 | lr = lr, 119 | betas = betas, 120 | eps = eps, 121 | weight_decay = weight_decay 122 | ) 123 | super$initialize(params, defaults) 124 | 125 | self$weight_decouple <- weight_decouple 126 | self$rectify <- rectify 127 | self$fixed_decay <- fixed_decay 128 | }, 129 | step = function(closure = NULL){ 130 | loop_fun <- function(group, param, g, p) { 131 | if (is.null(param$grad)) 132 | next 133 | grad <- param$grad 134 | 135 | # Variable initialization 136 | beta1 <- group[['betas']][[1]] 137 | beta2 <- group[['betas']][[2]] 138 | weight_decay <- group[['weight_decay']] 139 | eps <- group[["eps"]] 140 | lr <- group[['lr']] 141 | 142 | # State initialization 143 | if (length(state(param)) == 0) { 144 | state(param) <- list() 145 | state(param)[["rho_inf"]] <- 2.0 / (1.0 - beta2) - 1.0 146 | state(param)[["step"]] <- 0 147 | # Exponential moving average of gradient values 148 | state(param)[["exp_avg"]] <- torch::torch_zeros_like(param) 149 | # Exponential moving average of squared gradient values 150 | state(param)[["exp_avg_var"]] <- torch::torch_zeros_like(param) 151 | } 152 | # Define variables for optimization function 153 | exp_avg <- state(param)[["exp_avg"]] 154 | exp_avg_var <- state(param)[["exp_avg_var"]] 155 | 156 | 157 | # take one step 158 | state(param)[["step"]] <- state(param)[["step"]] + 1 159 | # bias correction 160 | bias_correction1 <- 1 - beta1^state(param)[['step']] 161 | bias_correction2 <- 1 - beta2^state(param)[['step']] 162 | 163 | # perform weight decay, check if decoupled weight decay 164 | if (self$weight_decouple) { 165 | if (!self$fixed_decay) 166 | param$mul_(1.0 - lr * weight_decay) 167 | else 168 | param$mul_(1.0 - weight_decay) 169 | } else { 170 | if (weight_decay != 0) 171 | grad$add_(param, alpha = weight_decay) 172 | } 173 | # update the first moment 174 | exp_avg$mul_(beta1)$add_(grad, alpha = 1 - beta1) 175 | grad_residual <- grad - exp_avg 176 | # Decay the second moment 177 | exp_avg_var$mul_(beta2)$addcmul_(grad_residual, 178 | grad_residual, 179 | value = (1 - beta2)) 180 | 181 | # calculate denominator 182 | denom <- (exp_avg_var$add_(eps)$sqrt()/sqrt(bias_correction2))$add_(eps) 183 | 184 | if (!self$rectify) { 185 | # calculate step size 186 | step_size <- lr / bias_correction1 187 | param$addcdiv_(exp_avg, denom, value = -step_size) 188 | } else { 189 | # calculate rho_t 190 | rho_inf <- state(param)[["rho_inf"]] 191 | step <- state(param)[["step"]] 192 | state(param)[["rho_t"]] <- rho_inf - 193 | (2 * step * beta2 ^ step) / 194 | (1.0 - beta2 ^ step) 195 | rho_t <- state(param)[["rho_t"]] 196 | 197 | # more conservative since it's an approximated value 198 | if (rho_t > 4) { 199 | # perform Adam style update if variance is small 200 | rt = ( 201 | (rho_t - 4.0) * (rho_t - 2.0) * rho_inf 202 | / (rho_inf - 4.0) 203 | / (rho_inf - 2.0) 204 | / rho_t 205 | ) 206 | rt = sqrt(rt) 207 | step_size <- rt * lr / bias_correction1 208 | param$addcdiv_(exp_avg, 209 | denom, 210 | value = -step_size 211 | ) 212 | } else 213 | # perform SGD style update 214 | param$add_(exp_avg, alpha = -lr) 215 | } 216 | } 217 | private$step_helper(closure, loop_fun) 218 | } 219 | ) 220 | -------------------------------------------------------------------------------- /R/adabound.R: -------------------------------------------------------------------------------- 1 | #' @title Adabound optimizer 2 | #' 3 | #' @name optim_adabound 4 | #' 5 | #' @author Rolf Simoes, \email{rolf.simoes@@inpe.br} 6 | #' @author Felipe Souza, 
\email{lipecaso@@gmail.com} 7 | #' @author Alber Sanchez, \email{alber.ipia@@inpe.br} 8 | #' @author Gilberto Camara, \email{gilberto.camara@@inpe.br} 9 | #' 10 | #' @description 11 | #' R implementation of the AdaBound optimizer proposed 12 | #' by Luo et al.(2019). We used the implementation available at 13 | #' https://github.com/jettify/pytorch-optimizer/blob/master/torch_optimizer/yogi.py. 14 | #' Thanks to Nikolay Novik for providing the pytorch code. 15 | #' 16 | #' The original implementation is licensed using the Apache-2.0 software license. 17 | #' This implementation is also licensed using Apache-2.0 license. 18 | #' 19 | #' AdaBound is a variant of the Adam stochastic optimizer which is 20 | #' designed to be more robust to extreme learning rates. 21 | #' Dynamic bounds are employed on learning rates, 22 | #' where the lower and upper bound are initialized as zero and 23 | #' infinity respectively, and they both smoothly converge to a 24 | #' constant final step size. AdaBound can be regarded as an adaptive 25 | #' method at the beginning of training, and thereafter it gradually and 26 | #' smoothly transforms to SGD (or with momentum) as the time step increases. 27 | #' 28 | #' @references 29 | #' Liangchen Luo, Yuanhao Xiong, Yan Liu, Xu Sun, 30 | #' "Adaptive Gradient Methods with Dynamic Bound of Learning Rate", 31 | #' International Conference on Learning Representations (ICLR), 2019. 32 | #' https://arxiv.org/abs/1902.09843 33 | #' 34 | #' @param params List of parameters to optimize. 35 | #' @param lr Learning rate (default: 1e-3) 36 | #' @param betas Coefficients computing running averages of gradient 37 | #' and its square (default: (0.9, 0.999)) 38 | #' @param final_lr Final (SGD) learning rate (default: 0.1) 39 | #' @param gamma Convergence speed of the bound functions 40 | #' (default: 1e-3) 41 | #' @param eps Term added to the denominator to improve numerical 42 | #' stability (default: 1e-8) 43 | #' @param weight_decay Weight decay (L2 penalty) (default: 0) 44 | #' 45 | #' @returns 46 | #' A torch optimizer object implementing the `step` method. 
47 | #' @examples 48 | #' if (torch::torch_is_installed()) { 49 | 50 | #' # function to demonstrate optimization 51 | #' beale <- function(x, y) { 52 | #' log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 53 | #' } 54 | #' # define optimizer 55 | #' optim <- torchopt::optim_adabound 56 | #' # define hyperparams 57 | #' opt_hparams <- list(lr = 0.01) 58 | #' 59 | #' # starting point 60 | #' x0 <- 3 61 | #' y0 <- 3 62 | #' # create tensor 63 | #' x <- torch::torch_tensor(x0, requires_grad = TRUE) 64 | #' y <- torch::torch_tensor(y0, requires_grad = TRUE) 65 | #' # instantiate optimizer 66 | #' optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 67 | #' # run optimizer 68 | #' steps <- 400 69 | #' x_steps <- numeric(steps) 70 | #' y_steps <- numeric(steps) 71 | #' for (i in seq_len(steps)) { 72 | #' x_steps[i] <- as.numeric(x) 73 | #' y_steps[i] <- as.numeric(y) 74 | #' optim$zero_grad() 75 | #' z <- beale(x, y) 76 | #' z$backward() 77 | #' optim$step() 78 | #' } 79 | #' print(paste0("starting value = ", beale(x0, y0))) 80 | #' print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 81 | #' } 82 | #' @export 83 | optim_adabound <- torch::optimizer( 84 | "optim_adabound", 85 | initialize = function(params, 86 | lr = 1e-3, 87 | betas = c(0.9, 0.999), 88 | final_lr = 0.1, 89 | gamma = 1e-3, 90 | eps = 1e-8, 91 | weight_decay = 0) { 92 | if (lr <= 0.0) 93 | stop("Learning rate must be positive.", call. = FALSE) 94 | if (eps < 0.0) 95 | stop("eps must be non-negative.", call. = FALSE) 96 | if (betas[1] > 1.0 | betas[1] <= 0.0) 97 | stop("Invalid beta parameter.", call. = FALSE) 98 | if (betas[2] > 1.0 | betas[1] <= 0.0) 99 | stop("Invalid beta parameter.", call. = FALSE) 100 | if (final_lr < 0.0) 101 | stop("Learning rate must be positive.", call. = FALSE) 102 | if (gamma > 1.0 | gamma <= 0.0) 103 | stop("Invalid gamma parameter.", call. = FALSE) 104 | if (weight_decay < 0) 105 | stop("Invalid weight_decay value.", call. 
= FALSE) 106 | 107 | defaults = list( 108 | lr = lr, 109 | betas = betas, 110 | final_lr = final_lr, 111 | gamma = gamma, 112 | eps = eps, 113 | weight_decay = weight_decay 114 | ) 115 | 116 | self$base_lr <- lr 117 | super$initialize(params, defaults) 118 | }, 119 | step = function(closure = NULL) { 120 | loop_fun <- function(group, param, g, p) { 121 | if (is.null(param$grad)) 122 | next 123 | grad <- param$grad 124 | 125 | # State initialization 126 | if (length(state(param)) == 0) { 127 | state(param) <- list() 128 | state(param)[["step"]] <- 0 129 | # Exponential moving average of gradient values 130 | state(param)[["exp_avg"]] <- torch::torch_zeros_like( 131 | param, 132 | memory_format = torch::torch_preserve_format() 133 | ) 134 | # Exponential moving average of squared gradient values 135 | state(param)[["exp_avg_sq"]] <- torch::torch_zeros_like( 136 | param, 137 | memory_format = torch::torch_preserve_format() 138 | ) 139 | } 140 | exp_avg <- state(param)[["exp_avg"]] 141 | exp_avg_sq <- state(param)[["exp_avg_sq"]] 142 | beta1 <- group[['betas']][[1]] 143 | beta2 <- group[['betas']][[2]] 144 | 145 | state(param)[["step"]] <- state(param)[["step"]] + 1 146 | 147 | if (group[['weight_decay']] != 0) 148 | grad <- grad$add(param, alpha = group[['weight_decay']]) 149 | 150 | # Decay the first and second moment 151 | # running average coefficient 152 | exp_avg$mul_(beta1)$add_(grad, alpha = 1 - beta1) 153 | exp_avg_sq$mul_(beta2)$addcmul_(grad, grad, value = 1 - beta2) 154 | 155 | # bias correction 156 | bias_correction1 <- 1 - beta1^state(param)[['step']] 157 | bias_correction2 <- 1 - beta2^state(param)[['step']] 158 | step_size <- group[['lr']] * 159 | sqrt(bias_correction2) / bias_correction1 160 | 161 | # Applies bounds on actual learning rate 162 | # lr_scheduler cannot affect final_lr, this is a workaround to 163 | # apply lr decay 164 | final_lr <- group[['final_lr']] * group[['lr']] / self$base_lr 165 | lower_bound <- final_lr * 166 | (1 - 1 / (group[['gamma']] * state(param)[['step']] + 1)) 167 | upper_bound <- final_lr * 168 | (1 + 1 / (group[['gamma']] * state(param)[['step']])) 169 | 170 | # calculate denominator 171 | denom = exp_avg_sq$sqrt()$add_(group[['eps']]) 172 | 173 | step_size <- torch::torch_full_like( 174 | input = denom, 175 | fill_value = step_size) 176 | step_size$div_(denom)$clamp_(lower_bound, upper_bound)$mul_(exp_avg) 177 | 178 | param$add_(-step_size) 179 | } 180 | 181 | private$step_helper(closure, loop_fun) 182 | } 183 | ) 184 | -------------------------------------------------------------------------------- /R/adahessian.R: -------------------------------------------------------------------------------- 1 | #'@title Adahessian optimizer 2 | #' 3 | #'@name optim_adahessian 4 | #' 5 | #'@author Rolf Simoes, \email{rolf.simoes@@inpe.br} 6 | #'@author Felipe Souza, \email{lipecaso@@gmail.com} 7 | #'@author Alber Sanchez, \email{alber.ipia@@inpe.br} 8 | #'@author Gilberto Camara, \email{gilberto.camara@@inpe.br} 9 | #' 10 | #'@description R implementation of the Adahessian optimizer proposed 11 | #' by Yao et al.(2020). The original implementation is available at 12 | #' https://github.com/amirgholami/adahessian. 13 | #' 14 | #' @references 15 | #' Yao, Z., Gholami, A., Shen, S., Mustafa, M., Keutzer, K., 16 | #' & Mahoney, M. (2021). 17 | #' ADAHESSIAN: An Adaptive Second Order Optimizer for Machine Learning. 18 | #' Proceedings of the AAAI Conference on Artificial Intelligence, 35(12), 19 | #' 10665-10673. 
20 | #' https://arxiv.org/abs/2006.00719 21 | #' 22 | #' @param params Iterable of parameters to optimize. 23 | #' @param lr Learning rate (default: 0.15). 24 | #' @param betas Coefficients for computing 25 | #' running averages of gradient 26 | #' and is square(default: (0.9, 0.999)). 27 | #' @param eps Term added to the denominator to improve 28 | #' numerical stability (default: 1e-4). 29 | #' @param weight_decay L2 penalty (default: 0). 30 | #' @param hessian_power Hessian power (default: 1.0). 31 | #' 32 | #' 33 | #' @returns 34 | #' An optimizer object implementing the `step` and `zero_grad` methods. 35 | #' @export 36 | optim_adahessian <- torch::optimizer( 37 | "optim_adahessian", 38 | initialize = function( 39 | params, 40 | lr = 0.15, 41 | betas = c(0.9, 0.999), 42 | eps = 1e-4, 43 | weight_decay = 0, 44 | hessian_power = 0.5 45 | ) { 46 | if (lr <= 0.0) 47 | rlang::abort("Learning rate must be positive.") 48 | if (eps <= 0.0) 49 | rlang::abort("eps must be non-negative.") 50 | if (betas[1] > 1.0 | betas[1] <= 0.0) 51 | rlang::abort("Invalid beta1 parameter.") 52 | if (betas[2] > 1.0 | betas[2] <= 0.0) 53 | rlang::abort("Invalid beta2 parameter.") 54 | if (hessian_power > 1.0 | hessian_power <= 0.0) 55 | rlang::abort("Invalid hessian power parameter.") 56 | if (weight_decay < 0) 57 | rlang::abort("Invalid weight_decay value") 58 | 59 | torch::torch_manual_seed(sample.int(10^5, 1)) 60 | 61 | defaults = list( 62 | lr = lr, 63 | betas = betas, 64 | eps = eps, 65 | hessian_power = hessian_power, 66 | weight_decay = weight_decay 67 | ) 68 | super$initialize(params, defaults) 69 | }, 70 | # Get an estimate of Hessian Trace. 71 | # This is done by computing the Hessian vector product with a random 72 | # vector v at the current gradient point, to estimate Hessian trace by 73 | # computing the gradient of . 74 | get_trace = function(params, grads){ 75 | # Check backward was called with create_graph set to True 76 | purrr::map(grads, function(g) { 77 | if (purrr::is_null(g$grad_fn)) { 78 | msg <- paste("Gradient tensor does not have grad_fn", 79 | "When calling loss.backward(), set create_graph to True.") 80 | rlang::abort(msg) 81 | } 82 | }) 83 | # list of random tensors [-1, 1] to estimate Hessian matrix diagonal 84 | v <- purrr::map(params, function(p){ 85 | return(2 * torch::torch_randint_like(input = p, 86 | low = 0, 87 | high = 2) - 1) 88 | }) 89 | # Computes the sum of gradients of outputs w.r.t. the inputs. 
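        # With Rademacher vectors v (entries +1 or -1), differentiating
        # sum(grad * v) w.r.t. the parameters yields the Hessian-vector
        # product H v, and E[v * (H v)] is the Hessian diagonal
        # (Hutchinson's estimator); because the entries of v are +1/-1,
        # abs(H v) used below equals abs(v * H v).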
90 | hvs <- torch::autograd_grad( 91 | outputs = grads, 92 | inputs = params, 93 | grad_outputs = v, 94 | retain_graph = TRUE, 95 | create_graph = TRUE 96 | ) 97 | 98 | # calculate hutchinson trace 99 | # approximation of hessian diagonal 100 | hutchinson_trace <- purrr::map(seq_along(hvs), function(hv_ind){ 101 | hv <- hvs[[hv_ind]] 102 | param_size <- hv$size() 103 | hv_abs <- hv$abs() 104 | if (length(param_size) <= 2) { 105 | return(hv_abs) 106 | } else if (length(param_size) == 3) { 107 | return(torch::torch_mean(hv_abs, dim = 1, keepdim = TRUE)) 108 | } else if (length(param_size) == 4) { 109 | return(torch::torch_mean(hv_abs, dim = c(2, 3), keepdim = TRUE)) 110 | } else 111 | rlang::abort("Only 1D to 4D tensors are supported.") 112 | }) 113 | return(hutchinson_trace) 114 | }, 115 | step = function(closure = NULL) { 116 | 117 | # # Flatten params and grads into lists 118 | groups <- self$param_groups[[1]] 119 | params <- purrr::map(groups$params, function(pg){ 120 | return(pg) 121 | }) 122 | grads <- purrr::map(params, function(p) { 123 | if (!is.null(p$grad)) 124 | return(p$grad) 125 | }) 126 | # Get the Hessian diagonal 127 | self$hut_traces <- self$get_trace(params, grads) 128 | 129 | loop_fun <- function(group, param, g, p) { 130 | 131 | # state initialization 132 | if (length(state(param)) == 0) { 133 | state(param) <- list() 134 | state(param)[["step"]] <- 0 135 | state(param)[["exp_avg"]] <- torch::torch_zeros_like(param) 136 | state(param)[["exp_hessian_diag_sq"]] <- torch::torch_zeros_like(param) 137 | } 138 | # Perform correct stepweight decay as in AdamW 139 | # param$mul_(1 - group[['lr']] * group[['weight_decay']]) 140 | 141 | exp_avg <- state(param)[["exp_avg"]] 142 | exp_hessian_diag_sq <- state(param)[["exp_hessian_diag_sq"]] 143 | 144 | # increase step 145 | state(param)[["step"]] <- state(param)[["step"]] + 1 146 | 147 | # parameters for optimizer 148 | beta1 <- group[['betas']][[1]] 149 | beta2 <- group[['betas']][[2]] 150 | lr <- group[['lr']] 151 | eps <- group[['eps']] 152 | wd <- group[['weight_decay']] 153 | k <- group[['hessian_power']] 154 | step <- state(param)[["step"]] 155 | 156 | 157 | # Decay the first and second moment 158 | # running average coefficient 159 | exp_avg$mul_(beta1)$add_(param$grad, alpha = 1 - beta1) 160 | exp_hessian_diag_sq$mul_(beta2)$addcmul_( 161 | self$hut_traces[[p]], 162 | self$hut_traces[[p]], 163 | value = 1 - beta2 164 | ) 165 | 166 | # bias correction 167 | bias_correction1 <- 1 - beta1 ^ step 168 | bias_correction2 <- 1 - beta2 ^ step 169 | sqrt_bc2 <- sqrt(bias_correction2) 170 | 171 | 172 | # make the square root, and the Hessian power 173 | denom <- ((exp_hessian_diag_sq$sqrt() ^ k) / (sqrt_bc2 ^ k))$add_(eps) 174 | 175 | # update 176 | param$sub_(lr * (exp_avg / bias_correction1 / denom 177 | + wd * param)) 178 | } 179 | private$step_helper(closure, loop_fun) 180 | } 181 | ) 182 | -------------------------------------------------------------------------------- /R/adamw.R: -------------------------------------------------------------------------------- 1 | #' @title AdamW optimizer 2 | #' 3 | #' @name optim_adamw 4 | #' 5 | #' @author Gilberto Camara, \email{gilberto.camara@@inpe.br} 6 | #' @author Rolf Simoes, \email{rolf.simoes@@inpe.br} 7 | #' @author Felipe Souza, \email{lipecaso@@gmail.com} 8 | #' @author Alber Sanchez, \email{alber.ipia@@inpe.br} 9 | #' 10 | #' @description 11 | #' R implementation of the AdamW optimizer proposed 12 | #' by Loshchilov & Hutter (2019). 
We used the pytorch implementation 13 | #' developed by Collin Donahue-Oponski available at: 14 | #' https://gist.github.com/colllin/0b146b154c4351f9a40f741a28bff1e3 15 | #' 16 | #' From the abstract by the paper by Loshchilov & Hutter (2019): 17 | #' L2 regularization and weight decay regularization are equivalent for standard 18 | #' stochastic gradient descent (when rescaled by the learning rate), 19 | #' but as we demonstrate this is not the case for adaptive gradient algorithms, 20 | #' such as Adam. While common implementations of these algorithms 21 | #' employ L2 regularization (often calling it “weight decay” 22 | #' in what may be misleading due to the inequivalence we expose), 23 | #' we propose a simple modification to recover the original formulation of 24 | #' weight decay regularization by decoupling the weight decay from the optimization 25 | #' steps taken w.r.t. the loss function 26 | #' 27 | #' @references 28 | #' Ilya Loshchilov, Frank Hutter, 29 | #' "Decoupled Weight Decay Regularization", 30 | #' International Conference on Learning Representations (ICLR) 2019. 31 | #' https://arxiv.org/abs/1711.05101 32 | #' 33 | #' @param params List of parameters to optimize. 34 | #' @param lr Learning rate (default: 1e-3) 35 | #' @param betas Coefficients computing running averages of gradient 36 | #' and its square (default: (0.9, 0.999)) 37 | #' @param eps Term added to the denominator to improve numerical 38 | #' stability (default: 1e-8) 39 | #' @param weight_decay Weight decay (L2 penalty) (default: 1e-6) 40 | #' 41 | #' @returns 42 | #' A torch optimizer object implementing the `step` method. 43 | #' @examples 44 | #' if (torch::torch_is_installed()) { 45 | 46 | #' # function to demonstrate optimization 47 | #' beale <- function(x, y) { 48 | #' log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 49 | #' } 50 | #' # define optimizer 51 | #' optim <- torchopt::optim_adamw 52 | #' # define hyperparams 53 | #' opt_hparams <- list(lr = 0.01) 54 | #' 55 | #' # starting point 56 | #' x0 <- 3 57 | #' y0 <- 3 58 | #' # create tensor 59 | #' x <- torch::torch_tensor(x0, requires_grad = TRUE) 60 | #' y <- torch::torch_tensor(y0, requires_grad = TRUE) 61 | #' # instantiate optimizer 62 | #' optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 63 | #' # run optimizer 64 | #' steps <- 400 65 | #' x_steps <- numeric(steps) 66 | #' y_steps <- numeric(steps) 67 | #' for (i in seq_len(steps)) { 68 | #' x_steps[i] <- as.numeric(x) 69 | #' y_steps[i] <- as.numeric(y) 70 | #' optim$zero_grad() 71 | #' z <- beale(x, y) 72 | #' z$backward() 73 | #' optim$step() 74 | #' } 75 | #' print(paste0("starting value = ", beale(x0, y0))) 76 | #' print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 77 | #' } 78 | #' @export 79 | optim_adamw <- torch::optimizer( 80 | "optim_adamw", 81 | initialize = function(params, 82 | lr = 0.01, 83 | betas = c(0.9, 0.999), 84 | eps = 1e-8, 85 | weight_decay = 1e-6) { 86 | if (lr <= 0.0) 87 | stop("Learning rate must be positive.", call. = FALSE) 88 | if (eps < 0.0) 89 | stop("eps must be non-negative.", call. = FALSE) 90 | if (betas[1] > 1.0 | betas[1] <= 0.0) 91 | stop("Invalid beta parameter.", call. = FALSE) 92 | if (betas[2] > 1.0 | betas[1] <= 0.0) 93 | stop("Invalid beta parameter.", call. = FALSE) 94 | if (weight_decay < 0) 95 | stop("Invalid weight_decay value.", call. 
= FALSE) 96 | 97 | defaults = list( 98 | lr = lr, 99 | betas = betas, 100 | eps = eps, 101 | weight_decay = weight_decay 102 | ) 103 | super$initialize(params, defaults) 104 | }, 105 | step = function(closure = NULL){ 106 | loop_fun <- function(group, param, g, p) { 107 | if (is.null(param$grad)) 108 | next 109 | grad <- param$grad 110 | 111 | # State initialization 112 | if (length(state(param)) == 0) { 113 | state(param) <- list() 114 | state(param)[["step"]] <- 0 115 | # Exponential moving average of gradient values 116 | state(param)[["exp_avg"]] <- torch::torch_zeros_like(param) 117 | # Exponential moving average of squared gradient values 118 | state(param)[["exp_avg_sq"]] <- torch::torch_zeros_like(param) 119 | } 120 | # Define variables for optimization function 121 | exp_avg <- state(param)[["exp_avg"]] 122 | exp_avg_sq <- state(param)[["exp_avg_sq"]] 123 | beta1 <- group[['betas']][[1]] 124 | beta2 <- group[['betas']][[2]] 125 | weight_decay <- group[['weight_decay']] 126 | eps <- group[["eps"]] 127 | lr <- group[['lr']] 128 | 129 | # take one step 130 | state(param)[["step"]] <- state(param)[["step"]] + 1 131 | 132 | # Decay the first moment 133 | exp_avg$mul_(beta1)$add_(grad, alpha = 1 - beta1) 134 | # Decay the second moment 135 | exp_avg_sq$mul_(beta2)$addcmul_(grad, grad, value = (1 - beta2)) 136 | 137 | # calculate denominator 138 | denom = exp_avg_sq$sqrt()$add_(eps) 139 | 140 | # bias correction 141 | bias_correction1 <- 1 - beta1^state(param)[['step']] 142 | bias_correction2 <- 1 - beta2^state(param)[['step']] 143 | # calculate step size 144 | step_size <- lr * sqrt(bias_correction2) / bias_correction1 145 | 146 | # L2 correction (different from adam) 147 | if (weight_decay != 0) 148 | param$add_(param, -weight_decay * lr) 149 | # go to next step 150 | param$addcdiv_(exp_avg, denom, value = -step_size) 151 | } 152 | private$step_helper(closure, loop_fun) 153 | } 154 | ) 155 | -------------------------------------------------------------------------------- /R/madgrad.R: -------------------------------------------------------------------------------- 1 | #' @title MADGRAD optimizer 2 | #' 3 | #' @name optim_madgrad 4 | #' 5 | #' @author Daniel Falbel, \email{dfalbel@@gmail.com} 6 | #' 7 | #' @description 8 | #' A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic 9 | #' Optimization (MADGRAD) is a general purpose optimizer that 10 | #' can be used in place of SGD or Adam may converge faster and generalize 11 | #' better. Currently GPU-only. Typically, the same learning rate schedule 12 | #' that is used for SGD or Adam may be used. The overall learning rate is 13 | #' not comparable to either method and should be determined by a 14 | #' hyper-parameter sweep. 15 | #' 16 | #' MADGRAD requires less weight decay than other methods, often as little as 17 | #' zero. Momentum values used for SGD or Adam's beta1 should work here also. 18 | #' 19 | #' On sparse problems both weight_decay and momentum should be set to 0. 20 | #' (not yet supported in the R implementation). 21 | #' 22 | #' 23 | #' @references 24 | #' Aaron Defazio, Samy Jelassi, 25 | #' "Adaptivity without Compromise: A Momentumized, Adaptive, Dual 26 | #' Averaged Gradient Method for Stochastic Optimization". 27 | #' https://arxiv.org/abs/2101.11075 28 | #' 29 | #' @param params List of parameters to optimize. 30 | #' @param lr Learning rate (default: 1e-2). 31 | #' @param momentum Momentum value in the range [0,1) (default: 0.9). 32 | #' @param weight_decay Weight decay, i.e. 
a L2 penalty (default: 0). 33 | #' @param eps Term added to the denominator outside of 34 | #' the root operation to improve numerical stability 35 | #' (default: 1e-6). 36 | #' 37 | #' @returns 38 | #' A torch optimizer object implementing the `step` method. 39 | #' @examples 40 | #' if (torch::torch_is_installed()) { 41 | 42 | #' # function to demonstrate optimization 43 | #' beale <- function(x, y) { 44 | #' log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 45 | #' } 46 | #' # define optimizer 47 | #' optim <- torchopt::optim_madgrad 48 | #' # define hyperparams 49 | #' opt_hparams <- list(lr = 0.01) 50 | #' 51 | #' # starting point 52 | #' x0 <- 3 53 | #' y0 <- 3 54 | #' # create tensor 55 | #' x <- torch::torch_tensor(x0, requires_grad = TRUE) 56 | #' y <- torch::torch_tensor(y0, requires_grad = TRUE) 57 | #' # instantiate optimizer 58 | #' optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 59 | #' # run optimizer 60 | #' steps <- 400 61 | #' x_steps <- numeric(steps) 62 | #' y_steps <- numeric(steps) 63 | #' for (i in seq_len(steps)) { 64 | #' x_steps[i] <- as.numeric(x) 65 | #' y_steps[i] <- as.numeric(y) 66 | #' optim$zero_grad() 67 | #' z <- beale(x, y) 68 | #' z$backward() 69 | #' optim$step() 70 | #' } 71 | #' print(paste0("starting value = ", beale(x0, y0))) 72 | #' print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 73 | #' } 74 | #' @export 75 | optim_madgrad <- torch::optimizer( 76 | "optim_madgrad", 77 | initialize = function(params, 78 | lr = 1e-2, 79 | momentum = 0.9, 80 | weight_decay = 0, 81 | eps = 1e-6) { 82 | 83 | if (momentum < 0 || momentum >= 1) 84 | stop("Momentum must be in the range [0,1].") 85 | 86 | if (lr <= 0) 87 | stop("Learning rate must be positive.") 88 | 89 | if (weight_decay < 0) 90 | stop("Weight decay must be non-negative.") 91 | 92 | if (eps < 0) 93 | stop("Eps must be non-negative.") 94 | 95 | defaults <- list(lr = lr, 96 | eps = eps, 97 | momentum = momentum, 98 | weight_decay = weight_decay) 99 | 100 | super$initialize(params, defaults) 101 | }, 102 | step = function(closure = NULL) { 103 | if (is.null(self$k)) 104 | self$k <- 0 105 | loss <- super$step_helper( 106 | closure = closure, 107 | loop_fun = function(group, param, ...) 
{ 108 | eps <- group$eps 109 | lr <- group$lr + eps 110 | decay <- group$weight_decay 111 | momentum <- group$momentum 112 | 113 | ck <- 1 - momentum 114 | lamb <- lr * (self$k + 1)^0.5 115 | 116 | grad <- param$grad 117 | 118 | if (is.null(state(param))) { 119 | state(param) <- list() 120 | state(param)[["grad_sum_sq"]] <- torch::torch_zeros_like(param)$detach() 121 | state(param)[["s"]] <- torch::torch_zeros_like(param)$detach() 122 | if (momentum != 0) 123 | state(param)[["x0"]] <- param$clone() 124 | } 125 | 126 | if (decay != 0) { 127 | grad$add_(param, alpha = decay) 128 | } 129 | 130 | if (momentum == 0) { 131 | # Compute x_0 from other known quantities 132 | rms <- state(param)[["grad_sum_sq"]]$pow(1 / 3)$add_(eps) 133 | x0 <- param$addcdiv(state(param)[["s"]], rms, value = 1) 134 | } else { 135 | x0 <- state(param)[["x0"]] 136 | } 137 | 138 | # Accumulate second moments 139 | state(param)[["grad_sum_sq"]]$addcmul_(grad, grad, value = lamb) 140 | rms <- state(param)[["grad_sum_sq"]]$pow(1 / 3)$add_(eps) 141 | 142 | # Update s 143 | state(param)[["s"]]$add_(grad, alpha = lamb) 144 | 145 | # Step 146 | if (momentum == 0) { 147 | param$copy_(x0$addcdiv(state(param)[["s"]], rms, value = -1)) 148 | } else { 149 | z <- x0$addcdiv(state(param)[["s"]], rms, value = -1) 150 | } 151 | 152 | # p is a moving average of z 153 | param$mul_(1 - ck)$add_(z, alpha = ck) 154 | 155 | }) 156 | self$k <- self$k + 1 157 | loss 158 | } 159 | ) 160 | 161 | 162 | state <- function(self) { 163 | attr(self, "state") 164 | } 165 | 166 | `state<-` <- function(self, value) { 167 | attr(self, "state") <- value 168 | self 169 | } 170 | 171 | -------------------------------------------------------------------------------- /R/nadam.R: -------------------------------------------------------------------------------- 1 | #' @title Nadam optimizer 2 | #' 3 | #' @name optim_nadam 4 | #' 5 | #' @author Gilberto Camara, \email{gilberto.camara@@inpe.br} 6 | #' @author Rolf Simoes, \email{rolf.simoes@@inpe.br} 7 | #' @author Felipe Souza, \email{lipecaso@@gmail.com} 8 | #' @author Alber Sanchez, \email{alber.ipia@@inpe.br} 9 | #' 10 | #' @description 11 | #' R implementation of the Nadam optimizer proposed 12 | #' by Dazat (2016). 13 | #' 14 | #' From the abstract by the paper by Dozat (2016): 15 | #' This work aims to improve upon the recently proposed and 16 | #' rapidly popularized optimization algorithm Adam (Kingma & Ba, 2014). 17 | #' Adam has two main components—a momentum component and an adaptive 18 | #' learning rate component. However, regular momentum can be shown conceptually 19 | #' and empirically to be inferior to a similar algorithm known as 20 | #' Nesterov’s accelerated gradient (NAG). 21 | #' 22 | #' @references 23 | #' Timothy Dozat, 24 | #' "Incorporating Nesterov Momentum into Adam", 25 | #' International Conference on Learning Representations (ICLR) 2016. 26 | #' https://openreview.net/pdf/OM0jvwB8jIp57ZJjtNEZ.pdf 27 | #' 28 | #' @param params List of parameters to optimize. 29 | #' @param lr Learning rate (default: 1e-3) 30 | #' @param betas Coefficients computing running averages of gradient 31 | #' and its square (default: (0.9, 0.999)). 32 | #' @param eps Term added to the denominator to improve numerical 33 | #' stability (default: 1e-8). 34 | #' @param weight_decay Weight decay (L2 penalty) (default: 0). 35 | #' @param momentum_decay Momentum_decay (default: 4e-3). 36 | #' 37 | #' 38 | #' @returns 39 | #' A torch optimizer object implementing the `step` method. 
40 | #' @examples 41 | #' if (torch::torch_is_installed()) { 42 | 43 | #' # function to demonstrate optimization 44 | #' beale <- function(x, y) { 45 | #' log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 46 | #' } 47 | #' # define optimizer 48 | #' optim <- torchopt::optim_nadam 49 | #' # define hyperparams 50 | #' opt_hparams <- list(lr = 0.01) 51 | #' 52 | #' # starting point 53 | #' x0 <- 3 54 | #' y0 <- 3 55 | #' # create tensor 56 | #' x <- torch::torch_tensor(x0, requires_grad = TRUE) 57 | #' y <- torch::torch_tensor(y0, requires_grad = TRUE) 58 | #' # instantiate optimizer 59 | #' optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 60 | #' # run optimizer 61 | #' steps <- 400 62 | #' x_steps <- numeric(steps) 63 | #' y_steps <- numeric(steps) 64 | #' for (i in seq_len(steps)) { 65 | #' x_steps[i] <- as.numeric(x) 66 | #' y_steps[i] <- as.numeric(y) 67 | #' optim$zero_grad() 68 | #' z <- beale(x, y) 69 | #' z$backward() 70 | #' optim$step() 71 | #' } 72 | #' print(paste0("starting value = ", beale(x0, y0))) 73 | #' print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 74 | #' } 75 | #' @export 76 | optim_nadam <- torch::optimizer( 77 | "optim_nadam", 78 | initialize = function(params, 79 | lr = 0.002, 80 | betas = c(0.9, 0.999), 81 | eps = 1e-8, 82 | weight_decay = 0, 83 | momentum_decay = 4.0e-03) { 84 | if (lr <= 0.0) 85 | stop("Learning rate must be positive.", call. = FALSE) 86 | if (eps < 0.0) 87 | stop("eps must be non-negative.", call. = FALSE) 88 | if (betas[1] > 1.0 | betas[1] <= 0.0) 89 | stop("Invalid beta parameter.", call. = FALSE) 90 | if (betas[2] > 1.0 | betas[1] <= 0.0) 91 | stop("Invalid beta parameter.", call. = FALSE) 92 | if (weight_decay < 0) 93 | stop("Invalid weight_decay value.", call. = FALSE) 94 | if (momentum_decay < 0) 95 | stop("Invalid momentum_decay value.", call. = FALSE) 96 | 97 | defaults = list( 98 | lr = lr, 99 | betas = betas, 100 | eps = eps, 101 | weight_decay = weight_decay, 102 | momentum_decay = momentum_decay 103 | ) 104 | super$initialize(params, defaults) 105 | }, 106 | step = function(closure = NULL){ 107 | loop_fun <- function(group, param, g, p) { 108 | if (is.null(param$grad)) 109 | next 110 | grad <- param$grad 111 | 112 | # State initialization 113 | if (length(state(param)) == 0) { 114 | state(param) <- list() 115 | state(param)[["step"]] <- torch::torch_tensor(0) 116 | # momentum product 117 | state(param)[["mu_product"]] <- torch::torch_tensor(1.) 
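          # mu_product tracks the running product of the momentum caches mu_t;
          # the Nesterov-style update at the end of this step uses
          # (1 - mu_product) and (1 - mu_product_next) as its bias-correction
          # denominators.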
118 | # Exponential moving average of gradient values 119 | state(param)[["exp_avg"]] <- torch::torch_zeros_like(param) 120 | # Exponential moving average of squared gradient values 121 | state(param)[["exp_avg_sq"]] <- torch::torch_zeros_like(param) 122 | } 123 | # Define variables for optimization function 124 | exp_avg <- state(param)[["exp_avg"]] 125 | exp_avg_sq <- state(param)[["exp_avg_sq"]] 126 | step <- state(param)[["step"]] 127 | mu_product <- state(param)[["mu_product"]] 128 | beta1 <- group[['betas']][[1]] 129 | beta2 <- group[['betas']][[2]] 130 | weight_decay <- group[['weight_decay']] 131 | eps <- group[["eps"]] 132 | lr <- group[['lr']] 133 | momentum_decay <- group[["momentum_decay"]] 134 | 135 | # take one step 136 | state(param)[["step"]] <- state(param)[["step"]] + 1 137 | 138 | # bias correction 139 | bias_correction2 <- 1 - beta2^state(param)[['step']] 140 | 141 | # weight_decay 142 | if (weight_decay != 0) 143 | grad = grad$add(param, alpha = weight_decay) 144 | 145 | # calculate the momentum cache \mu^{t} and \mu^{t+1} 146 | mu = beta1 * (1. - 0.5 * (0.96 ^ (step * momentum_decay))) 147 | mu_next = beta1 * (1. - 0.5 * (0.96 ^ ((step + 1) * momentum_decay))) 148 | 149 | # update momentum 150 | mu_product <- mu_product * mu 151 | mu_product_next <- mu_product * mu * mu_next 152 | 153 | # decay the first and second moment running average coefficient 154 | exp_avg$mul_(beta1)$add_(grad, alpha = 1 - beta1) 155 | exp_avg_sq$mul_(beta2)$addcmul_(grad, grad, value = 1 - beta2) 156 | 157 | # calculate denominator 158 | denom = exp_avg_sq$div(bias_correction2)$sqrt()$add_(eps) 159 | 160 | # update objective function 161 | param$addcdiv_(grad, denom, 162 | value = -lr * (1. - mu) / (1. - mu_product$item())) 163 | param$addcdiv_(exp_avg, denom, 164 | value = -lr * mu_next / (1. - mu_product_next$item())) 165 | 166 | } 167 | private$step_helper(closure, loop_fun) 168 | } 169 | ) 170 | -------------------------------------------------------------------------------- /R/qhadam.R: -------------------------------------------------------------------------------- 1 | #' @title QHAdam optimization algorithm 2 | #' 3 | #' @name optim_qhadam 4 | #' 5 | #' @author Gilberto Camara, \email{gilberto.camara@@inpe.br} 6 | #' @author Daniel Falbel, \email{daniel.falble@@gmail.com} 7 | #' @author Rolf Simoes, \email{rolf.simoes@@inpe.br} 8 | #' @author Felipe Souza, \email{lipecaso@@gmail.com} 9 | #' @author Alber Sanchez, \email{alber.ipia@@inpe.br} 10 | #' 11 | #' @description 12 | #' R implementation of the QHAdam optimizer proposed 13 | #' by Ma and Yarats(2019). We used the implementation available at 14 | #' https://github.com/jettify/pytorch-optimizer/blob/master/torch_optimizer/qhadam.py. 15 | #' Thanks to Nikolay Novik for providing the pytorch code. 16 | #' 17 | #' The original implementation has been developed by Facebook AI 18 | #' and is licensed using the MIT license. 19 | #' 20 | #' From the the paper by Ma and Yarats(2019): 21 | #' QHAdam is a QH augmented version of Adam, where we 22 | #' replace both of Adam's moment estimators with quasi-hyperbolic terms. 23 | #' QHAdam decouples the momentum term from the current gradient when 24 | #' updating the weights, and decouples the mean squared gradients 25 | #' term from the current squared gradient when updating the weights. 26 | #' 27 | #' 28 | #' @references 29 | #' Jerry Ma, Denis Yarats, 30 | #' "Quasi-hyperbolic momentum and Adam for deep learning". 
31 | #' https://arxiv.org/abs/1810.06801 32 | #' 33 | #' @param params List of parameters to optimize. 34 | #' @param lr Learning rate (default: 1e-3) 35 | #' @param betas Coefficients computing running averages of gradient 36 | #' and its square (default: (0.9, 0.999)) 37 | #' @param nus Immediate discount factors used to 38 | #' estimate the gradient and its square 39 | #' (default: (1.0, 1.0)) 40 | #' @param eps Term added to the denominator to improve numerical 41 | #' stability (default: 1e-8) 42 | #' @param weight_decay Weight decay (L2 penalty) (default: 0) 43 | #' @param decouple_weight_decay Whether to decouple the weight 44 | #' decay from the gradient-based optimization step. 45 | #' 46 | #' @returns 47 | #' A torch optimizer object implementing the `step` method. 48 | #' @examples 49 | #' if (torch::torch_is_installed()) { 50 | 51 | #' # function to demonstrate optimization 52 | #' beale <- function(x, y) { 53 | #' log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 54 | #' } 55 | #' # define optimizer 56 | #' optim <- torchopt::optim_qhadam 57 | #' # define hyperparams 58 | #' opt_hparams <- list(lr = 0.01) 59 | #' 60 | #' # starting point 61 | #' x0 <- 3 62 | #' y0 <- 3 63 | #' # create tensor 64 | #' x <- torch::torch_tensor(x0, requires_grad = TRUE) 65 | #' y <- torch::torch_tensor(y0, requires_grad = TRUE) 66 | #' # instantiate optimizer 67 | #' optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 68 | #' # run optimizer 69 | #' steps <- 400 70 | #' x_steps <- numeric(steps) 71 | #' y_steps <- numeric(steps) 72 | #' for (i in seq_len(steps)) { 73 | #' x_steps[i] <- as.numeric(x) 74 | #' y_steps[i] <- as.numeric(y) 75 | #' optim$zero_grad() 76 | #' z <- beale(x, y) 77 | #' z$backward() 78 | #' optim$step() 79 | #' } 80 | #' print(paste0("starting value = ", beale(x0, y0))) 81 | #' print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 82 | #' } 83 | #' 84 | #' @export 85 | optim_qhadam <- torch::optimizer( 86 | "optim_qhadam", 87 | initialize = function(params, 88 | lr = 0.01, 89 | betas = c(0.9, 0.999), 90 | eps = 0.001, 91 | nus = c(1.0, 1.0), 92 | weight_decay = 0, 93 | decouple_weight_decay = FALSE) { 94 | if (lr <= 0.0) 95 | stop("Learning rate must be positive.", call. = FALSE) 96 | if (eps < 0.0) 97 | stop("eps must be non-negative.", call. = FALSE) 98 | if (betas[1] > 1.0 | betas[1] <= 0.0) 99 | stop("Invalid beta parameter.", call. = FALSE) 100 | if (betas[2] > 1.0 | betas[1] <= 0.0) 101 | stop("Invalid beta parameter.", call. = FALSE) 102 | if (weight_decay < 0) 103 | stop("Invalid weight_decay value.", call. 
= FALSE) 104 | 105 | defaults = list( 106 | lr = lr, 107 | betas = betas, 108 | eps = eps, 109 | nus = nus, 110 | weight_decay = weight_decay, 111 | decouple_weight_decay = decouple_weight_decay 112 | ) 113 | super$initialize(params, defaults) 114 | }, 115 | step = function(closure = NULL) { 116 | loop_fun <- function(group, param, g, p) { 117 | if (is.null(param$grad)) 118 | next 119 | 120 | # define parameters 121 | beta1 <- group[['betas']][[1]] 122 | beta2 <- group[['betas']][[2]] 123 | nu1 <- group[['nus']][[1]] 124 | nu2 <- group[['nus']][[2]] 125 | weight_decay <- group[['weight_decay']] 126 | decouple_weight_decay <- group[["decouple_weight_decay"]] 127 | eps <- group[["eps"]] 128 | lr <- group[['lr']] 129 | 130 | d_p <- param$grad 131 | 132 | if (weight_decay != 0) { 133 | if (decouple_weight_decay) 134 | param$mul_(1 - lr * weight_decay) 135 | else 136 | d_p$add_(param, alpha = weight_decay) 137 | } 138 | 139 | d_p_sq = d_p$mul(d_p) 140 | 141 | 142 | # State initialization 143 | 144 | if (length(state(param)) == 0) { 145 | state(param) <- list() 146 | 147 | state(param)[["beta1_weight"]] <- 0.0 148 | state(param)[["beta2_weight"]] <- 0.0 149 | # Exponential moving average of gradient values 150 | state(param)[["exp_avg"]] <- torch::torch_zeros_like(param) 151 | # Exponential moving average of squared gradient values 152 | state(param)[["exp_avg_sq"]] <- torch::torch_zeros_like(param) 153 | } 154 | # Define variables for optimization function 155 | state(param)[["beta1_weight"]] <- 1.0 + beta1 * state(param)[["beta1_weight"]] 156 | state(param)[["beta2_weight"]] <- 1.0 + beta2 * state(param)[["beta2_weight"]] 157 | 158 | beta1_weight <- state(param)[["beta1_weight"]] 159 | beta2_weight <- state(param)[["beta2_weight"]] 160 | 161 | exp_avg <- state(param)[["exp_avg"]] 162 | exp_avg_sq <- state(param)[["exp_avg_sq"]] 163 | 164 | beta1_adj <- 1.0 - (1.0 / beta1_weight) 165 | beta2_adj <- 1.0 - (1.0 / beta2_weight) 166 | exp_avg$mul_(beta1_adj)$add_(d_p, alpha = 1.0 - beta1_adj) 167 | exp_avg_sq$mul_(beta2_adj)$add_(d_p_sq, alpha = 1.0 - beta2_adj) 168 | 169 | avg_grad <- exp_avg$mul(nu1) 170 | if (nu1 != 1.0) 171 | avg_grad$add_(d_p, alpha = 1.0 - nu1) 172 | 173 | avg_grad_rms = exp_avg_sq$mul(nu2) 174 | if (nu2 != 1.0) 175 | avg_grad_rms$add_(d_p_sq, alpha = 1.0 - nu2) 176 | avg_grad_rms$sqrt_() 177 | if (eps != 0.0) 178 | avg_grad_rms$add_(eps) 179 | 180 | param$addcdiv_(avg_grad, avg_grad_rms, value = -lr) 181 | } 182 | private$step_helper(closure, loop_fun) 183 | } 184 | ) 185 | -------------------------------------------------------------------------------- /R/radam.R: -------------------------------------------------------------------------------- 1 | #' @title RAdam optimizer 2 | #' 3 | #' @name optim_radam 4 | #' 5 | #' @author Gilberto Camara, \email{gilberto.camara@@inpe.br} 6 | #' @author Daniel Falbel, \email{daniel.falble@@gmail.com} 7 | #' @author Rolf Simoes, \email{rolf.simoes@@inpe.br} 8 | #' @author Felipe Souza, \email{lipecaso@@gmail.com} 9 | #' @author Alber Sanchez, \email{alber.ipia@@inpe.br} 10 | #' 11 | #' @description 12 | #' R implementation of the RAdam optimizer proposed 13 | #' by Liu et al. (2019). 14 | #' We used the implementation in PyTorch as a basis for our 15 | #' implementation. 16 | #' 17 | #' From the abstract of the paper by Liu et al. 
(2019): 18 | #' The learning rate warmup heuristic achieves remarkable success 19 | #' in stabilizing training, accelerating convergence and improving 20 | #' generalization for adaptive stochastic optimization algorithms 21 | #' like RMSprop and Adam. Here, we study its mechanism in details. 22 | #' Pursuing the theory behind warmup, we identify a problem of the 23 | #' adaptive learning rate (i.e., it has problematically large variance 24 | #' in the early stage), suggest warmup works as a variance reduction 25 | #' technique, and provide both empirical and theoretical evidence to verify 26 | #' our hypothesis. We further propose RAdam, a new variant of Adam, 27 | #' by introducing a term to rectify the variance of the adaptive learning rate. 28 | #' Extensive experimental results on image classification, language modeling, 29 | #' and neural machine translation verify our intuition and demonstrate 30 | #' the effectiveness and robustness of our proposed method. 31 | #' 32 | #' @references 33 | #' Liyuan Liu, Haoming Jiang, Pengcheng He, Weizhu Chen, 34 | #' Xiaodong Liu, Jianfeng Gao, Jiawei Han, 35 | #' "On the Variance of the Adaptive Learning Rate and Beyond", 36 | #' International Conference on Learning Representations (ICLR) 2020. 37 | #' https://arxiv.org/abs/1908.03265 38 | #' 39 | #' @param params List of parameters to optimize. 40 | #' @param lr Learning rate (default: 0.01) 41 | #' @param betas Coefficients computing running averages of gradient 42 | #' and its square (default: (0.9, 0.999)) 43 | #' @param eps Term added to the denominator to improve numerical 44 | #' stability (default: 1e-8) 45 | #' @param weight_decay Weight decay (L2 penalty) (default: 0) 46 | #' 47 | #' @returns 48 | #' A torch optimizer object implementing the `step` method. 49 | #' @examples 50 | #' if (torch::torch_is_installed()) { 51 | #' 52 | #' # function to demonstrate optimization 53 | #' beale <- function(x, y) { 54 | #' log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 55 | #' } 56 | #' # define optimizer 57 | #' optim <- torchopt::optim_radam 58 | #' # define hyperparams 59 | #' opt_hparams <- list(lr = 0.01) 60 | #' 61 | #' # starting point 62 | #' x0 <- 3 63 | #' y0 <- 3 64 | #' # create tensor 65 | #' x <- torch::torch_tensor(x0, requires_grad = TRUE) 66 | #' y <- torch::torch_tensor(y0, requires_grad = TRUE) 67 | #' # instantiate optimizer 68 | #' optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 69 | #' # run optimizer 70 | #' steps <- 400 71 | #' x_steps <- numeric(steps) 72 | #' y_steps <- numeric(steps) 73 | #' for (i in seq_len(steps)) { 74 | #' x_steps[i] <- as.numeric(x) 75 | #' y_steps[i] <- as.numeric(y) 76 | #' optim$zero_grad() 77 | #' z <- beale(x, y) 78 | #' z$backward() 79 | #' optim$step() 80 | #' } 81 | #' print(paste0("starting value = ", beale(x0, y0))) 82 | #' print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 83 | #' } 84 | #' @export 85 | optim_radam <- torch::optimizer( 86 | "optim_radam", 87 | initialize = function(params, 88 | lr = 0.01, 89 | betas = c(0.9, 0.999), 90 | eps = 1e-8, 91 | weight_decay = 0) { 92 | if (lr <= 0.0) 93 | stop("Learning rate must be positive.", call. = FALSE) 94 | if (eps < 0.0) 95 | stop("eps must be non-negative.", call. = FALSE) 96 | if (betas[1] > 1.0 | betas[1] <= 0.0) 97 | stop("Invalid beta parameter.", call. = FALSE) 98 | if (betas[2] > 1.0 | betas[2] <= 0.0) 99 | stop("Invalid beta parameter.", call. 
= FALSE) 100 | if (weight_decay < 0) 101 | stop("Invalid weight_decay value.", call. = FALSE) 102 | 103 | defaults = list( 104 | lr = lr, 105 | betas = betas, 106 | eps = eps, 107 | weight_decay = weight_decay 108 | ) 109 | super$initialize(params, defaults) 110 | }, 111 | step = function(closure = NULL){ 112 | loop_fun <- function(group, param, g, p) { 113 | if (is.null(param$grad)) 114 | next 115 | grad <- param$grad 116 | 117 | # State initialization 118 | if (length(state(param)) == 0) { 119 | state(param) <- list() 120 | state(param)[["step"]] <- 0 121 | # Exponential moving average of gradient values 122 | state(param)[["exp_avg"]] <- torch::torch_zeros_like(param) 123 | # Exponential moving average of squared gradient values 124 | state(param)[["exp_avg_sq"]] <- torch::torch_zeros_like(param) 125 | } 126 | # Define variables for optimization function 127 | exp_avg <- state(param)[["exp_avg"]] 128 | exp_avg_sq <- state(param)[["exp_avg_sq"]] 129 | beta1 <- group[['betas']][[1]] 130 | beta2 <- group[['betas']][[2]] 131 | weight_decay <- group[['weight_decay']] 132 | eps <- group[["eps"]] 133 | lr <- group[['lr']] 134 | 135 | # take one step 136 | state(param)[["step"]] <- state(param)[["step"]] + 1 137 | step <- state(param)[["step"]] 138 | 139 | # bias correction 140 | bias_correction1 <- 1 - beta1^state(param)[['step']] 141 | bias_correction2 <- 1 - beta2^state(param)[['step']] 142 | 143 | # L2 correction 144 | if (weight_decay != 0) 145 | grad$add_(param, alpha = weight_decay) 146 | 147 | 148 | # Decay the first moment 149 | exp_avg$mul_(beta1)$add_(grad, alpha = 1 - beta1) 150 | # Decay the second moment 151 | exp_avg_sq$mul_(beta2)$addcmul_(grad, grad, value = (1 - beta2)) 152 | 153 | # correcting bias for the first moving moment 154 | bias_corrected_exp_avg <- exp_avg / bias_correction1 155 | 156 | # maximum length of the approximated SMA 157 | rho_inf <- 2 / (1 - beta2) - 1 158 | # compute the length of the approximated SMA 159 | rho_t <- rho_inf - 2 * step * (beta2^step) / bias_correction2 160 | # adjust learning rate 161 | if (rho_t > 5.0) { 162 | # Compute the variance rectification term and update parameters accordingly 163 | rect <- sqrt((rho_t - 4) * (rho_t - 2) * rho_inf / 164 | ((rho_inf - 4) * (rho_inf - 2) * rho_t)) 165 | adaptive_lr <- sqrt(bias_correction2) / exp_avg_sq$sqrt()$add_(eps) 166 | param$add_(bias_corrected_exp_avg * lr * adaptive_lr * rect, alpha = -1.0) 167 | } else 168 | param$add_(bias_corrected_exp_avg * lr, alpha = -1.0) 169 | } 170 | private$step_helper(closure, loop_fun) 171 | } 172 | ) 173 | -------------------------------------------------------------------------------- /R/swats.R: -------------------------------------------------------------------------------- 1 | #' @title SWATS optimizer 2 | #' 3 | #' @name optim_swats 4 | #' 5 | #' @author Gilberto Camara, \email{gilberto.camara@@inpe.br} 6 | #' @author Daniel Falbel, \email{daniel.falble@@gmail.com} 7 | #' @author Rolf Simoes, \email{rolf.simoes@@inpe.br} 8 | #' @author Felipe Souza, \email{lipecaso@@gmail.com} 9 | #' @author Alber Sanchez, \email{alber.ipia@@inpe.br} 10 | #' 11 | #' @description 12 | #' R implementation of the SWATS optimizer proposed 13 | #' by Keskar and Socher (2018). 14 | #' We used the implementation available at 15 | #' https://github.com/jettify/pytorch-optimizer/ 16 | #' Thanks to Nikolay Novik for providing the pytorch code. 
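#'
#' In this implementation, each Adam step `p_k` is projected onto the current
#' gradient `g_k`, and the scalar `gamma_k = (p_k . p_k) / (-p_k . g_k)`
#' (where `.` denotes the dot product) is tracked with a bias-corrected
#' exponential moving average; training switches to SGD, with the learning
#' rate set to that average, once the average is positive and close to the
#' current `gamma_k`.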
17 | #' 18 | #' From the abstract of the paper by Keskar and Socher (2018): 19 | #' Adaptive optimization methods such as Adam, Adagrad or RMSprop 20 | #' have been found to generalize poorly compared to 21 | #' Stochastic gradient descent (SGD). These methods tend to perform well 22 | #' in the initial portion of training but are outperformed by SGD at 23 | #' later stages of training. We investigate a hybrid strategy that begins 24 | #' training with an adaptive method and switches to SGD 25 | #' when a triggering condition is satisfied. 26 | #' The condition we propose relates to the projection of Adam 27 | #' steps on the gradient subspace. By design, the monitoring process 28 | #' for this condition adds very little overhead and does not increase 29 | #' the number of hyperparameters in the optimizer. 30 | #' 31 | #' @references 32 | #' Nitish Shirish Keskar, Richard Socher, 33 | #' "Improving Generalization Performance by Switching from Adam to SGD". 34 | #' International Conference on Learning Representations (ICLR) 2018. 35 | #' https://arxiv.org/abs/1712.07628 36 | #' 37 | #' @param params List of parameters to optimize. 38 | #' @param lr Learning rate (default: 0.01) 39 | #' @param betas Coefficients computing running averages of gradient 40 | #' and its square (default: (0.9, 0.999)). 41 | #' @param eps Term added to the denominator to improve numerical 42 | #' stability (default: 1e-8). 43 | #' @param weight_decay Weight decay (L2 penalty) (default: 0). 44 | #' @param nesterov Enables Nesterov momentum (default: FALSE). 45 | #' 46 | #' @returns 47 | #' A torch optimizer object implementing the `step` method. 48 | #' @examples 49 | #' if (torch::torch_is_installed()) { 50 | #' 51 | #' # function to demonstrate optimization 52 | #' beale <- function(x, y) { 53 | #' log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 54 | #' } 55 | #' # define optimizer 56 | #' optim <- torchopt::optim_swats 57 | #' # define hyperparams 58 | #' opt_hparams <- list(lr = 0.01) 59 | #' 60 | #' # starting point 61 | #' x0 <- 3 62 | #' y0 <- 3 63 | #' # create tensor 64 | #' x <- torch::torch_tensor(x0, requires_grad = TRUE) 65 | #' y <- torch::torch_tensor(y0, requires_grad = TRUE) 66 | #' # instantiate optimizer 67 | #' optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 68 | #' # run optimizer 69 | #' steps <- 400 70 | #' x_steps <- numeric(steps) 71 | #' y_steps <- numeric(steps) 72 | #' for (i in seq_len(steps)) { 73 | #' x_steps[i] <- as.numeric(x) 74 | #' y_steps[i] <- as.numeric(y) 75 | #' optim$zero_grad() 76 | #' z <- beale(x, y) 77 | #' z$backward() 78 | #' optim$step() 79 | #' } 80 | #' print(paste0("starting value = ", beale(x0, y0))) 81 | #' print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 82 | #' } 83 | #' @export 84 | optim_swats <- torch::optimizer( 85 | "optim_swats", 86 | initialize = function(params, 87 | lr = 0.01, 88 | betas = c(0.9, 0.999), 89 | eps = 1e-8, 90 | weight_decay = 0, 91 | nesterov = FALSE) { 92 | if (lr <= 0.0) 93 | stop("Learning rate must be positive.", call. = FALSE) 94 | if (eps < 0.0) 95 | stop("eps must be non-negative.", call. = FALSE) 96 | if (betas[1] > 1.0 | betas[1] <= 0.0) 97 | stop("Invalid beta parameter.", call. = FALSE) 98 | if (betas[2] > 1.0 | betas[2] <= 0.0) 99 | stop("Invalid beta parameter.", call. = FALSE) 100 | if (weight_decay < 0) 101 | stop("Invalid weight_decay value.", call. 
= FALSE) 102 | 103 | defaults = list( 104 | lr = lr, 105 | betas = betas, 106 | eps = eps, 107 | weight_decay = weight_decay, 108 | nesterov = nesterov, 109 | phase = "ADAM" 110 | ) 111 | super$initialize(params, defaults) 112 | }, 113 | step = function(closure = NULL){ 114 | loop_fun <- function(group, param, g, p) { 115 | if (is.null(param$grad)) 116 | next 117 | grad <- param$grad 118 | 119 | # State initialization 120 | if (length(state(param)) == 0) { 121 | state(param) <- list() 122 | state(param)[["step"]] <- 0 123 | # create momentum buffer 124 | state(param)[["momentum_buffer"]] <- NA 125 | # Exponential moving average of gradient values 126 | state(param)[["exp_avg"]] <- torch::torch_zeros_like(param) 127 | # Exponential moving average of squared gradient values 128 | state(param)[["exp_avg_sq"]] <- torch::torch_zeros_like(param) 129 | # moving average for the non-orthogonal projection scaling 130 | # state(param)[["exp_avg2"]] <- param$new(1)$fill_(0) 131 | state(param)[["exp_avg2"]] <- param$new_zeros(1) 132 | } 133 | # Define variables for optimization function 134 | exp_avg <- state(param)[["exp_avg"]] 135 | exp_avg_sq <- state(param)[["exp_avg_sq"]] 136 | exp_avg2 <- state(param)[["exp_avg2"]] 137 | beta1 <- group[['betas']][[1]] 138 | beta2 <- group[['betas']][[2]] 139 | weight_decay <- group[['weight_decay']] 140 | eps <- group[["eps"]] 141 | lr <- group[['lr']] 142 | phase <- group[["phase"]] 143 | nesterov <- group[["nesterov"]] 144 | 145 | # take one step 146 | state(param)[["step"]] <- state(param)[["step"]] + 1 147 | step <- state(param)[["step"]] 148 | 149 | # L2 correction 150 | if (weight_decay != 0) 151 | grad$add_(param, alpha = weight_decay) 152 | 153 | # if its SGD phase, take an SGD update and continue 154 | if (phase == 'SGD'){ 155 | if (is.na(state(param)[["momentum_buffer"]])) { 156 | state(param)[["momentum_buffer"]] <- 157 | torch::torch_clone(grad)$detach() 158 | buf <- state(param)[["momentum_buffer"]] 159 | } else { 160 | buf <- state(param)[["momentum_buffer"]] 161 | buf$mul_(beta1)$add_(grad) 162 | grad <- buf 163 | grad$mul_(1 - beta1) 164 | if (nesterov) 165 | grad$add_(buf, alpha = beta1) 166 | param$add_(grad, alpha = -lr) 167 | next 168 | } 169 | } 170 | 171 | # Decay the first moment 172 | exp_avg$mul_(beta1)$add_(grad, alpha = 1 - beta1) 173 | # Decay the second moment 174 | exp_avg_sq$mul_(beta2)$addcmul_(grad, grad, value = (1 - beta2)) 175 | # calculate denominator 176 | denom = exp_avg_sq$sqrt()$add_(eps) 177 | 178 | # bias correction 179 | bias_correction1 <- 1 - beta1^state(param)[['step']] 180 | bias_correction2 <- 1 - beta2^state(param)[['step']] 181 | 182 | # calculate step size 183 | step_size <- lr * (bias_correction2 ^ 0.5) / bias_correction1 184 | 185 | pf <- -step_size * (exp_avg / denom) 186 | param$add_(pf) 187 | 188 | p_view <- pf$view(-1) 189 | pg <- p_view$dot(grad$view(-1)) 190 | 191 | if (as.logical(pg != 0)) { 192 | # the non-orthognal scaling estimate 193 | scaling <- p_view$dot(p_view) / -pg 194 | exp_avg2$mul_(beta2)$add_(scaling, alpha = (1 - beta2)) 195 | 196 | # bias corrected exponential average 197 | corrected_exp_avg <- exp_avg2 / bias_correction2 198 | 199 | # checking criteria of switching to SGD training 200 | if (as.logical(state(param)[['step']] > 1) && 201 | as.logical(corrected_exp_avg$allclose(scaling, rtol = 1e-6)) && 202 | as.logical(corrected_exp_avg > 0) 203 | ) { 204 | group[['phase']] <- 'SGD' 205 | group[['lr']] <- corrected_exp_avg$item() 206 | } 207 | } 208 | } 209 | private$step_helper(closure, 
loop_fun) 210 | } 211 | ) 212 | -------------------------------------------------------------------------------- /R/torchopt-package.R: -------------------------------------------------------------------------------- 1 | #' @keywords internal 2 | "_PACKAGE" 3 | 4 | ## usethis namespace: start 5 | #' @importFrom graphics contour 6 | #' @importFrom graphics image 7 | #' @importFrom graphics lines 8 | #' @importFrom graphics points 9 | #' @importFrom grDevices hcl.colors 10 | #' @importFrom stats runif 11 | ## usethis namespace: end 12 | NULL 13 | 14 | # Include the following global variables 15 | utils::globalVariables(c("self", "super", "ctx", "private")) 16 | 17 | -------------------------------------------------------------------------------- /R/utils-state.R: -------------------------------------------------------------------------------- 1 | #' @title Imported function 2 | #' @author Daniel Falbel, \email{dfalbel@@gmail.com} 3 | #' @keywords internal 4 | #' @description Code lifted from a internal function of madgrad package. 5 | #' Get 'state' attribute of an object. 6 | state <- function(self) { 7 | attr(self, "state") 8 | } 9 | 10 | #' @title Imported function 11 | #' @author Daniel Falbel, \email{dfalbel@@gmail.com} 12 | #' @keywords internal 13 | #' @description Code lifted from a internal function of madgrad package. 14 | #' Set 'state' attribute of an object. 15 | `state<-` <- function(self, value) { 16 | attr(self, "state") <- value 17 | self 18 | } 19 | -------------------------------------------------------------------------------- /R/utils-testopt.R: -------------------------------------------------------------------------------- 1 | ackley <- function(x,y) { 2 | -20 * exp(-0.2*sqrt(0.5*(x^2 + y^2))) - exp(0.5*(cos(2*pi*x) + cos(2*pi*y))) + exp(1) + 20 3 | } 4 | domain_ackley <- function(){ 5 | x0 <- runif(1,-5, 5) 6 | y0 <- runif(1,-5, 5) 7 | return(c(x0 = x0, y0 = y0, xmax = 5, xmin = -5, ymax = 5, ymin = -5)) 8 | } 9 | 10 | beale <- function(x, y) { 11 | log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 12 | } 13 | domain_beale <- function(){ 14 | x0 <- runif(1,-4.5, 4.5) 15 | y0 <- runif(1,-4.5, 4.5) 16 | return(c(x0 = x0, y0 = y0, xmax = 4.5, xmin = -4.5, ymax = 4.5, ymin = -4.5)) 17 | } 18 | 19 | booth <- function(x, y) { 20 | log((x + 2 * y - 7)^2 + (2 * x + y - 5)^2) 21 | } 22 | domain_booth <- function(){ 23 | x0 <- runif(1,-10, 10) 24 | y0 <- runif(1,-10, 10) 25 | return(c(x0 = x0, y0 = y0, xmax = 10, xmin = -10, ymax = 10, ymin = -10)) 26 | } 27 | 28 | bukin_n6 <- function(x, y) { 29 | 100 * sqrt(abs(y - 0.01 * x^2)) + 0.01 * abs(x + 10) 30 | } 31 | domain_bukin_n6 <- function(){ 32 | x0 <- runif(1,-15, -5) 33 | y0 <- runif(1,-3, 3) 34 | return(c(x0 = x0, y0 = y0, xmax = -5, xmin = -15, ymax = -3, ymin = 3)) 35 | } 36 | 37 | easom <- function(x, y) { 38 | -cos(x) * cos(y) * exp(-(x - pi)^2 - (y - pi)^2) 39 | } 40 | domain_easom <- function(){ 41 | x0 <- runif(1,-1, 7) 42 | y0 <- runif(1,-1, 7) 43 | return(c(x0 = x0, y0 = y0, xmax = 7, xmin = -1, ymax = 7, ymin = -1)) 44 | } 45 | goldstein_price <- function(x, y) { 46 | log((1 + (x + y + 1)^2 * 47 | (19 - 14 * x + 3 * x^2 - 14 * y + 6 * x * y + 3 * y^2)) * 48 | (30 + (2 * x - 3 * y)^2 * (18 - 32 * x + 12 * x^2 + 48 * 49 | y - 36 * x * y + 27 * y^2))) 50 | } 51 | domain_goldstein_price <- function(){ 52 | x0 <- runif(1,-2, 2) 53 | y0 <- runif(1,-3, 1) 54 | return(c(x0 = x0, y0 = y0, xmax = 2, xmin = -2, ymax = -3, ymin = 1)) 55 | } 56 | himmelblau <- function(x, y) { 57 | log((x^2 + y - 
11)^2 + (x + y^2 - 7)^2) 58 | } 59 | domain_himmelblau <- function(){ 60 | x0 <- runif(1,-5, 5) 61 | y0 <- runif(1,-5, 5) 62 | return(c(x0 = x0, y0 = y0, xmax = 5, xmin = -5, ymax = 5, ymin = -5)) 63 | } 64 | levi_n13 <- function(x, y) { 65 | sin(3 * pi * x)^2 + (x - 1)^2 * (1 + sin(3 * pi * y)^2) + 66 | (y - 1)^2 * (1 + sin(2 * pi * y)^2) 67 | } 68 | domain_levi_n13 <- function(){ 69 | x0 <- runif(1,-5, 7) 70 | y0 <- runif(1,-5, 7) 71 | return(c(x0 = x0, y0 = y0, xmax = 7, xmin = -5, ymax = 7, ymin = -5)) 72 | } 73 | matyas <- function(x, y) { 74 | log(0.26 * (x^2 + y^2) - 0.48 * x * y) 75 | } 76 | domain_matyas <- function(){ 77 | x0 <- runif(1,-10, 10) 78 | y0 <- runif(1,-10, 10) 79 | return(c(x0 = x0, y0 = y0, xmax = 10, xmin = -10, ymax = 10, ymin = -10)) 80 | } 81 | rastrigin <- function(x, y) { 82 | 20 + (x^2 - 10 * cos(2 * pi * x)) + (y^2 - 10 * cos(2 * pi * y)) 83 | } 84 | domain_rastrigin <- function(){ 85 | x0 <- runif(1,-5.12, 5.12) 86 | y0 <- runif(1,-5.12, 5.12) 87 | return(c(x0 = x0, y0 = y0, xmax = 5.12, xmin = -5.12, ymax = 5.12, ymin = -5.12)) 88 | } 89 | rosenbrock <- function(x, y) { 90 | log(100 * (y - x^2)^2 + (1 - x)^2) 91 | } 92 | domain_rosenbrock <- function(){ 93 | x0 <- -2 94 | y0 <- 2 95 | return(c(x0 = x0, y0 = y0, xmax = 2, xmin = -2, ymax = 3, ymin = -1)) 96 | } 97 | sphere <- function(x, y) { 98 | x^2 + y^2 99 | } 100 | domain_sphere <- function(){ 101 | x0 <- runif(1,-2, 2) 102 | y0 <- runif(1,-2, 2) 103 | return(c(x0 = x0, y0 = y0, xmax = 2, xmin = -2, ymax = 2, ymin = -2)) 104 | } 105 | #' @title Test optimization function 106 | #' 107 | #' @name test_optim 108 | #' 109 | #' @author Rolf Simoes, \email{rolf.simoes@@inpe.br} 110 | #' 111 | #' @description 112 | #' `test_optim()` function is useful to visualize how optimizers solve the 113 | #' minimization problem by showing the convergence path using a test function. 114 | #' User can choose any test optimization 115 | #' [functions](https://en.wikipedia.org/wiki/Test_functions_for_optimization) 116 | #' provided by `torchopt`: 117 | #' 118 | #' `"beale"`, `"booth"`, `"bukin_n6"`, `"easom"`, `"goldstein_price"`, 119 | #' `"himmelblau"`, `"levi_n13"`, `"matyas"`, `"rastrigin"`, 120 | #' `"rosenbrock"`, and `"sphere"`. 121 | #' 122 | #' Besides these functions, users can pass any function that receives two 123 | #' numerical values and returns a scalar. 124 | #' 125 | #' Optimization functions are useful to evaluate characteristics of optimization 126 | #' algorithms, such as convergence rate, precision, robustness, and performance. 127 | #' These functions give an idea about the different situations that optimization 128 | #' algorithms can face. 129 | #' 130 | #' Function `test_function()` plot the 2D-space of a test optimization function. 131 | #' 132 | #' @param optim Torch optimizer function. 133 | #' @param ... Additional parameters (passed to `image` function). 134 | #' @param opt_hparams A list with optimizer initialization parameters (default: `list()`). 135 | #' If missing, for each optimizer its individual defaults will be used. 136 | #' @param test_fn A test function (default `"beale"`). You can also pass 137 | #' a list with 2 elements. The first should be a function that will be optimized 138 | #' and the second is a function that returns a named vector with `x0`, `y0` 139 | #' (the starting points) and `xmax`, `xmin`, `ymax` and `ymin` (the domain). 
140 | #' An example: `c(x0 = x0, y0 = y0, xmax = 5, xmin = -5, ymax = 5, ymin = -5)` 141 | #' @param steps Number of steps to run (default `200`). 142 | #' @param pt_start_color Starting point color (default `"#5050FF7F"`) 143 | #' @param pt_end_color Ending point color (default `"#FF5050FF"`) 144 | #' @param ln_color Line path color (default `"#FF0000FF"`) 145 | #' @param ln_weight Line path weight (default `2`) 146 | #' @param bg_xy_breaks Background X and Y resolution (default `100`) 147 | #' @param bg_z_breaks Background Z resolution (default `32`) 148 | #' @param bg_palette Background palette (default `"viridis"`) 149 | #' @param ct_levels Contour levels (default `10`) 150 | #' @param ct_labels Should show contour labels? (default `FALSE`) 151 | #' @param ct_color Contour color (default `"#FFFFFF7F"`) 152 | #' @param plot_each_step Should output each step? (default `FALSE`) 153 | #' 154 | #' @return No return value, called for producing animated gifs 155 | #' 156 | #' @export 157 | test_optim <- function(optim, ..., 158 | opt_hparams = list(), 159 | test_fn = "beale", 160 | steps = 200, 161 | pt_start_color = "#5050FF7F", 162 | pt_end_color = "#FF5050FF", 163 | ln_color = "#FF0000FF", 164 | ln_weight = 2, 165 | bg_xy_breaks = 100, 166 | bg_z_breaks = 32, 167 | bg_palette = "viridis", 168 | ct_levels = 10, 169 | ct_labels = FALSE, 170 | ct_color = "#FFFFFF7F", 171 | plot_each_step = FALSE) { 172 | 173 | # pre-conditions 174 | inherits_from <- if (utils::packageVersion("torch") > '0.7.2') "torch_optimizer_generator" else "function" 175 | if (!inherits(optim, inherits_from)) { 176 | 177 | stop("invalid 'optim' param.", call. = FALSE) 178 | } 179 | if (is.character(test_fn)) { 180 | if (!exists(test_fn, 181 | envir = asNamespace("torchopt"), 182 | inherits = FALSE)) { 183 | stop("invalid 'test_fn' param.", call. = FALSE) 184 | } 185 | # get starting points 186 | domain_fn <- get(paste0("domain_",test_fn), 187 | envir = asNamespace("torchopt"), 188 | inherits = FALSE) 189 | # get gradient function 190 | test_fn <- get(test_fn, 191 | envir = asNamespace("torchopt"), 192 | inherits = FALSE) 193 | } else if (is.list(test_fn)) { 194 | domain_fn <- test_fn[[2]] 195 | test_fn <- test_fn[[1]] 196 | } 197 | 198 | if (!is.function(test_fn)) { 199 | stop("invalid 'test_fn' param.", call. = FALSE) 200 | } 201 | if (!is.function(domain_fn)) { 202 | stop("missing domain param for function.", call. 
= FALSE) 203 | } 204 | # starting point 205 | dom <- domain_fn() 206 | x0 <- dom[["x0"]] 207 | y0 <- dom[["y0"]] 208 | # create tensor 209 | x <- torch::torch_tensor(x0, requires_grad = TRUE) 210 | y <- torch::torch_tensor(y0, requires_grad = TRUE) 211 | 212 | # instantiate optimizer 213 | optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 214 | grad_keep <- FALSE 215 | if (inherits(optim, "optim_adahessian")) { 216 | grad_keep <- TRUE 217 | # retain_graph is not exposed before torch 0.7.2 218 | if (!utils::packageVersion("torch") > '0.7.2') { 219 | stop("adahessian needs torch version > 0.7.2, got ", 220 | utils::packageVersion("torch")) 221 | } 222 | } 223 | # run optimizer 224 | x_steps <- numeric(steps) 225 | y_steps <- numeric(steps) 226 | for (i in seq_len(steps)) { 227 | x_steps[i] <- as.numeric(x) 228 | y_steps[i] <- as.numeric(y) 229 | optim$zero_grad() 230 | z <- test_fn(x, y) 231 | # retain_graph is not exposed before torch 0.7.2 232 | if (utils::packageVersion("torch") > '0.7.2') { 233 | z$backward(create_graph = grad_keep, retain_graph = grad_keep) 234 | } else { 235 | z$backward(create_graph = grad_keep) 236 | } 237 | optim$step() 238 | } 239 | 240 | # prepare plot 241 | # get xy limits 242 | 243 | xmax <- dom[["xmax"]] 244 | xmin <- dom[["xmin"]] 245 | ymax <- dom[["ymax"]] 246 | ymin <- dom[["ymin"]] 247 | 248 | # prepare data for gradient plot 249 | x <- seq(xmin, xmax, length.out = bg_xy_breaks) 250 | y <- seq(ymin, ymax, length.out = bg_xy_breaks) 251 | z <- outer(X = x, Y = y, FUN = function(x, y) as.numeric(test_fn(x, y))) 252 | 253 | plot_from_step <- steps 254 | if (plot_each_step) { 255 | plot_from_step <- 1 256 | } 257 | 258 | for (step in seq(plot_from_step, steps, 1)) { 259 | 260 | # plot background 261 | image( 262 | x = x, 263 | y = y, 264 | z = z, 265 | col = hcl.colors( 266 | n = bg_z_breaks, 267 | palette = bg_palette 268 | ), 269 | ... 270 | ) 271 | 272 | # plot contour 273 | if (ct_levels > 0) { 274 | contour( 275 | x = x, 276 | y = y, 277 | z = z, 278 | nlevels = ct_levels, 279 | drawlabels = ct_labels, 280 | col = ct_color, 281 | add = TRUE 282 | ) 283 | } 284 | 285 | # plot starting point 286 | points( 287 | x_steps[1], 288 | y_steps[1], 289 | pch = 21, 290 | bg = pt_start_color 291 | ) 292 | 293 | # plot path line 294 | lines( 295 | x_steps[seq_len(step)], 296 | y_steps[seq_len(step)], 297 | lwd = ln_weight, 298 | col = ln_color 299 | ) 300 | 301 | # plot end point 302 | points( 303 | x_steps[step], 304 | y_steps[step], 305 | pch = 21, 306 | bg = pt_end_color 307 | ) 308 | } 309 | } 310 | 311 | 312 | -------------------------------------------------------------------------------- /R/yogi.R: -------------------------------------------------------------------------------- 1 | #' @title Yogi optimizer 2 | #' 3 | #' @name optim_yogi 4 | #' 5 | #' @author Gilberto Camara, \email{gilberto.camara@@inpe.br} 6 | #' @author Rolf Simoes, \email{rolf.simoes@@inpe.br} 7 | #' @author Felipe Souza, \email{lipecaso@@gmail.com} 8 | #' @author Alber Sanchez, \email{alber.ipia@@inpe.br} 9 | #' 10 | #' @description 11 | #' R implementation of the Yogi optimizer proposed 12 | #' by Zaheer et al.(2019). We used the implementation available at 13 | #' https://github.com/jettify/pytorch-optimizer/blob/master/torch_optimizer/yogi.py. 14 | #' Thanks to Nikolay Novik for providing the pytorch code. 15 | #' 16 | #' The original implementation is licensed using the Apache-2.0 software license. 17 | #' This implementation is also licensed using Apache-2.0 license. 
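#'
#' The key difference from Adam in this implementation is the additive,
#' sign-based update of the second-moment estimate,
#' `v_t = v_{t-1} - (1 - beta2) * sign(v_{t-1} - g_t^2) * g_t^2`,
#' which limits how quickly the effective learning rate can change.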
18 | #' 19 | #' From the abstract by the paper by Zaheer et al.(2019): 20 | #' Adaptive gradient methods that rely on scaling gradients 21 | #' down by the square root of exponential moving averages 22 | #' of past squared gradients, such RMSProp, Adam, Adadelta have 23 | #' found wide application in optimizing the nonconvex problems 24 | #' that arise in deep learning. However, it has been recently 25 | #' demonstrated that such methods can fail to converge even 26 | #' in simple convex optimization settings. 27 | #' Yogi is a new adaptive optimization algorithm, 28 | #' which controls the increase in effective learning rate, 29 | #' leading to even better performance with similar theoretical 30 | #' guarantees on convergence. Extensive experiments show that 31 | #' Yogi with very little hyperparameter tuning outperforms 32 | #' methods such as Adam in several challenging machine learning tasks. 33 | #' 34 | #' 35 | #' @references 36 | #' Manzil Zaheer, Sashank Reddi, Devendra Sachan, Satyen Kale, Sanjiv Kumar, 37 | #' "Adaptive Methods for Nonconvex Optimization", 38 | #' Advances in Neural Information Processing Systems 31 (NeurIPS 2018). 39 | #' https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization 40 | #' 41 | #' @param params List of parameters to optimize. 42 | #' @param lr Learning rate (default: 1e-3) 43 | #' @param betas Coefficients computing running averages of gradient 44 | #' and its square (default: (0.9, 0.999)) 45 | #' @param eps Term added to the denominator to improve numerical 46 | #' stability (default: 1e-8) 47 | #' @param initial_accumulator Initial values for first and 48 | #' second moments. 49 | #' @param weight_decay Weight decay (L2 penalty) (default: 0) 50 | #' 51 | #' @returns 52 | #' A torch optimizer object implementing the `step` method. 53 | #' 54 | #' @examples 55 | #' if (torch::torch_is_installed()) { 56 | 57 | #' # function to demonstrate optimization 58 | #' beale <- function(x, y) { 59 | #' log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 60 | #' } 61 | #' # define optimizer 62 | #' optim <- torchopt::optim_yogi 63 | #' # define hyperparams 64 | #' opt_hparams <- list(lr = 0.01) 65 | #' 66 | #' # starting point 67 | #' x0 <- 3 68 | #' y0 <- 3 69 | #' # create tensor 70 | #' x <- torch::torch_tensor(x0, requires_grad = TRUE) 71 | #' y <- torch::torch_tensor(y0, requires_grad = TRUE) 72 | #' # instantiate optimizer 73 | #' optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 74 | #' # run optimizer 75 | #' steps <- 400 76 | #' x_steps <- numeric(steps) 77 | #' y_steps <- numeric(steps) 78 | #' for (i in seq_len(steps)) { 79 | #' x_steps[i] <- as.numeric(x) 80 | #' y_steps[i] <- as.numeric(y) 81 | #' optim$zero_grad() 82 | #' z <- beale(x, y) 83 | #' z$backward() 84 | #' optim$step() 85 | #' } 86 | #' print(paste0("starting value = ", beale(x0, y0))) 87 | #' print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 88 | #' } 89 | #' @export 90 | optim_yogi <- torch::optimizer( 91 | "optim_yogi", 92 | initialize = function(params, 93 | lr = 0.01, 94 | betas = c(0.9, 0.999), 95 | eps = 0.001, 96 | initial_accumulator = 1e-6, 97 | weight_decay = 0) { 98 | if (lr <= 0.0) 99 | stop("Learning rate must be positive.", call. = FALSE) 100 | if (eps < 0.0) 101 | stop("eps must be non-negative.", call. = FALSE) 102 | if (betas[1] > 1.0 | betas[1] <= 0.0) 103 | stop("Invalid beta parameter.", call. 
= FALSE) 104 | if (betas[2] > 1.0 | betas[1] <= 0.0) 105 | stop("Invalid beta parameter.", call. = FALSE) 106 | if (weight_decay < 0) 107 | stop("Invalid weight_decay value.", call. = FALSE) 108 | 109 | defaults = list( 110 | lr = lr, 111 | betas = betas, 112 | eps = eps, 113 | weight_decay = weight_decay, 114 | initial_accumulator = initial_accumulator 115 | ) 116 | super$initialize(params, defaults) 117 | }, 118 | step = function(closure = NULL) { 119 | loop_fun <- function(group, param, g, p) { 120 | if (is.null(param$grad)) 121 | next 122 | grad <- param$grad 123 | 124 | # get value of initial accumulator 125 | init_acc <- group[["initial_accumulator"]] 126 | 127 | # State initialization 128 | if (length(state(param)) == 0) { 129 | state(param) <- list() 130 | state(param)[["step"]] <- 0 131 | # Exponential moving average of gradient values 132 | state(param)[["exp_avg"]] <- torch::nn_init_constant_( 133 | torch::torch_empty_like( 134 | param, 135 | memory_format = torch::torch_preserve_format() 136 | ), 137 | init_acc 138 | ) 139 | # Exponential moving average of squared gradient values 140 | state(param)[["exp_avg_sq"]] <- torch::nn_init_constant_( 141 | torch::torch_empty_like( 142 | param, 143 | memory_format = torch::torch_preserve_format() 144 | ), 145 | init_acc 146 | ) 147 | } 148 | # Define variables for optimization function 149 | exp_avg <- state(param)[["exp_avg"]] 150 | exp_avg_sq <- state(param)[["exp_avg_sq"]] 151 | beta1 <- group[['betas']][[1]] 152 | beta2 <- group[['betas']][[2]] 153 | weight_decay <- group[['weight_decay']] 154 | eps <- group[["eps"]] 155 | lr <- group[['lr']] 156 | 157 | # take one step 158 | state(param)[["step"]] <- state(param)[["step"]] + 1 159 | # bias correction 160 | bias_correction1 <- 1 - beta1^state(param)[['step']] 161 | bias_correction2 <- 1 - beta2^state(param)[['step']] 162 | 163 | # L2 correction 164 | if (weight_decay != 0) 165 | grad <- grad$add(p, alpha = weight_decay) 166 | 167 | # Decay the first moment 168 | exp_avg$mul_(beta1)$add_(grad, alpha = 1 - beta1) 169 | # Decay the second moment 170 | grad_squared <- grad$mul(grad) 171 | exp_avg_sq$addcmul_( 172 | torch::torch_sign(exp_avg_sq - grad_squared), 173 | grad_squared, 174 | value = -(1 - beta2) 175 | ) 176 | 177 | # calculate denominator 178 | denom = (exp_avg_sq$sqrt() / sqrt(bias_correction2))$add_(eps) 179 | 180 | # calculate step size 181 | step_size <- lr / bias_correction1 182 | # go to next step 183 | param$addcdiv_(exp_avg, denom, value = -step_size) 184 | } 185 | 186 | private$step_helper(closure, loop_fun) 187 | } 188 | ) 189 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | editor_options: 4 | chunk_output_type: console 5 | markdown: 6 | wrap: 72 7 | --- 8 | 9 | 10 | 11 | ```{r, include = FALSE} 12 | knitr::opts_chunk$set( 13 | collapse = TRUE, 14 | comment = "#>", 15 | fig.path = "man/figures/README-", 16 | out.width = "100%" 17 | ) 18 | ``` 19 | 20 | # torchopt 21 | 22 | 23 | 24 | [![R-CMD-check](https://github.com/e-sensing/torchopt/workflows/R-CMD-check/badge.svg)](https://github.com/e-sensing/torchopt/actions) 25 | [![CRAN 26 | status](https://www.r-pkg.org/badges/version/torchopt)](https://cran.r-project.org/package=torchopt) 27 | [![Software Life 28 | Cycle](https://img.shields.io/badge/lifecycle-experimental-yellow.svg)](https://lifecycle.r-lib.org/articles/stages.html) 29 | [![Software 30 | 
License](https://img.shields.io/badge/license-Apache%202-2--green)](https://www.apache.org/licenses/LICENSE-2.0) 31 | 32 | 33 | 34 | The `torchopt` package provides R implementations of deep learning optimizers proposed in the literature. It is intended to support the use of the torch package in R. 35 | 36 | ## Installation 37 | 38 | Installing the CRAN (stable) version of `torchopt`: 39 | 40 | ```{r, eval = FALSE} 41 | install.packages("torchopt") 42 | ``` 43 | 44 | To install the development version of `torchopt`, do: 45 | 46 | ```{r, eval = FALSE} 47 | library(devtools) 48 | install_github("e-sensing/torchopt") 49 | ``` 50 | 51 | ```{r, echo = FALSE} 52 | library(torch) 53 | if (!torch::torch_is_installed()) 54 | torch::install_torch() 55 | library(torchopt) 56 | ``` 57 | 58 | ## Provided optimizers 59 | 60 | The `torchopt` package provides the following R implementations of torch 61 | optimizers: 62 | 63 | - `optim_adamw()`: AdamW optimizer proposed by Loshchilov & Hutter 64 | (2019). Converted from the `pytorch` code developed by Collin 65 | Donahue-Oponski available at 66 | 67 | 68 | - `optim_adabelief()`: Adabelief optimizer proposed by Zhuang et al 69 | (2020). Converted from the authors' PyTorch code: 70 | . 71 | 72 | - `optim_adabound()`: Adabound optimizer proposed by Luo et al.(2019). 73 | Converted from the authors' PyTorch code: 74 | . 75 | 76 | - `optim_adahessian()`: Adahessian optimizer proposed by Yao et al.(2021). 77 | Converted from the authors' PyTorch code: 78 | . 79 | 80 | - `optim_madgrad()`: Momentumized, Adaptive, Dual Averaged Gradient 81 | Method for Stochastic Optimization (MADGRAD) optimizer proposed by 82 | Defazio & Jelassi (2021). The function is imported from 83 | [madgrad](https://CRAN.R-project.org/package=madgrad) package and 84 | the source code is available at 85 | 86 | - `optim_nadam()`: Incorporation of Nesterov Momentum into Adam 87 | proposed by Dozat (2016). Converted from the PyTorch site 88 | . 89 | 90 | - `optim_qhadam()`: Quasi-hyperbolic version of Adam proposed by Ma 91 | and Yarats(2019). Converted from the code developed by Meta AI: 92 | . 93 | 94 | - `optim_radam()`: Rectified version of Adam proposed by Liu et al. 95 | (2019). Converted from the PyTorch code 96 | . 97 | 98 | - `optim_swats()`: Optimizer that switches from Adam to SGD proposed by 99 | Keskar and Socher(2018). 100 | Converted from the `pytorch` code developed by Patrik Purgai: 101 | 102 | 103 | - `optim_yogi()`: Yogi optimizer proposed by Zaheer et al.(2019). 104 | Converted from the `pytorch` code developed by Nikolay Novik: 105 | 106 | 107 | ## Optimization test functions 108 | 109 | You can also test optimizers using optimization [test 110 | functions](https://en.wikipedia.org/wiki/Test_functions_for_optimization) 111 | provided by `torchopt`, including `"ackley"`, `"beale"`, `"booth"`, 112 | `"bukin_n6"`, `"easom"`, `"goldstein_price"`, `"himmelblau"`, 113 | `"levi_n13"`, `"matyas"`, `"rastrigin"`, `"rosenbrock"`, `"sphere"`. 114 | Optimization functions are useful to evaluate characteristics of 115 | optimization algorithms, such as convergence rate, precision, 116 | robustness, and performance. These functions give an idea about the 117 | different situations that optimization algorithms can face. 118 | 119 | In what follows, we perform tests using the `"beale"` test function. To 120 | visualize an animated GIF, we set `plot_each_step=TRUE` and capture each 121 | step frame using [gifski](https://CRAN.R-project.org/package=gifski) 122 | package. 
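Besides the built-in test functions, `test_optim()` also accepts a custom
function passed as a list: the first element is the objective and the second
is a function returning the starting point and the domain. The chunk below is
a minimal sketch (not evaluated here); `quad_fn` and `quad_domain` are
illustrative names and are not part of the package.

```{r test_custom, eval = FALSE}
# a simple convex objective (illustrative only)
quad_fn <- function(x, y) {
  x^2 + 0.5 * y^2
}
# starting point and plotting domain for the custom function
quad_domain <- function() {
  c(x0 = 2, y0 = -2, xmax = 3, xmin = -3, ymax = 3, ymin = -3)
}
set.seed(1)
test_optim(
  optim = optim_adamw,
  opt_hparams = list(lr = 0.1),
  steps = 200,
  test_fn = list(quad_fn, quad_domain)
)
```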
123 | 124 | ### `optim_adamw()`: 125 | 126 | ```{r test_adamw, echo=TRUE, fig.show='animate', fig.height=8, fig.width=8, animation.hook='gifski', aniopts='loop', dpi=96, interval=0.1, out.height='50%', out.width='50%', cache=TRUE} 127 | 128 | # test optim adamw 129 | set.seed(12345) 130 | torchopt::test_optim( 131 | optim = torchopt::optim_adamw, 132 | test_fn = "beale", 133 | opt_hparams = list(lr = 0.1), 134 | steps = 500, 135 | plot_each_step = TRUE 136 | ) 137 | 138 | ``` 139 | 140 | ### `optim_adabelief()`: 141 | 142 | ```{r test_adabelief, echo=TRUE, fig.show='animate', fig.height=8, fig.width=8, animation.hook='gifski', aniopts='loop', dpi=96, interval=0.1, out.height='50%', out.width='50%', cache=TRUE} 143 | 144 | set.seed(42) 145 | test_optim( 146 | optim = optim_adabelief, 147 | opt_hparams = list(lr = 0.5), 148 | steps = 400, 149 | test_fn = "beale", 150 | plot_each_step = TRUE 151 | ) 152 | ``` 153 | 154 | ### `optim_adabound()`: 155 | 156 | ```{r test_adabound, echo=TRUE, fig.show='animate', fig.height=8, fig.width=8, animation.hook='gifski', aniopts='loop', dpi=96, interval=0.1, out.height='50%', out.width='50%', cache=TRUE} 157 | 158 | # set manual seed 159 | set.seed(22) 160 | test_optim( 161 | optim = optim_adabound, 162 | opt_hparams = list(lr = 0.5), 163 | steps = 400, 164 | test_fn = "beale", 165 | plot_each_step = TRUE 166 | ) 167 | 168 | ``` 169 | 170 | ### `optim_adahessian()`: 171 | 172 | ```{r test_adahessian, echo=TRUE, fig.show='animate', fig.height=8, fig.width=8, animation.hook='gifski', aniopts='loop', dpi=96, interval=0.1, out.height='50%', out.width='50%', cache=TRUE} 173 | 174 | # set manual seed 175 | set.seed(290356) 176 | test_optim( 177 | optim = optim_adahessian, 178 | opt_hparams = list(lr = 0.2), 179 | steps = 500, 180 | test_fn = "beale", 181 | plot_each_step = TRUE 182 | ) 183 | 184 | ``` 185 | 186 | ### `optim_madgrad()`: 187 | 188 | ```{r test_madgrad, echo=TRUE, fig.show='animate', fig.height=8, fig.width=8, animation.hook='gifski', aniopts='loop', dpi=96, interval=0.1, out.height='50%', out.width='50%', cache=TRUE} 189 | 190 | set.seed(256) 191 | test_optim( 192 | optim = optim_madgrad, 193 | opt_hparams = list(lr = 0.05), 194 | steps = 400, 195 | test_fn = "beale", 196 | plot_each_step = TRUE 197 | ) 198 | 199 | ``` 200 | 201 | ### `optim_nadam()`: 202 | 203 | ```{r test_nadam, echo=TRUE, fig.show='animate', fig.height=8, fig.width=8, animation.hook='gifski', aniopts='loop', dpi=96, interval=0.1, out.height='50%', out.width='50%', cache=TRUE} 204 | 205 | set.seed(2903) 206 | test_optim( 207 | optim = optim_nadam, 208 | opt_hparams = list(lr = 0.5, weight_decay = 0), 209 | steps = 500, 210 | test_fn = "beale", 211 | plot_each_step = TRUE 212 | ) 213 | 214 | ``` 215 | 216 | ### `optim_qhadam()`: 217 | 218 | ```{r test_qhadam, echo=TRUE, fig.show='animate', fig.height=8, fig.width=8, animation.hook='gifski', aniopts='loop', dpi=96, interval=0.1, out.height='50%', out.width='50%', cache=TRUE} 219 | 220 | set.seed(1024) 221 | test_optim( 222 | optim = optim_qhadam, 223 | opt_hparams = list(lr = 0.1), 224 | steps = 500, 225 | test_fn = "beale", 226 | plot_each_step = TRUE 227 | ) 228 | 229 | ``` 230 | 231 | 232 | ### `optim_radam()`: 233 | 234 | ```{r test_radam, echo=TRUE, fig.show='animate', fig.height=8, fig.width=8, animation.hook='gifski', aniopts='loop', dpi=96, interval=0.1, out.height='50%', out.width='50%', cache=TRUE} 235 | 236 | set.seed(1024) 237 | test_optim( 238 | optim = optim_radam, 239 | opt_hparams = list(lr = 1.0), 240 | steps = 
500, 241 | test_fn = "beale", 242 | plot_each_step = TRUE 243 | ) 244 | 245 | ``` 246 | 247 | 248 | ### `optim_swats()`: 249 | 250 | ```{r test_swats, echo=TRUE, fig.show='animate', fig.height=8, fig.width=8, animation.hook='gifski', aniopts='loop', dpi=96, interval=0.1, out.height='50%', out.width='50%', cache=TRUE} 251 | 252 | set.seed(234) 253 | test_optim( 254 | optim = optim_swats, 255 | opt_hparams = list(lr = 0.5), 256 | steps = 500, 257 | test_fn = "beale", 258 | plot_each_step = TRUE 259 | ) 260 | 261 | ``` 262 | 263 | ### `optim_yogi()`: 264 | 265 | ```{r test_yogi, echo=TRUE, fig.show='animate', fig.height=8, fig.width=8, animation.hook='gifski', aniopts='loop', dpi=96, interval=0.1, out.height='50%', out.width='50%', cache=TRUE} 266 | 267 | # set manual seed 268 | set.seed(66) 269 | test_optim( 270 | optim = optim_yogi, 271 | opt_hparams = list(lr = 0.1), 272 | steps = 500, 273 | test_fn = "beale", 274 | plot_each_step = TRUE 275 | ) 276 | 277 | ``` 278 | 279 | ## Acknowledgements 280 | 281 | We are thankful to Collin Donahue-Oponski , 282 | Amir Gholami , 283 | Liangchen Luo , Liyuan Liu 284 | , Nikolay Novik , Patrik Purgai Juntang Zhuang and the PyTorch team for providing pytorch code for the optimizers implemented in this package. We also thank Daniel Falbel for providing support 285 | for the R version of PyTorch. 286 | 287 | ## Code of Conduct 288 | 289 | The torchopt project is released with a [Contributor 290 | Code of Conduct](https://contributor-covenant.org/version/2/0/CODE_OF_CONDUCT.html). 291 | By contributing to this project, you agree to abide by its terms. 292 | 293 | ## References 294 | 295 | - ADABELIEF: Juntang Zhuang, Tommy Tang, Yifan Ding, Sekhar Tatikonda, Nicha 296 | Dvornek, Xenophon Papademetris, James S. Duncan. "Adabelief 297 | Optimizer: Adapting Stepsizes by the Belief in Observed Gradients", 298 | 34th Conference on Neural Information Processing Systems (NeurIPS 299 | 2020), . 300 | 301 | - ADABOUND: Liangchen Luo, Yuanhao Xiong, Yan Liu, Xu Sun, "Adaptive Gradient 302 | Methods with Dynamic Bound of Learning Rate", International 303 | Conference on Learning Representations (ICLR), 2019. 304 | . 305 | 306 | - ADAHESSIAN: Zhewei Yao, Amir Gholami, Sheng Shen, Mustafa Mustafa, Kurt Keutzer, 307 | Michael W. Mahoney. "Adahessian: An Adaptive Second Order Optimizer 308 | for Machine Learning", AAAI Conference on Artificial Intelligence, 35(12), 309 | 10665-10673, 2021. . 310 | 311 | - ADAMW: Ilya Loshchilov, Frank Hutter, "Decoupled Weight Decay 312 | Regularization", International Conference on Learning 313 | Representations (ICLR) 2019. 314 | . 315 | 316 | - MADGRAD: Aaron Defazio, Samy Jelassi, "Adaptivity without Compromise: A 317 | Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic 318 | Optimization", arXiv preprint arXiv:2101.11075, 2021. 319 | 320 | 321 | - NADAM: Timothy Dazat, "Incorporating Nesterov Momentum into Adam", 322 | International Conference on Learning Representations (ICLR), 2019. 323 | 324 | 325 | - QHADAM: Jerry Ma, Denis Yarats, "Quasi-hyperbolic momentum and Adam 326 | for deep learning". 327 | 328 | - RADAM: Liyuan Liu, Haoming Jiang, Pengcheng He, Weizhu Chen, Xiaodong Liu, 329 | Jianfeng Gao, Jiawei Han, "On the Variance of the Adaptive Learning 330 | Rate and Beyond", International Conference on Learning 331 | Representations (ICLR) 2020. . 332 | 333 | - SWATS: Nitish Keskar, Richard Socher, "Improving Generalization Performance 334 | by Switching from Adam to SGD". 
335 | International Conference on Learning Representations (ICLR), 2018. 336 | . 337 | 338 | - YOGI: Manzil Zaheer, Sashank Reddi, Devendra Sachan, Satyen Kale, Sanjiv 339 | Kumar, "Adaptive Methods for Nonconvex Optimization", Advances in 340 | Neural Information Processing Systems 31 (NeurIPS 2018). 341 | 342 | 343 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # torchopt 5 | 6 | 7 | 8 | [![R-CMD-check](https://github.com/e-sensing/torchopt/workflows/R-CMD-check/badge.svg)](https://github.com/e-sensing/torchopt/actions) 9 | [![CRAN 10 | status](https://www.r-pkg.org/badges/version/torchopt)](https://cran.r-project.org/package=torchopt) 11 | [![Software Life 12 | Cycle](https://img.shields.io/badge/lifecycle-experimental-yellow.svg)](https://lifecycle.r-lib.org/articles/stages.html) 13 | [![Software 14 | License](https://img.shields.io/badge/license-Apache%202-2--green)](https://www.apache.org/licenses/LICENSE-2.0) 15 | 16 | 17 | 18 | The `torchopt` package provides R implementation of deep learning 19 | optimizers proposed in the literature. It is intended to support the use 20 | of the torch package in R. 21 | 22 | ## Installation 23 | 24 | Installing the CRAN (stable) version of `torchopt`: 25 | 26 | ``` r 27 | install.packages("torchopt") 28 | ``` 29 | 30 | Installing the development version of `torchopt` do as : 31 | 32 | ``` r 33 | library(devtools) 34 | install_github("e-sensing/torchopt") 35 | ``` 36 | 37 | #> Warning: package 'torch' was built under R version 4.1.3 38 | 39 | ## Provided optimizers 40 | 41 | `torchopt` package provides the following R implementations of torch 42 | optimizers: 43 | 44 | - `optim_adamw()`: AdamW optimizer proposed by Loshchilov & Hutter 45 | (2019). Converted from the `pytorch` code developed by Collin 46 | Donahue-Oponski available at 47 | 48 | 49 | - `optim_adabelief()`: Adabelief optimizer proposed by Zhuang et al 50 | (2020). Converted from the authors’ PyTorch code: 51 | . 52 | 53 | - `optim_adabound()`: Adabound optimizer proposed by Luo et al.(2019). 54 | Converted from the authors’ PyTorch code: 55 | . 56 | 57 | - `optim_adahessian()`: Adahessian optimizer proposed by Yao et 58 | al.(2021). Converted from the authors’ PyTorch code: 59 | . 60 | 61 | - `optim_madgrad()`: Momentumized, Adaptive, Dual Averaged Gradient 62 | Method for Stochastic Optimization (MADGRAD) optimizer proposed by 63 | Defazio & Jelassi (2021). The function is imported from 64 | [madgrad](https://CRAN.R-project.org/package=madgrad) package and 65 | the source code is available at 66 | 67 | - `optim_nadam()`: Incorporation of Nesterov Momentum into Adam 68 | proposed by Dozat (2016). Converted from the PyTorch site 69 | . 70 | 71 | - `optim_qhadam()`: Quasi-hyperbolic version of Adam proposed by Ma 72 | and Yarats(2019). Converted from the code developed by Meta AI: 73 | . 74 | 75 | - `optim_radam()`: Rectified verison of Adam proposed by Liu et al. 76 | (2019). Converted from the PyTorch code 77 | . 78 | 79 | - `optim_swats()`: Optimizer that switches from Adam to SGD proposed 80 | by Keskar and Socher(2018). Converted from the `pytorch` code 81 | developed by Patrik Purgai: 82 | 83 | - `optim_yogi()`: Yogi optimizer proposed by Zaheer et al.(2019). 
84 | Converted from the `pytorch` code developed by Nikolay Novik: 85 | 86 | 87 | ## Optimization test functions 88 | 89 | You can also test optimizers using optimization [test 90 | functions](https://en.wikipedia.org/wiki/Test_functions_for_optimization) 91 | provided by `torchopt` including `"ackley"`, `"beale"`, `"booth"`, 92 | `"bukin_n6"`, `"easom"`, `"goldstein_price"`, `"himmelblau"`, 93 | `"levi_n13"`, `"matyas"`, `"rastrigin"`, `"rosenbrock"`, `"sphere"`. 94 | Optimization functions are useful to evaluate characteristics of 95 | optimization algorithms, such as convergence rate, precision, 96 | robustness, and performance. These functions give an idea about the 97 | different situations that optimization algorithms can face. 98 | 99 | In what follows, we perform tests using `"beale"` test function. To 100 | visualize an animated GIF, we set `plot_each_step=TRUE` and capture each 101 | step frame using [gifski](https://CRAN.R-project.org/package=gifski) 102 | package. 103 | 104 | ### `optim_adamw()`: 105 | 106 | ``` r 107 | # test optim adamw 108 | set.seed(12345) 109 | torchopt::test_optim( 110 | optim = torchopt::optim_adamw, 111 | test_fn = "beale", 112 | opt_hparams = list(lr = 0.1), 113 | steps = 500, 114 | plot_each_step = TRUE 115 | ) 116 | ``` 117 | 118 | 119 | 120 | ### `optim_adabelief()`: 121 | 122 | ``` r 123 | set.seed(42) 124 | test_optim( 125 | optim = optim_adabelief, 126 | opt_hparams = list(lr = 0.5), 127 | steps = 400, 128 | test_fn = "beale", 129 | plot_each_step = TRUE 130 | ) 131 | ``` 132 | 133 | 134 | 135 | ### `optim_adabound()`: 136 | 137 | ``` r 138 | # set manual seed 139 | set.seed(22) 140 | test_optim( 141 | optim = optim_adabound, 142 | opt_hparams = list(lr = 0.5), 143 | steps = 400, 144 | test_fn = "beale", 145 | plot_each_step = TRUE 146 | ) 147 | ``` 148 | 149 | 150 | 151 | ### `optim_adahessian()`: 152 | 153 | ``` r 154 | # set manual seed 155 | set.seed(290356) 156 | test_optim( 157 | optim = optim_adahessian, 158 | opt_hparams = list(lr = 0.2), 159 | steps = 500, 160 | test_fn = "beale", 161 | plot_each_step = TRUE 162 | ) 163 | ``` 164 | 165 | 166 | 167 | ### `optim_madgrad()`: 168 | 169 | ``` r 170 | set.seed(256) 171 | test_optim( 172 | optim = optim_madgrad, 173 | opt_hparams = list(lr = 0.05), 174 | steps = 400, 175 | test_fn = "beale", 176 | plot_each_step = TRUE 177 | ) 178 | ``` 179 | 180 | 181 | 182 | ### `optim_nadam()`: 183 | 184 | ``` r 185 | set.seed(2903) 186 | test_optim( 187 | optim = optim_nadam, 188 | opt_hparams = list(lr = 0.5, weight_decay = 0), 189 | steps = 500, 190 | test_fn = "beale", 191 | plot_each_step = TRUE 192 | ) 193 | ``` 194 | 195 | 196 | 197 | ### `optim_qhadam()`: 198 | 199 | ``` r 200 | set.seed(1024) 201 | test_optim( 202 | optim = optim_qhadam, 203 | opt_hparams = list(lr = 0.1), 204 | steps = 500, 205 | test_fn = "beale", 206 | plot_each_step = TRUE 207 | ) 208 | ``` 209 | 210 | 211 | 212 | ### `optim_radam()`: 213 | 214 | ``` r 215 | set.seed(1024) 216 | test_optim( 217 | optim = optim_radam, 218 | opt_hparams = list(lr = 1.0), 219 | steps = 500, 220 | test_fn = "beale", 221 | plot_each_step = TRUE 222 | ) 223 | ``` 224 | 225 | 226 | 227 | ### `optim_swats()`: 228 | 229 | ``` r 230 | set.seed(234) 231 | test_optim( 232 | optim = optim_swats, 233 | opt_hparams = list(lr = 0.5), 234 | steps = 500, 235 | test_fn = "beale", 236 | plot_each_step = TRUE 237 | ) 238 | ``` 239 | 240 | 241 | 242 | ### `optim_yogi()`: 243 | 244 | ``` r 245 | # set manual seed 246 | set.seed(66) 247 | test_optim( 248 | optim = 
optim_yogi, 249 | opt_hparams = list(lr = 0.1), 250 | steps = 500, 251 | test_fn = "beale", 252 | plot_each_step = TRUE 253 | ) 254 | ``` 255 | 256 | 257 | 258 | ## Acknowledgements 259 | 260 | We are thankful to Collin Donahue-Oponski , 261 | Amir Gholami , Liangchen Luo 262 | , Liyuan Liu 263 | , Nikolay Novik 264 | , Patrik Purgai, 265 | Juntang Zhuang, 266 | and the PyTorch team 267 | for providing pytorch code for the 268 | optimizers implemented in this package. We also thank Daniel Falbel 269 | for providing support for the R version of 270 | PyTorch. 271 | 272 | ## Code of Conduct 273 | 274 | The torchopt project is released with a [Contributor Code of 275 | Conduct](https://contributor-covenant.org/version/2/0/CODE_OF_CONDUCT.html). 276 | By contributing to this project, you agree to abide by its terms. 277 | 278 | ## References 279 | 280 | - ADABELIEF: Juntang Zhuang, Tommy Tang, Yifan Ding, Sekhar Tatikonda, 281 | Nicha Dvornek, Xenophon Papademetris, James S. Duncan. “Adabelief 282 | Optimizer: Adapting Stepsizes by the Belief in Observed Gradients”, 283 | 34th Conference on Neural Information Processing Systems (NeurIPS 284 | 2020), . 285 | 286 | - ADABOUND: Liangchen Luo, Yuanhao Xiong, Yan Liu, Xu Sun, “Adaptive 287 | Gradient Methods with Dynamic Bound of Learning Rate”, International 288 | Conference on Learning Representations (ICLR), 2019. 289 | . 290 | 291 | - ADAHESSIAN: Zhewei Yao, Amir Gholami, Sheng Shen, Mustafa Mustafa, 292 | Kurt Keutzer, Michael W. Mahoney. “Adahessian: An Adaptive Second 293 | Order Optimizer for Machine Learning”, AAAI Conference on Artificial 294 | Intelligence, 35(12), 10665-10673, 2021. 295 | . 296 | 297 | - ADAMW: Ilya Loshchilov, Frank Hutter, “Decoupled Weight Decay 298 | Regularization”, International Conference on Learning 299 | Representations (ICLR) 2019. 300 | . 301 | 302 | - MADGRAD: Aaron Defazio, Samy Jelassi, “Adaptivity without 303 | Compromise: A Momentumized, Adaptive, Dual Averaged Gradient Method 304 | for Stochastic Optimization”, arXiv preprint arXiv:2101.11075, 2021. 305 | 306 | 307 | - NADAM: Timothy Dozat, “Incorporating Nesterov Momentum into Adam”, 308 | International Conference on Learning Representations (ICLR), 2016. 309 | 310 | 311 | - QHADAM: Jerry Ma, Denis Yarats, “Quasi-hyperbolic momentum and Adam 312 | for deep learning”. 313 | 314 | - RADAM: Liyuan Liu, Haoming Jiang, Pengcheng He, Weizhu Chen, 315 | Xiaodong Liu, Jianfeng Gao, Jiawei Han, “On the Variance of the 316 | Adaptive Learning Rate and Beyond”, International Conference on 317 | Learning Representations (ICLR) 2020. 318 | . 319 | 320 | - SWATS: Nitish Keskar, Richard Socher, “Improving Generalization 321 | Performance by Switching from Adam to SGD”. International Conference 322 | on Learning Representations (ICLR), 2018. 323 | . 324 | 325 | - YOGI: Manzil Zaheer, Sashank Reddi, Devendra Sachan, Satyen Kale, 326 | Sanjiv Kumar, “Adaptive Methods for Nonconvex Optimization”, 327 | Advances in Neural Information Processing Systems 31 (NeurIPS 2018).
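## Using a `torchopt` optimizer in a `torch` model

The examples above optimize the two scalar parameters of a test surface. In a full model, a `torchopt` optimizer is used exactly like the optimizers that ship with `torch`: create it from the model parameters and call `zero_grad()`, `backward()`, and `step()` inside the training loop. The snippet below is a minimal sketch; the toy data, layer size, learning rate, and number of epochs are arbitrary illustrative choices, not package recommendations.

``` r
library(torch)
library(torchopt)

# toy regression data (100 observations, 10 features)
x <- torch_randn(100, 10)
y <- torch_randn(100, 1)

# a single linear layer as the model
model <- nn_linear(10, 1)

# any torchopt optimizer can be dropped in here, e.g. optim_adamw()
opt <- optim_adamw(model$parameters, lr = 0.01)

for (epoch in seq_len(50)) {
  opt$zero_grad()                    # reset gradients
  loss <- nnf_mse_loss(model(x), y)  # forward pass and loss
  loss$backward()                    # backpropagate
  opt$step()                         # update parameters
}
```

To try a different optimizer, only the line that creates `opt` changes, for example `opt <- optim_madgrad(model$parameters, lr = 0.01)`.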
328 | 329 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | target: auto 8 | threshold: 1% 9 | informational: true 10 | patch: 11 | default: 12 | target: auto 13 | threshold: 1% 14 | informational: true 15 | -------------------------------------------------------------------------------- /inst/WORDLIST: -------------------------------------------------------------------------------- 1 | Acknowledgements 2 | adabelief 3 | Adabelief 4 | AdaBelief 5 | AdaBound 6 | Adabound 7 | adabound 8 | Adadelta 9 | Adagrad 10 | AdamW 11 | adamw 12 | Adaptivity 13 | authors’ 14 | Beale 15 | CMD 16 | Codecov 17 | Dazat 18 | Dozat 19 | Defazio 20 | Devendra 21 | Dvornek 22 | dfalbel 23 | Falbel 24 | Gao 25 | Golay 26 | Grosso 27 | Guestrin 28 | Guolin 29 | Haoming 30 | Hassan 31 | Herold 32 | Hijmans 33 | Hutter 34 | ICLR 35 | Idoumghar 36 | Ilya 37 | IJCNN 38 | IKI 39 | Ilya 40 | INPE 41 | ISPRS 42 | ISSN 43 | isprsjprs 44 | Jaccard 45 | Jianfeng 46 | Jiang 47 | Jiawei 48 | Jelassi 49 | jenks 50 | Jenks 51 | Jordao 52 | JSON 53 | JSTARS 54 | juntang 55 | Juntang 56 | Kaggle 57 | Kingma 58 | KDD 59 | Ke 60 | Kegelmeyer 61 | keras 62 | Keras 63 | kganz 64 | kmeans 65 | kohonen 66 | Kohonen 67 | Körner 68 | Korner 69 | Kruisselbrink 70 | Kumar 71 | Landrieu 72 | landsat 73 | latlong 74 | Lhassane 75 | LKP 76 | LLKP 77 | Keskar 78 | Liangchen 79 | LightGBM 80 | Liu 81 | Liyuan 82 | LiyuanLucasLiu 83 | Loic 84 | Loshchilov 85 | LSTM 86 | LTAE 87 | Lubia 88 | Luo 89 | Luolc 90 | LUCC 91 | luz 92 | LZW 93 | MADGRAD 94 | MLP 95 | madgrad 96 | maja 97 | mapview 98 | Maja 99 | Magrittr 100 | Manzil 101 | Mato 102 | Mattias 103 | Maus 104 | MODIS 105 | MSPC 106 | Maximage 107 | Meng 108 | Mohr 109 | mlr 110 | Momentumized 111 | Mrpatekful 112 | Nadam 113 | Nesrine 114 | nadam 115 | Nesterov 116 | Nesterov’s 117 | NeurIPS 118 | Nicha 119 | Nikolay 120 | Nitish 121 | Nonconvex 122 | Novik 123 | Oponski 124 | openreview 125 | Patrik 126 | Papademetris 127 | Pengcheng 128 | Purgai 129 | QH 130 | qhadam 131 | QHAdam 132 | qhoptim 133 | RAdam 134 | radam 135 | RMSProp 136 | RMSprop 137 | Reddi 138 | SGD 139 | Sachan 140 | Samy 141 | Sanjiv 142 | Sashank 143 | Satyen 144 | Sekhar 145 | Shekar 146 | Shirish 147 | Sochee 148 | Socher 149 | warmup 150 | Tatikonda 151 | Xiong 152 | Xiaodong 153 | Xu 154 | Weizhu 155 | Yan 156 | Yao 157 | Yarats 158 | Yifan 159 | Yuanhao 160 | Zaheer 161 | zhuang 162 | Zhuang 163 | ZJjtNEZ 164 | al 165 | arXiv 166 | arxiv 167 | bff 168 | colllin 169 | doi 170 | et 171 | facebookresearch 172 | gifski 173 | github 174 | gmail 175 | grDevices 176 | grey 177 | GTiff 178 | headtails 179 | hcl 180 | HCL 181 | hclust 182 | hotfix 183 | Hotfix 184 | href 185 | http 186 | https 187 | io 188 | ir 189 | inequivalence 190 | interpolator 191 | iteratively 192 | jettify 193 | labelled 194 | labelling 195 | licence 196 | lineshape 197 | lintr 198 | logref 199 | lon 200 | lr 201 | lubridate 202 | mem 203 | memsize 204 | metatype 205 | msg 206 | mth 207 | mlverse 208 | multiclass 209 | multilayer 210 | multinom 211 | MULTIPOLYGON 212 | Nacional 213 | NatNonForest 214 | neighbourhood 215 | neighbours 216 | NDVI 217 | ndvi 218 | ncols 219 | nonconvex 220 | optimizers 221 | Optimizers 222 | preprint 223 | py 224 | pytorch 225 | rescaled 226 | th 227 | verison 228 | viridis 229 | wikipedia 230 | 
-------------------------------------------------------------------------------- /man/figures/README-chunk-label-.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-chunk-label-.gif -------------------------------------------------------------------------------- /man/figures/README-chunk-label-1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-chunk-label-1.jpeg -------------------------------------------------------------------------------- /man/figures/README-chunk-label-10.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-chunk-label-10.jpeg -------------------------------------------------------------------------------- /man/figures/README-chunk-label-2.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-chunk-label-2.jpeg -------------------------------------------------------------------------------- /man/figures/README-chunk-label-3.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-chunk-label-3.jpeg -------------------------------------------------------------------------------- /man/figures/README-chunk-label-4.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-chunk-label-4.jpeg -------------------------------------------------------------------------------- /man/figures/README-chunk-label-5.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-chunk-label-5.jpeg -------------------------------------------------------------------------------- /man/figures/README-chunk-label-6.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-chunk-label-6.jpeg -------------------------------------------------------------------------------- /man/figures/README-chunk-label-7.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-chunk-label-7.jpeg -------------------------------------------------------------------------------- /man/figures/README-chunk-label-8.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-chunk-label-8.jpeg -------------------------------------------------------------------------------- /man/figures/README-chunk-label-9.jpeg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-chunk-label-9.jpeg -------------------------------------------------------------------------------- /man/figures/README-gif_opt-.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-gif_opt-.gif -------------------------------------------------------------------------------- /man/figures/README-opt_fun-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-opt_fun-1.png -------------------------------------------------------------------------------- /man/figures/README-pressure-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-pressure-1.png -------------------------------------------------------------------------------- /man/figures/README-test_adabelief-.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-test_adabelief-.gif -------------------------------------------------------------------------------- /man/figures/README-test_adabound-.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-test_adabound-.gif -------------------------------------------------------------------------------- /man/figures/README-test_adahessian-.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-test_adahessian-.gif -------------------------------------------------------------------------------- /man/figures/README-test_adamw-.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-test_adamw-.gif -------------------------------------------------------------------------------- /man/figures/README-test_madgrad-.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-test_madgrad-.gif -------------------------------------------------------------------------------- /man/figures/README-test_nadam-.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-test_nadam-.gif -------------------------------------------------------------------------------- /man/figures/README-test_qhadam-.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-test_qhadam-.gif -------------------------------------------------------------------------------- /man/figures/README-test_radam-.gif: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-test_radam-.gif -------------------------------------------------------------------------------- /man/figures/README-test_swats-.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-test_swats-.gif -------------------------------------------------------------------------------- /man/figures/README-test_yogi-.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-test_yogi-.gif -------------------------------------------------------------------------------- /man/optim_adabelief.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/adabelief.R 3 | \name{optim_adabelief} 4 | \alias{optim_adabelief} 5 | \title{Adabelief optimizer} 6 | \usage{ 7 | optim_adabelief( 8 | params, 9 | lr = 0.001, 10 | betas = c(0.9, 0.999), 11 | eps = 1e-08, 12 | weight_decay = 1e-06, 13 | weight_decouple = TRUE, 14 | fixed_decay = FALSE, 15 | rectify = TRUE 16 | ) 17 | } 18 | \arguments{ 19 | \item{params}{List of parameters to optimize.} 20 | 21 | \item{lr}{Learning rate (default: 1e-3)} 22 | 23 | \item{betas}{Coefficients for computing running averages 24 | of gradient and its square (default: (0.9, 0.999))} 25 | 26 | \item{eps}{Term added to the denominator to improve numerical 27 | stability (default: 1e-16)} 28 | 29 | \item{weight_decay}{Weight decay (L2 penalty) (default: 0)} 30 | 31 | \item{weight_decouple}{Use decoupled weight decay as is done in AdamW?} 32 | 33 | \item{fixed_decay}{This is used when weight_decouple is set as True. 34 | When fixed_decay == True, weight decay is 35 | W_new = W_old - W_old * decay. 36 | When fixed_decay == False, the weight decay is 37 | W_new = W_old - W_old * decay * learning_rate. 38 | In this case, weight decay decreases with learning rate.} 39 | 40 | \item{rectify}{Perform the rectified update similar to RAdam?} 41 | } 42 | \value{ 43 | A torch optimizer object implementing the \code{step} method. 44 | } 45 | \description{ 46 | R implementation of the adabelief optimizer proposed 47 | by Zhuang et al (2020). We used the pytorch implementation 48 | developed by the authors which is available at 49 | https://github.com/jettify/pytorch-optimizer. 50 | Thanks to Nikolay Novik of his work on python optimizers. 51 | 52 | The original implementation is licensed using the Apache-2.0 software license. 53 | This implementation is also licensed using Apache-2.0 license. 54 | 55 | From the abstract by the paper by Zhuang et al (2021): 56 | We propose Adabelief to simultaneously achieve three goals: 57 | fast convergence as in adaptive methods, good generalization as in SGD, 58 | and training stability. The intuition for AdaBelief is to adapt 59 | the stepsize according to the "belief" in the current gradient direction. 
60 | Viewing the exponential moving average of the noisy gradient 61 | as the prediction of the gradient at the next time step, 62 | if the observed gradient greatly deviates from the prediction, 63 | we distrust the current observation and take a small step; 64 | if the observed gradient is close to the prediction, 65 | we trust it and take a large step. 66 | } 67 | \examples{ 68 | if (torch::torch_is_installed()) { 69 | # function to demonstrate optimization 70 | beale <- function(x, y) { 71 | log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 72 | } 73 | # define optimizer 74 | optim <- torchopt::optim_adabelief 75 | # define hyperparams 76 | opt_hparams <- list(lr = 0.01) 77 | 78 | # starting point 79 | x0 <- 3 80 | y0 <- 3 81 | # create tensor 82 | x <- torch::torch_tensor(x0, requires_grad = TRUE) 83 | y <- torch::torch_tensor(y0, requires_grad = TRUE) 84 | # instantiate optimizer 85 | optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 86 | # run optimizer 87 | steps <- 400 88 | x_steps <- numeric(steps) 89 | y_steps <- numeric(steps) 90 | for (i in seq_len(steps)) { 91 | x_steps[i] <- as.numeric(x) 92 | y_steps[i] <- as.numeric(y) 93 | optim$zero_grad() 94 | z <- beale(x, y) 95 | z$backward() 96 | optim$step() 97 | } 98 | print(paste0("starting value = ", beale(x0, y0))) 99 | print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 100 | } 101 | } 102 | \references{ 103 | Juntang Zhuang, Tommy Tang, Yifan Ding, Sekhar Tatikonda, 104 | Nicha Dvornek, Xenophon Papademetris, James S. Duncan. 105 | "Adabelief Optimizer: Adapting Stepsizes by the Belief in Observed Gradients", 106 | 34th Conference on Neural Information Processing Systems (NeurIPS 2020), 107 | Vancouver, Canada. 108 | https://arxiv.org/abs/2010.07468 109 | } 110 | \author{ 111 | Gilberto Camara, \email{gilberto.camara@inpe.br} 112 | 113 | Rolf Simoes, \email{rolf.simoes@inpe.br} 114 | 115 | Felipe Souza, \email{lipecaso@gmail.com} 116 | 117 | Alber Sanchez, \email{alber.ipia@inpe.br} 118 | } 119 | -------------------------------------------------------------------------------- /man/optim_adabound.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/adabound.R 3 | \name{optim_adabound} 4 | \alias{optim_adabound} 5 | \title{Adabound optimizer} 6 | \usage{ 7 | optim_adabound( 8 | params, 9 | lr = 0.001, 10 | betas = c(0.9, 0.999), 11 | final_lr = 0.1, 12 | gamma = 0.001, 13 | eps = 1e-08, 14 | weight_decay = 0 15 | ) 16 | } 17 | \arguments{ 18 | \item{params}{List of parameters to optimize.} 19 | 20 | \item{lr}{Learning rate (default: 1e-3)} 21 | 22 | \item{betas}{Coefficients computing running averages of gradient 23 | and its square (default: (0.9, 0.999))} 24 | 25 | \item{final_lr}{Final (SGD) learning rate (default: 0.1)} 26 | 27 | \item{gamma}{Convergence speed of the bound functions 28 | (default: 1e-3)} 29 | 30 | \item{eps}{Term added to the denominator to improve numerical 31 | stability (default: 1e-8)} 32 | 33 | \item{weight_decay}{Weight decay (L2 penalty) (default: 0)} 34 | } 35 | \value{ 36 | A torch optimizer object implementing the \code{step} method. 37 | } 38 | \description{ 39 | R implementation of the AdaBound optimizer proposed 40 | by Luo et al.(2019). We used the implementation available at 41 | https://github.com/jettify/pytorch-optimizer/blob/master/torch_optimizer/yogi.py. 
42 | Thanks to Nikolay Novik for providing the pytorch code. 43 | 44 | The original implementation is licensed using the Apache-2.0 software license. 45 | This implementation is also licensed using Apache-2.0 license. 46 | 47 | AdaBound is a variant of the Adam stochastic optimizer which is 48 | designed to be more robust to extreme learning rates. 49 | Dynamic bounds are employed on learning rates, 50 | where the lower and upper bound are initialized as zero and 51 | infinity respectively, and they both smoothly converge to a 52 | constant final step size. AdaBound can be regarded as an adaptive 53 | method at the beginning of training, and thereafter it gradually and 54 | smoothly transforms to SGD (or with momentum) as the time step increases. 55 | } 56 | \examples{ 57 | if (torch::torch_is_installed()) { 58 | # function to demonstrate optimization 59 | beale <- function(x, y) { 60 | log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 61 | } 62 | # define optimizer 63 | optim <- torchopt::optim_adabound 64 | # define hyperparams 65 | opt_hparams <- list(lr = 0.01) 66 | 67 | # starting point 68 | x0 <- 3 69 | y0 <- 3 70 | # create tensor 71 | x <- torch::torch_tensor(x0, requires_grad = TRUE) 72 | y <- torch::torch_tensor(y0, requires_grad = TRUE) 73 | # instantiate optimizer 74 | optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 75 | # run optimizer 76 | steps <- 400 77 | x_steps <- numeric(steps) 78 | y_steps <- numeric(steps) 79 | for (i in seq_len(steps)) { 80 | x_steps[i] <- as.numeric(x) 81 | y_steps[i] <- as.numeric(y) 82 | optim$zero_grad() 83 | z <- beale(x, y) 84 | z$backward() 85 | optim$step() 86 | } 87 | print(paste0("starting value = ", beale(x0, y0))) 88 | print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 89 | } 90 | } 91 | \references{ 92 | Liangchen Luo, Yuanhao Xiong, Yan Liu, Xu Sun, 93 | "Adaptive Gradient Methods with Dynamic Bound of Learning Rate", 94 | International Conference on Learning Representations (ICLR), 2019. 95 | https://arxiv.org/abs/1902.09843 96 | } 97 | \author{ 98 | Rolf Simoes, \email{rolf.simoes@inpe.br} 99 | 100 | Felipe Souza, \email{lipecaso@gmail.com} 101 | 102 | Alber Sanchez, \email{alber.ipia@inpe.br} 103 | 104 | Gilberto Camara, \email{gilberto.camara@inpe.br} 105 | } 106 | -------------------------------------------------------------------------------- /man/optim_adahessian.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/adahessian.R 3 | \name{optim_adahessian} 4 | \alias{optim_adahessian} 5 | \title{Adahessian optimizer} 6 | \usage{ 7 | optim_adahessian( 8 | params, 9 | lr = 0.15, 10 | betas = c(0.9, 0.999), 11 | eps = 1e-04, 12 | weight_decay = 0, 13 | hessian_power = 0.5 14 | ) 15 | } 16 | \arguments{ 17 | \item{params}{Iterable of parameters to optimize.} 18 | 19 | \item{lr}{Learning rate (default: 0.15).} 20 | 21 | \item{betas}{Coefficients for computing 22 | running averages of gradient 23 | and is square(default: (0.9, 0.999)).} 24 | 25 | \item{eps}{Term added to the denominator to improve 26 | numerical stability (default: 1e-4).} 27 | 28 | \item{weight_decay}{L2 penalty (default: 0).} 29 | 30 | \item{hessian_power}{Hessian power (default: 1.0).} 31 | } 32 | \value{ 33 | An optimizer object implementing the \code{step} and \code{zero_grad} methods. 
34 | } 35 | \description{ 36 | R implementation of the Adahessian optimizer proposed 37 | by Yao et al.(2020). The original implementation is available at 38 | https://github.com/amirgholami/adahessian. 39 | } 40 | \references{ 41 | Yao, Z., Gholami, A., Shen, S., Mustafa, M., Keutzer, K., 42 | & Mahoney, M. (2021). 43 | ADAHESSIAN: An Adaptive Second Order Optimizer for Machine Learning. 44 | Proceedings of the AAAI Conference on Artificial Intelligence, 35(12), 45 | 10665-10673. 46 | https://arxiv.org/abs/2006.00719 47 | } 48 | \author{ 49 | Rolf Simoes, \email{rolf.simoes@inpe.br} 50 | 51 | Felipe Souza, \email{lipecaso@gmail.com} 52 | 53 | Alber Sanchez, \email{alber.ipia@inpe.br} 54 | 55 | Gilberto Camara, \email{gilberto.camara@inpe.br} 56 | } 57 | -------------------------------------------------------------------------------- /man/optim_adamw.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/adamw.R 3 | \name{optim_adamw} 4 | \alias{optim_adamw} 5 | \title{AdamW optimizer} 6 | \usage{ 7 | optim_adamw( 8 | params, 9 | lr = 0.01, 10 | betas = c(0.9, 0.999), 11 | eps = 1e-08, 12 | weight_decay = 1e-06 13 | ) 14 | } 15 | \arguments{ 16 | \item{params}{List of parameters to optimize.} 17 | 18 | \item{lr}{Learning rate (default: 1e-3)} 19 | 20 | \item{betas}{Coefficients computing running averages of gradient 21 | and its square (default: (0.9, 0.999))} 22 | 23 | \item{eps}{Term added to the denominator to improve numerical 24 | stability (default: 1e-8)} 25 | 26 | \item{weight_decay}{Weight decay (L2 penalty) (default: 1e-6)} 27 | } 28 | \value{ 29 | A torch optimizer object implementing the \code{step} method. 30 | } 31 | \description{ 32 | R implementation of the AdamW optimizer proposed 33 | by Loshchilov & Hutter (2019). We used the pytorch implementation 34 | developed by Collin Donahue-Oponski available at: 35 | https://gist.github.com/colllin/0b146b154c4351f9a40f741a28bff1e3 36 | 37 | From the abstract by the paper by Loshchilov & Hutter (2019): 38 | L2 regularization and weight decay regularization are equivalent for standard 39 | stochastic gradient descent (when rescaled by the learning rate), 40 | but as we demonstrate this is not the case for adaptive gradient algorithms, 41 | such as Adam. While common implementations of these algorithms 42 | employ L2 regularization (often calling it “weight decay” 43 | in what may be misleading due to the inequivalence we expose), 44 | we propose a simple modification to recover the original formulation of 45 | weight decay regularization by decoupling the weight decay from the optimization 46 | steps taken w.r.t. 
the loss function 47 | } 48 | \examples{ 49 | if (torch::torch_is_installed()) { 50 | # function to demonstrate optimization 51 | beale <- function(x, y) { 52 | log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 53 | } 54 | # define optimizer 55 | optim <- torchopt::optim_adamw 56 | # define hyperparams 57 | opt_hparams <- list(lr = 0.01) 58 | 59 | # starting point 60 | x0 <- 3 61 | y0 <- 3 62 | # create tensor 63 | x <- torch::torch_tensor(x0, requires_grad = TRUE) 64 | y <- torch::torch_tensor(y0, requires_grad = TRUE) 65 | # instantiate optimizer 66 | optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 67 | # run optimizer 68 | steps <- 400 69 | x_steps <- numeric(steps) 70 | y_steps <- numeric(steps) 71 | for (i in seq_len(steps)) { 72 | x_steps[i] <- as.numeric(x) 73 | y_steps[i] <- as.numeric(y) 74 | optim$zero_grad() 75 | z <- beale(x, y) 76 | z$backward() 77 | optim$step() 78 | } 79 | print(paste0("starting value = ", beale(x0, y0))) 80 | print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 81 | } 82 | } 83 | \references{ 84 | Ilya Loshchilov, Frank Hutter, 85 | "Decoupled Weight Decay Regularization", 86 | International Conference on Learning Representations (ICLR) 2019. 87 | https://arxiv.org/abs/1711.05101 88 | } 89 | \author{ 90 | Gilberto Camara, \email{gilberto.camara@inpe.br} 91 | 92 | Rolf Simoes, \email{rolf.simoes@inpe.br} 93 | 94 | Felipe Souza, \email{lipecaso@gmail.com} 95 | 96 | Alber Sanchez, \email{alber.ipia@inpe.br} 97 | } 98 | -------------------------------------------------------------------------------- /man/optim_madgrad.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/madgrad.R 3 | \name{optim_madgrad} 4 | \alias{optim_madgrad} 5 | \title{MADGRAD optimizer} 6 | \usage{ 7 | optim_madgrad(params, lr = 0.01, momentum = 0.9, weight_decay = 0, eps = 1e-06) 8 | } 9 | \arguments{ 10 | \item{params}{List of parameters to optimize.} 11 | 12 | \item{lr}{Learning rate (default: 1e-2).} 13 | 14 | \item{momentum}{Momentum value in the range [0,1) (default: 0.9).} 15 | 16 | \item{weight_decay}{Weight decay, i.e. a L2 penalty (default: 0).} 17 | 18 | \item{eps}{Term added to the denominator outside of 19 | the root operation to improve numerical stability 20 | (default: 1e-6).} 21 | } 22 | \value{ 23 | A torch optimizer object implementing the \code{step} method. 24 | } 25 | \description{ 26 | A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic 27 | Optimization (MADGRAD) is a general purpose optimizer that 28 | can be used in place of SGD or Adam may converge faster and generalize 29 | better. Currently GPU-only. Typically, the same learning rate schedule 30 | that is used for SGD or Adam may be used. The overall learning rate is 31 | not comparable to either method and should be determined by a 32 | hyper-parameter sweep. 33 | 34 | MADGRAD requires less weight decay than other methods, often as little as 35 | zero. Momentum values used for SGD or Adam's beta1 should work here also. 36 | 37 | On sparse problems both weight_decay and momentum should be set to 0. 38 | (not yet supported in the R implementation). 
39 | } 40 | \examples{ 41 | if (torch::torch_is_installed()) { 42 | # function to demonstrate optimization 43 | beale <- function(x, y) { 44 | log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 45 | } 46 | # define optimizer 47 | optim <- torchopt::optim_madgrad 48 | # define hyperparams 49 | opt_hparams <- list(lr = 0.01) 50 | 51 | # starting point 52 | x0 <- 3 53 | y0 <- 3 54 | # create tensor 55 | x <- torch::torch_tensor(x0, requires_grad = TRUE) 56 | y <- torch::torch_tensor(y0, requires_grad = TRUE) 57 | # instantiate optimizer 58 | optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 59 | # run optimizer 60 | steps <- 400 61 | x_steps <- numeric(steps) 62 | y_steps <- numeric(steps) 63 | for (i in seq_len(steps)) { 64 | x_steps[i] <- as.numeric(x) 65 | y_steps[i] <- as.numeric(y) 66 | optim$zero_grad() 67 | z <- beale(x, y) 68 | z$backward() 69 | optim$step() 70 | } 71 | print(paste0("starting value = ", beale(x0, y0))) 72 | print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 73 | } 74 | } 75 | \references{ 76 | Aaron Defazio, Samy Jelassi, 77 | "Adaptivity without Compromise: A Momentumized, Adaptive, Dual 78 | Averaged Gradient Method for Stochastic Optimization". 79 | https://arxiv.org/abs/2101.11075 80 | } 81 | \author{ 82 | Daniel Falbel, \email{dfalbel@gmail.com} 83 | } 84 | -------------------------------------------------------------------------------- /man/optim_nadam.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/nadam.R 3 | \name{optim_nadam} 4 | \alias{optim_nadam} 5 | \title{Nadam optimizer} 6 | \usage{ 7 | optim_nadam( 8 | params, 9 | lr = 0.002, 10 | betas = c(0.9, 0.999), 11 | eps = 1e-08, 12 | weight_decay = 0, 13 | momentum_decay = 0.004 14 | ) 15 | } 16 | \arguments{ 17 | \item{params}{List of parameters to optimize.} 18 | 19 | \item{lr}{Learning rate (default: 1e-3)} 20 | 21 | \item{betas}{Coefficients computing running averages of gradient 22 | and its square (default: (0.9, 0.999)).} 23 | 24 | \item{eps}{Term added to the denominator to improve numerical 25 | stability (default: 1e-8).} 26 | 27 | \item{weight_decay}{Weight decay (L2 penalty) (default: 0).} 28 | 29 | \item{momentum_decay}{Momentum_decay (default: 4e-3).} 30 | } 31 | \value{ 32 | A torch optimizer object implementing the \code{step} method. 33 | } 34 | \description{ 35 | R implementation of the Nadam optimizer proposed 36 | by Dazat (2016). 37 | 38 | From the abstract by the paper by Dozat (2016): 39 | This work aims to improve upon the recently proposed and 40 | rapidly popularized optimization algorithm Adam (Kingma & Ba, 2014). 41 | Adam has two main components—a momentum component and an adaptive 42 | learning rate component. However, regular momentum can be shown conceptually 43 | and empirically to be inferior to a similar algorithm known as 44 | Nesterov’s accelerated gradient (NAG). 
45 | } 46 | \examples{ 47 | if (torch::torch_is_installed()) { 48 | # function to demonstrate optimization 49 | beale <- function(x, y) { 50 | log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 51 | } 52 | # define optimizer 53 | optim <- torchopt::optim_nadam 54 | # define hyperparams 55 | opt_hparams <- list(lr = 0.01) 56 | 57 | # starting point 58 | x0 <- 3 59 | y0 <- 3 60 | # create tensor 61 | x <- torch::torch_tensor(x0, requires_grad = TRUE) 62 | y <- torch::torch_tensor(y0, requires_grad = TRUE) 63 | # instantiate optimizer 64 | optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 65 | # run optimizer 66 | steps <- 400 67 | x_steps <- numeric(steps) 68 | y_steps <- numeric(steps) 69 | for (i in seq_len(steps)) { 70 | x_steps[i] <- as.numeric(x) 71 | y_steps[i] <- as.numeric(y) 72 | optim$zero_grad() 73 | z <- beale(x, y) 74 | z$backward() 75 | optim$step() 76 | } 77 | print(paste0("starting value = ", beale(x0, y0))) 78 | print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 79 | } 80 | } 81 | \references{ 82 | Timothy Dozat, 83 | "Incorporating Nesterov Momentum into Adam", 84 | International Conference on Learning Representations (ICLR) 2016. 85 | https://openreview.net/pdf/OM0jvwB8jIp57ZJjtNEZ.pdf 86 | } 87 | \author{ 88 | Gilberto Camara, \email{gilberto.camara@inpe.br} 89 | 90 | Rolf Simoes, \email{rolf.simoes@inpe.br} 91 | 92 | Felipe Souza, \email{lipecaso@gmail.com} 93 | 94 | Alber Sanchez, \email{alber.ipia@inpe.br} 95 | } 96 | -------------------------------------------------------------------------------- /man/optim_qhadam.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/qhadam.R 3 | \name{optim_qhadam} 4 | \alias{optim_qhadam} 5 | \title{QHAdam optimization algorithm} 6 | \usage{ 7 | optim_qhadam( 8 | params, 9 | lr = 0.01, 10 | betas = c(0.9, 0.999), 11 | eps = 0.001, 12 | nus = c(1, 1), 13 | weight_decay = 0, 14 | decouple_weight_decay = FALSE 15 | ) 16 | } 17 | \arguments{ 18 | \item{params}{List of parameters to optimize.} 19 | 20 | \item{lr}{Learning rate (default: 1e-3)} 21 | 22 | \item{betas}{Coefficients computing running averages of gradient 23 | and its square (default: (0.9, 0.999))} 24 | 25 | \item{eps}{Term added to the denominator to improve numerical 26 | stability (default: 1e-8)} 27 | 28 | \item{nus}{Immediate discount factors used to 29 | estimate the gradient and its square 30 | (default: (1.0, 1.0))} 31 | 32 | \item{weight_decay}{Weight decay (L2 penalty) (default: 0)} 33 | 34 | \item{decouple_weight_decay}{Whether to decouple the weight 35 | decay from the gradient-based optimization step.} 36 | } 37 | \value{ 38 | A torch optimizer object implementing the \code{step} method. 39 | } 40 | \description{ 41 | R implementation of the QHAdam optimizer proposed 42 | by Ma and Yarats(2019). We used the implementation available at 43 | https://github.com/jettify/pytorch-optimizer/blob/master/torch_optimizer/qhadam.py. 44 | Thanks to Nikolay Novik for providing the pytorch code. 45 | 46 | The original implementation has been developed by Facebook AI 47 | and is licensed using the MIT license. 48 | 49 | From the the paper by Ma and Yarats(2019): 50 | QHAdam is a QH augmented version of Adam, where we 51 | replace both of Adam's moment estimators with quasi-hyperbolic terms. 
52 | QHAdam decouples the momentum term from the current gradient when 53 | updating the weights, and decouples the mean squared gradients 54 | term from the current squared gradient when updating the weights. 55 | } 56 | \examples{ 57 | if (torch::torch_is_installed()) { 58 | # function to demonstrate optimization 59 | beale <- function(x, y) { 60 | log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 61 | } 62 | # define optimizer 63 | optim <- torchopt::optim_qhadam 64 | # define hyperparams 65 | opt_hparams <- list(lr = 0.01) 66 | 67 | # starting point 68 | x0 <- 3 69 | y0 <- 3 70 | # create tensor 71 | x <- torch::torch_tensor(x0, requires_grad = TRUE) 72 | y <- torch::torch_tensor(y0, requires_grad = TRUE) 73 | # instantiate optimizer 74 | optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 75 | # run optimizer 76 | steps <- 400 77 | x_steps <- numeric(steps) 78 | y_steps <- numeric(steps) 79 | for (i in seq_len(steps)) { 80 | x_steps[i] <- as.numeric(x) 81 | y_steps[i] <- as.numeric(y) 82 | optim$zero_grad() 83 | z <- beale(x, y) 84 | z$backward() 85 | optim$step() 86 | } 87 | print(paste0("starting value = ", beale(x0, y0))) 88 | print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 89 | } 90 | 91 | } 92 | \references{ 93 | Jerry Ma, Denis Yarats, 94 | "Quasi-hyperbolic momentum and Adam for deep learning". 95 | https://arxiv.org/abs/1810.06801 96 | } 97 | \author{ 98 | Gilberto Camara, \email{gilberto.camara@inpe.br} 99 | 100 | Daniel Falbel, \email{daniel.falbel@gmail.com} 101 | 102 | Rolf Simoes, \email{rolf.simoes@inpe.br} 103 | 104 | Felipe Souza, \email{lipecaso@gmail.com} 105 | 106 | Alber Sanchez, \email{alber.ipia@inpe.br} 107 | } 108 | -------------------------------------------------------------------------------- /man/optim_radam.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/radam.R 3 | \name{optim_radam} 4 | \alias{optim_radam} 5 | \title{RAdam optimizer} 6 | \usage{ 7 | optim_radam( 8 | params, 9 | lr = 0.01, 10 | betas = c(0.9, 0.999), 11 | eps = 1e-08, 12 | weight_decay = 0 13 | ) 14 | } 15 | \arguments{ 16 | \item{params}{List of parameters to optimize.} 17 | 18 | \item{lr}{Learning rate (default: 1e-3)} 19 | 20 | \item{betas}{Coefficients computing running averages of gradient 21 | and its square (default: (0.9, 0.999))} 22 | 23 | \item{eps}{Term added to the denominator to improve numerical 24 | stability (default: 1e-8)} 25 | 26 | \item{weight_decay}{Weight decay (L2 penalty) (default: 0)} 27 | } 28 | \value{ 29 | A torch optimizer object implementing the \code{step} method. 30 | } 31 | \description{ 32 | R implementation of the RAdam optimizer proposed 33 | by Liu et al. (2019). 34 | We used the implementation in PyTorch as a basis for our 35 | implementation. 36 | 37 | From the abstract by the paper by Liu et al. (2019): 38 | The learning rate warmup heuristic achieves remarkable success 39 | in stabilizing training, accelerating convergence and improving 40 | generalization for adaptive stochastic optimization algorithms 41 | like RMSprop and Adam. Here, we study its mechanism in details.
42 | Pursuing the theory behind warmup, we identify a problem of the 43 | adaptive learning rate (i.e., it has problematically large variance 44 | in the early stage), suggest warmup works as a variance reduction 45 | technique, and provide both empirical and theoretical evidence to verify 46 | our hypothesis. We further propose RAdam, a new variant of Adam, 47 | by introducing a term to rectify the variance of the adaptive learning rate. 48 | Extensive experimental results on image classification, language modeling, 49 | and neural machine translation verify our intuition and demonstrate 50 | the effectiveness and robustness of our proposed method. 51 | } 52 | \examples{ 53 | if (torch::torch_is_installed()) { 54 | # function to demonstrate optimization 55 | beale <- function(x, y) { 56 | log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 57 | } 58 | # define optimizer 59 | optim <- torchopt::optim_radam 60 | # define hyperparams 61 | opt_hparams <- list(lr = 0.01) 62 | 63 | # starting point 64 | x0 <- 3 65 | y0 <- 3 66 | # create tensor 67 | x <- torch::torch_tensor(x0, requires_grad = TRUE) 68 | y <- torch::torch_tensor(y0, requires_grad = TRUE) 69 | # instantiate optimizer 70 | optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 71 | # run optimizer 72 | steps <- 400 73 | x_steps <- numeric(steps) 74 | y_steps <- numeric(steps) 75 | for (i in seq_len(steps)) { 76 | x_steps[i] <- as.numeric(x) 77 | y_steps[i] <- as.numeric(y) 78 | optim$zero_grad() 79 | z <- beale(x, y) 80 | z$backward() 81 | optim$step() 82 | } 83 | print(paste0("starting value = ", beale(x0, y0))) 84 | print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 85 | } 86 | } 87 | \references{ 88 | Liyuan Liu, Haoming Jiang, Pengcheng He, Weizhu Chen, 89 | Xiaodong Liu, Jianfeng Gao, Jiawei Han, 90 | "On the Variance of the Adaptive Learning Rate and Beyond", 91 | International Conference on Learning Representations (ICLR) 2020. 92 | https://arxiv.org/abs/1908.03265 93 | } 94 | \author{ 95 | Gilberto Camara, \email{gilberto.camara@inpe.br} 96 | 97 | Daniel Falbel, \email{daniel.falble@gmail.com} 98 | 99 | Rolf Simoes, \email{rolf.simoes@inpe.br} 100 | 101 | Felipe Souza, \email{lipecaso@gmail.com} 102 | 103 | Alber Sanchez, \email{alber.ipia@inpe.br} 104 | } 105 | -------------------------------------------------------------------------------- /man/optim_swats.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/swats.R 3 | \name{optim_swats} 4 | \alias{optim_swats} 5 | \title{SWATS optimizer} 6 | \usage{ 7 | optim_swats( 8 | params, 9 | lr = 0.01, 10 | betas = c(0.9, 0.999), 11 | eps = 1e-08, 12 | weight_decay = 0, 13 | nesterov = FALSE 14 | ) 15 | } 16 | \arguments{ 17 | \item{params}{List of parameters to optimize.} 18 | 19 | \item{lr}{Learning rate (default: 1e-3)} 20 | 21 | \item{betas}{Coefficients computing running averages of gradient 22 | and its square (default: (0.9, 0.999)).} 23 | 24 | \item{eps}{Term added to the denominator to improve numerical 25 | stability (default: 1e-8).} 26 | 27 | \item{weight_decay}{Weight decay (L2 penalty) (default: 0).} 28 | 29 | \item{nesterov}{Enables Nesterov momentum (default: False).} 30 | } 31 | \value{ 32 | A torch optimizer object implementing the \code{step} method. 33 | } 34 | \description{ 35 | R implementation of the SWATS optimizer proposed 36 | by Shekar and Sochee (2018). 
37 | We used the implementation available at 38 | https://github.com/jettify/pytorch-optimizer/ 39 | Thanks to Nikolay Novik for providing the pytorch code. 40 | 41 | From the abstract by the paper by Shekar and Sochee (2018): 42 | Adaptive optimization methods such as Adam, Adagrad or RMSprop 43 | have been found to generalize poorly compared to 44 | Stochastic gradient descent (SGD). These methods tend to perform well i 45 | in the initial portion of training but are outperformed by SGD at 46 | later stages of training. We investigate a hybrid strategy that begins 47 | training with an adaptive method and switches to SGD 48 | when a triggering condition is satisfied. 49 | The condition we propose relates to the projection of Adam 50 | steps on the gradient subspace. By design, the monitoring process 51 | for this condition adds very little overhead and does not increase 52 | the number of hyperparameters in the optimizer. 53 | } 54 | \examples{ 55 | if (torch::torch_is_installed()) { 56 | # function to demonstrate optimization 57 | beale <- function(x, y) { 58 | log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 59 | } 60 | # define optimizer 61 | optim <- torchopt::optim_swats 62 | # define hyperparams 63 | opt_hparams <- list(lr = 0.01) 64 | 65 | # starting point 66 | x0 <- 3 67 | y0 <- 3 68 | # create tensor 69 | x <- torch::torch_tensor(x0, requires_grad = TRUE) 70 | y <- torch::torch_tensor(y0, requires_grad = TRUE) 71 | # instantiate optimizer 72 | optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 73 | # run optimizer 74 | steps <- 400 75 | x_steps <- numeric(steps) 76 | y_steps <- numeric(steps) 77 | for (i in seq_len(steps)) { 78 | x_steps[i] <- as.numeric(x) 79 | y_steps[i] <- as.numeric(y) 80 | optim$zero_grad() 81 | z <- beale(x, y) 82 | z$backward() 83 | optim$step() 84 | } 85 | print(paste0("starting value = ", beale(x0, y0))) 86 | print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 87 | } 88 | } 89 | \references{ 90 | Nitish Shirish Keskar, Richard Socher 91 | "Improving Generalization Performance by Switching from Adam to SGD". 92 | International Conference on Learning Representations (ICLR) 2018. 
93 | https://arxiv.org/abs/1712.07628 94 | } 95 | \author{ 96 | Gilberto Camara, \email{gilberto.camara@inpe.br} 97 | 98 | Daniel Falbel, \email{daniel.falble@gmail.com} 99 | 100 | Rolf Simoes, \email{rolf.simoes@inpe.br} 101 | 102 | Felipe Souza, \email{lipecaso@gmail.com} 103 | 104 | Alber Sanchez, \email{alber.ipia@inpe.br} 105 | } 106 | -------------------------------------------------------------------------------- /man/optim_yogi.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/yogi.R 3 | \name{optim_yogi} 4 | \alias{optim_yogi} 5 | \title{Yogi optimizer} 6 | \usage{ 7 | optim_yogi( 8 | params, 9 | lr = 0.01, 10 | betas = c(0.9, 0.999), 11 | eps = 0.001, 12 | initial_accumulator = 1e-06, 13 | weight_decay = 0 14 | ) 15 | } 16 | \arguments{ 17 | \item{params}{List of parameters to optimize.} 18 | 19 | \item{lr}{Learning rate (default: 1e-3)} 20 | 21 | \item{betas}{Coefficients computing running averages of gradient 22 | and its square (default: (0.9, 0.999))} 23 | 24 | \item{eps}{Term added to the denominator to improve numerical 25 | stability (default: 1e-8)} 26 | 27 | \item{initial_accumulator}{Initial values for first and 28 | second moments.} 29 | 30 | \item{weight_decay}{Weight decay (L2 penalty) (default: 0)} 31 | } 32 | \value{ 33 | A torch optimizer object implementing the \code{step} method. 34 | } 35 | \description{ 36 | R implementation of the Yogi optimizer proposed 37 | by Zaheer et al.(2019). We used the implementation available at 38 | https://github.com/jettify/pytorch-optimizer/blob/master/torch_optimizer/yogi.py. 39 | Thanks to Nikolay Novik for providing the pytorch code. 40 | 41 | The original implementation is licensed using the Apache-2.0 software license. 42 | This implementation is also licensed using Apache-2.0 license. 43 | 44 | From the abstract by the paper by Zaheer et al.(2019): 45 | Adaptive gradient methods that rely on scaling gradients 46 | down by the square root of exponential moving averages 47 | of past squared gradients, such RMSProp, Adam, Adadelta have 48 | found wide application in optimizing the nonconvex problems 49 | that arise in deep learning. However, it has been recently 50 | demonstrated that such methods can fail to converge even 51 | in simple convex optimization settings. 52 | Yogi is a new adaptive optimization algorithm, 53 | which controls the increase in effective learning rate, 54 | leading to even better performance with similar theoretical 55 | guarantees on convergence. Extensive experiments show that 56 | Yogi with very little hyperparameter tuning outperforms 57 | methods such as Adam in several challenging machine learning tasks. 
58 | } 59 | \examples{ 60 | if (torch::torch_is_installed()) { 61 | # function to demonstrate optimization 62 | beale <- function(x, y) { 63 | log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 64 | } 65 | # define optimizer 66 | optim <- torchopt::optim_yogi 67 | # define hyperparams 68 | opt_hparams <- list(lr = 0.01) 69 | 70 | # starting point 71 | x0 <- 3 72 | y0 <- 3 73 | # create tensor 74 | x <- torch::torch_tensor(x0, requires_grad = TRUE) 75 | y <- torch::torch_tensor(y0, requires_grad = TRUE) 76 | # instantiate optimizer 77 | optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 78 | # run optimizer 79 | steps <- 400 80 | x_steps <- numeric(steps) 81 | y_steps <- numeric(steps) 82 | for (i in seq_len(steps)) { 83 | x_steps[i] <- as.numeric(x) 84 | y_steps[i] <- as.numeric(y) 85 | optim$zero_grad() 86 | z <- beale(x, y) 87 | z$backward() 88 | optim$step() 89 | } 90 | print(paste0("starting value = ", beale(x0, y0))) 91 | print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 92 | } 93 | } 94 | \references{ 95 | Manzil Zaheer, Sashank Reddi, Devendra Sachan, Satyen Kale, Sanjiv Kumar, 96 | "Adaptive Methods for Nonconvex Optimization", 97 | Advances in Neural Information Processing Systems 31 (NeurIPS 2018). 98 | https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization 99 | } 100 | \author{ 101 | Gilberto Camara, \email{gilberto.camara@inpe.br} 102 | 103 | Rolf Simoes, \email{rolf.simoes@inpe.br} 104 | 105 | Felipe Souza, \email{lipecaso@gmail.com} 106 | 107 | Alber Sanchez, \email{alber.ipia@inpe.br} 108 | } 109 | -------------------------------------------------------------------------------- /man/state-set.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils-state.R 3 | \name{state<-} 4 | \alias{state<-} 5 | \title{Imported function} 6 | \usage{ 7 | state(self) <- value 8 | } 9 | \description{ 10 | Code lifted from a internal function of madgrad package. 11 | Set 'state' attribute of an object. 12 | } 13 | \author{ 14 | Daniel Falbel, \email{dfalbel@gmail.com} 15 | } 16 | \keyword{internal} 17 | -------------------------------------------------------------------------------- /man/state.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils-state.R 3 | \name{state} 4 | \alias{state} 5 | \title{Imported function} 6 | \usage{ 7 | state(self) 8 | } 9 | \description{ 10 | Code lifted from a internal function of madgrad package. 11 | Get 'state' attribute of an object. 
12 | } 13 | \author{ 14 | Daniel Falbel, \email{dfalbel@gmail.com} 15 | } 16 | \keyword{internal} 17 | -------------------------------------------------------------------------------- /man/test_optim.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils-testopt.R 3 | \name{test_optim} 4 | \alias{test_optim} 5 | \title{Test optimization function} 6 | \usage{ 7 | test_optim( 8 | optim, 9 | ..., 10 | opt_hparams = list(), 11 | test_fn = "beale", 12 | steps = 200, 13 | pt_start_color = "#5050FF7F", 14 | pt_end_color = "#FF5050FF", 15 | ln_color = "#FF0000FF", 16 | ln_weight = 2, 17 | bg_xy_breaks = 100, 18 | bg_z_breaks = 32, 19 | bg_palette = "viridis", 20 | ct_levels = 10, 21 | ct_labels = FALSE, 22 | ct_color = "#FFFFFF7F", 23 | plot_each_step = FALSE 24 | ) 25 | } 26 | \arguments{ 27 | \item{optim}{Torch optimizer function.} 28 | 29 | \item{...}{Additional parameters (passed to \code{image} function).} 30 | 31 | \item{opt_hparams}{A list with optimizer initialization parameters (default: \code{list()}). 32 | If missing, for each optimizer its individual defaults will be used.} 33 | 34 | \item{test_fn}{A test function (default \code{"beale"}). You can also pass 35 | a list with 2 elements. The first should be a function that will be optimized 36 | and the second is a function that returns a named vector with \code{x0}, \code{y0} 37 | (the starting points) and \code{xmax}, \code{xmin}, \code{ymax} and \code{ymin} (the domain). 38 | An example: \code{c(x0 = x0, y0 = y0, xmax = 5, xmin = -5, ymax = 5, ymin = -5)}} 39 | 40 | \item{steps}{Number of steps to run (default \code{200}).} 41 | 42 | \item{pt_start_color}{Starting point color (default \code{"#5050FF7F"})} 43 | 44 | \item{pt_end_color}{Ending point color (default \code{"#FF5050FF"})} 45 | 46 | \item{ln_color}{Line path color (default \code{"#FF0000FF"})} 47 | 48 | \item{ln_weight}{Line path weight (default \code{2})} 49 | 50 | \item{bg_xy_breaks}{Background X and Y resolution (default \code{100})} 51 | 52 | \item{bg_z_breaks}{Background Z resolution (default \code{32})} 53 | 54 | \item{bg_palette}{Background palette (default \code{"viridis"})} 55 | 56 | \item{ct_levels}{Contour levels (default \code{10})} 57 | 58 | \item{ct_labels}{Should show contour labels? (default \code{FALSE})} 59 | 60 | \item{ct_color}{Contour color (default \code{"#FFFFFF7F"})} 61 | 62 | \item{plot_each_step}{Should output each step? (default \code{FALSE})} 63 | } 64 | \value{ 65 | No return value, called for producing animated gifs 66 | } 67 | \description{ 68 | \code{test_optim()} function is useful to visualize how optimizers solve the 69 | minimization problem by showing the convergence path using a test function. 70 | User can choose any test optimization 71 | \href{https://en.wikipedia.org/wiki/Test_functions_for_optimization}{functions} 72 | provided by \code{torchopt}: 73 | 74 | \code{"beale"}, \code{"booth"}, \code{"bukin_n6"}, \code{"easom"}, \code{"goldstein_price"}, 75 | \code{"himmelblau"}, \code{"levi_n13"}, \code{"matyas"}, \code{"rastrigin"}, 76 | \code{"rosenbrock"}, and \code{"sphere"}. 77 | 78 | Besides these functions, users can pass any function that receives two 79 | numerical values and returns a scalar. 80 | 81 | Optimization functions are useful to evaluate characteristics of optimization 82 | algorithms, such as convergence rate, precision, robustness, and performance. 
83 | These functions give an idea about the different situations that optimization 84 | algorithms can face. 85 | 86 | Function \code{test_function()} plots the 2D space of a test optimization function. 87 | } 88 | \author{ 89 | Rolf Simoes, \email{rolf.simoes@inpe.br} 90 | } 91 | -------------------------------------------------------------------------------- /man/torchopt-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/torchopt-package.R 3 | \docType{package} 4 | \name{torchopt-package} 5 | \alias{torchopt} 6 | \alias{torchopt-package} 7 | \title{torchopt: Advanced Optimizers for Torch} 8 | \description{ 9 | Optimizers for 'torch' deep learning library. These functions include recent results published in the literature and are not part of the optimizers offered in 'torch'. Prospective users should test these optimizers with their data, since performance depends on the specific problem being solved. The package includes the following optimizers: (a) 'adabelief' by Zhuang et al. (2020), \href{https://arxiv.org/abs/2010.07468}{arXiv:2010.07468}; (b) 'adabound' by Luo et al. (2019), \href{https://arxiv.org/abs/1902.09843}{arXiv:1902.09843}; (c) 'adahessian' by Yao et al. (2021), \href{https://arxiv.org/abs/2006.00719}{arXiv:2006.00719}; (d) 'adamw' by Loshchilov & Hutter (2019), \href{https://arxiv.org/abs/1711.05101}{arXiv:1711.05101}; (e) 'madgrad' by Defazio and Jelassi (2021), \href{https://arxiv.org/abs/2101.11075}{arXiv:2101.11075}; (f) 'nadam' by Dozat (2016), \url{https://openreview.net/pdf/OM0jvwB8jIp57ZJjtNEZ.pdf}; (g) 'qhadam' by Ma and Yarats (2019), \href{https://arxiv.org/abs/1810.06801}{arXiv:1810.06801}; (h) 'radam' by Liu et al. (2019), \href{https://arxiv.org/abs/1908.03265}{arXiv:1908.03265}; (i) 'swats' by Keskar and Socher (2018), \href{https://arxiv.org/abs/1712.07628}{arXiv:1712.07628}; (j) 'yogi' by Zaheer et al. (2019), .
10 | } 11 | \seealso{ 12 | Useful links: 13 | \itemize{ 14 | \item \url{https://github.com/e-sensing/torchopt/} 15 | } 16 | 17 | } 18 | \author{ 19 | \strong{Maintainer}: Gilberto Camara \email{gilberto.camara@inpe.br} 20 | 21 | Authors: 22 | \itemize{ 23 | \item Rolf Simoes \email{rolf.simoes@inpe.br} 24 | \item Daniel Falbel \email{daniel.falbel@gmail.com} 25 | \item Felipe Souza \email{felipe.carvalho@inpe.br} 26 | \item Alber Sanchez \email{alber.ipia@inpe.br} 27 | } 28 | 29 | } 30 | \keyword{internal} 31 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(torchopt) 3 | test_check("torchopt") 4 | 5 | -------------------------------------------------------------------------------- /tests/testthat/test-optimizers.R: -------------------------------------------------------------------------------- 1 | library(torchopt) 2 | beale <- function(x, y) { 3 | log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 4 | } 5 | test_optim_valid <- function(optim, 6 | opt_hparams = list(lr = 0.01), 7 | test_fn = "beale", 8 | steps = 100) { 9 | 10 | # get starting points 11 | domain_fn <- get(paste0("domain_",test_fn), 12 | envir = asNamespace("torchopt"), 13 | inherits = FALSE) 14 | # get gradient function 15 | test_fn <- get(test_fn, 16 | envir = asNamespace("torchopt"), 17 | inherits = FALSE) 18 | 19 | # starting point 20 | dom <- domain_fn() 21 | x0 <- dom[["x0"]] 22 | y0 <- dom[["y0"]] 23 | 24 | # create tensor 25 | x <- torch::torch_tensor(x0, requires_grad = TRUE) 26 | y <- torch::torch_tensor(y0, requires_grad = TRUE) 27 | 28 | # instantiate optimizer 29 | optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 30 | 31 | # run optimizer 32 | x_steps <- numeric(steps) 33 | y_steps <- numeric(steps) 34 | for (i in seq_len(steps)) { 35 | x_steps[i] <- as.numeric(x) 36 | y_steps[i] <- as.numeric(y) 37 | optim$zero_grad() 38 | z <- test_fn(x, y) 39 | z$backward() 40 | optim$step() 41 | } 42 | return(list(x_steps = x_steps, 43 | y_steps = y_steps)) 44 | } 45 | test_that("adamw optimizer", { 46 | testthat::skip_on_cran() 47 | set.seed(12345) 48 | xy <- test_optim_valid( 49 | optim = torchopt::optim_adamw, 50 | opt_hparams = list(lr = 0.05), 51 | steps = 400, 52 | test_fn = "beale" 53 | ) 54 | 55 | x0 <- xy[[1]][1] 56 | y0 <- xy[[2]][1] 57 | x400 <- xy[[1]][400] 58 | y400 <- xy[[2]][400] 59 | test_fn0 <- beale(x0, y0) 60 | test_fn400 <- beale(x400, y400) 61 | 62 | expect_true(test_fn0 > test_fn400) 63 | }) 64 | 65 | test_that("adabelief optimizer", { 66 | testthat::skip_on_cran() 67 | set.seed(12345) 68 | xy <- test_optim_valid( 69 | optim = optim_adabelief, 70 | opt_hparams = list(lr = 0.5), 71 | steps = 400, 72 | test_fn = "beale" 73 | ) 74 | test_fn0 <- beale(xy[[1]][1], xy[[2]][1]) 75 | test_fn400 <- beale(xy[[1]][400], xy[[2]][400]) 76 | 77 | expect_true(test_fn0 > test_fn400) 78 | }) 79 | 80 | test_that("adabound optimizer", { 81 | testthat::skip_on_cran() 82 | set.seed(12345) 83 | xy <- test_optim_valid( 84 | optim = optim_adabound, 85 | opt_hparams = list(lr = 0.5), 86 | steps = 400, 87 | test_fn = "beale" 88 | ) 89 | test_fn0 <- beale(xy[[1]][1], xy[[2]][1]) 90 | test_fn400 <- beale(xy[[1]][400], xy[[2]][400]) 91 | 92 | expect_true(test_fn0 > test_fn400) 93 | 94 | }) 95 | test_that("madgrad optimizer", { 96 | testthat::skip_on_cran() 97 | set.seed(12345) 98 | xy <- test_optim_valid( 99 | optim = 
optim_madgrad, 100 | opt_hparams = list(lr = 0.1), 101 | steps = 400, 102 | test_fn = "beale" 103 | ) 104 | test_fn0 <- beale(xy[[1]][1], xy[[2]][1]) 105 | test_fn400 <- beale(xy[[1]][400], xy[[2]][400]) 106 | 107 | expect_true(test_fn0 > test_fn400) 108 | 109 | }) 110 | 111 | test_that("nadam optimizer", { 112 | testthat::skip_on_cran() 113 | set.seed(12345) 114 | xy <- test_optim_valid( 115 | optim = optim_nadam, 116 | opt_hparams = list(lr = 0.1), 117 | steps = 400, 118 | test_fn = "beale" 119 | ) 120 | test_fn0 <- beale(xy[[1]][1], xy[[2]][1]) 121 | test_fn400 <- beale(xy[[1]][400], xy[[2]][400]) 122 | 123 | expect_true(test_fn0 > test_fn400) 124 | 125 | }) 126 | test_that("qhadam optimizer", { 127 | testthat::skip_on_cran() 128 | set.seed(12345) 129 | xy <- test_optim_valid( 130 | optim = optim_qhadam, 131 | opt_hparams = list(lr = 0.1), 132 | steps = 400, 133 | test_fn = "beale" 134 | ) 135 | test_fn0 <- beale(xy[[1]][1], xy[[2]][1]) 136 | test_fn400 <- beale(xy[[1]][400], xy[[2]][400]) 137 | 138 | expect_true(test_fn0 > test_fn400) 139 | 140 | }) 141 | test_that("radam optimizer", { 142 | testthat::skip_on_cran() 143 | set.seed(12345) 144 | xy <- test_optim_valid( 145 | optim = optim_radam, 146 | opt_hparams = list(lr = 0.1), 147 | steps = 400, 148 | test_fn = "beale" 149 | ) 150 | test_fn0 <- beale(xy[[1]][1], xy[[2]][1]) 151 | test_fn400 <- beale(xy[[1]][400], xy[[2]][400]) 152 | 153 | expect_true(test_fn0 > test_fn400) 154 | 155 | }) 156 | test_that("swats optimizer", { 157 | testthat::skip_on_cran() 158 | set.seed(234) 159 | xy <- test_optim_valid( 160 | optim = optim_swats, 161 | opt_hparams = list(lr = 0.1), 162 | steps = 400, 163 | test_fn = "beale" 164 | ) 165 | test_fn0 <- beale(xy[[1]][1], xy[[2]][1]) 166 | test_fn400 <- beale(xy[[1]][400], xy[[2]][400]) 167 | 168 | expect_true(test_fn0 > test_fn400) 169 | 170 | }) 171 | test_that("yogi optimizer", { 172 | testthat::skip_on_cran() 173 | set.seed(66) 174 | xy <- test_optim_valid( 175 | optim = optim_yogi, 176 | opt_hparams = list(lr = 0.1), 177 | steps = 400, 178 | test_fn = "beale" 179 | ) 180 | test_fn0 <- beale(xy[[1]][1], xy[[2]][1]) 181 | test_fn400 <- beale(xy[[1]][400], xy[[2]][400]) 182 | 183 | expect_true(test_fn0 > test_fn400) 184 | 185 | }) 186 | -------------------------------------------------------------------------------- /tests/testthat/test-utils-testopt.R: -------------------------------------------------------------------------------- 1 | test_that("can use custom functions with test_opt", { 2 | testthat::skip_on_cran() 3 | set.seed(1) 4 | expect_error(regexp = NA,{ 5 | test_optim( 6 | optim = optim_adamw, 7 | test_fn = list(beale, domain_beale), 8 | opt_hparams = list(lr = 0.05), 9 | steps = 100, 10 | plot_each_step = TRUE 11 | ) 12 | }) 13 | }) 14 | -------------------------------------------------------------------------------- /torchopt.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 4 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageCheckArgs: --as-cran 22 | --------------------------------------------------------------------------------