├── .Rbuildignore ├── .github ├── .gitignore └── workflows │ ├── R-CMD-check.yaml │ ├── pkgdown.yaml │ ├── pr-commands.yaml │ └── test-coverage.yaml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── DESCRIPTION ├── NAMESPACE ├── NEWS.md ├── R ├── adabelief.R ├── adabound.R ├── adahessian.R ├── adamw.R ├── madgrad.R ├── nadam.R ├── qhadam.R ├── radam.R ├── swats.R ├── torchopt-package.R ├── utils-state.R ├── utils-testopt.R └── yogi.R ├── README.Rmd ├── README.md ├── codecov.yml ├── inst └── WORDLIST ├── man ├── figures │ ├── README-chunk-label-.gif │ ├── README-chunk-label-1.jpeg │ ├── README-chunk-label-10.jpeg │ ├── README-chunk-label-2.jpeg │ ├── README-chunk-label-3.jpeg │ ├── README-chunk-label-4.jpeg │ ├── README-chunk-label-5.jpeg │ ├── README-chunk-label-6.jpeg │ ├── README-chunk-label-7.jpeg │ ├── README-chunk-label-8.jpeg │ ├── README-chunk-label-9.jpeg │ ├── README-gif_opt-.gif │ ├── README-opt_fun-1.png │ ├── README-pressure-1.png │ ├── README-test_adabelief-.gif │ ├── README-test_adabound-.gif │ ├── README-test_adahessian-.gif │ ├── README-test_adamw-.gif │ ├── README-test_madgrad-.gif │ ├── README-test_nadam-.gif │ ├── README-test_qhadam-.gif │ ├── README-test_radam-.gif │ ├── README-test_swats-.gif │ └── README-test_yogi-.gif ├── optim_adabelief.Rd ├── optim_adabound.Rd ├── optim_adahessian.Rd ├── optim_adamw.Rd ├── optim_madgrad.Rd ├── optim_nadam.Rd ├── optim_qhadam.Rd ├── optim_radam.Rd ├── optim_swats.Rd ├── optim_yogi.Rd ├── state-set.Rd ├── state.Rd ├── test_optim.Rd └── torchopt-package.Rd ├── tests ├── testthat.R └── testthat │ ├── test-optimizers.R │ └── test-utils-testopt.R └── torchopt.Rproj /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^README\.md$ 2 | ^README\.Rmd$ 3 | ^README_cache$ 4 | ^.*\.Rproj$ 5 | ^\.Rproj\.user$ 6 | ^codecov\.yml$ 7 | ^\.github$ 8 | ^CODE_OF_CONDUCT\.md$ 9 | ^contributing.md$ 10 | ^README_cache$ 11 | ^LICENSE\.md$ 12 | ^README_cache$ 13 | ^man/figures/*$ 14 | ^\.Rprofile$ 15 | ^\.RData$ 16 | ^\.tmp$ 17 | ^CRAN-SUBMISSION$ 18 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/master/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | # 4 | # NOTE: This workflow is overkill for most R packages and 5 | # check-standard.yaml is likely a better choice. 6 | # usethis::use_github_action("check-standard") will install it. 
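# TORCH_TEST=1 and TORCH_INSTALL=1 in the env section below are read by the
# 'torch' R package: TORCH_INSTALL lets it fetch its LibTorch backend
# non-interactively on the CI runner, and TORCH_TEST enables the tests that
# need that backend (assumed behaviour, based on common torch CI setups).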
7 | on: 8 | push: 9 | branches: [main, master] 10 | pull_request: 11 | branches: [main, master] 12 | 13 | name: R-CMD-check 14 | 15 | jobs: 16 | R-CMD-check: 17 | runs-on: ${{ matrix.config.os }} 18 | 19 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 20 | 21 | strategy: 22 | fail-fast: false 23 | matrix: 24 | config: 25 | - {os: macOS-latest, r: 'release'} 26 | - {os: windows-latest, r: 'release'} 27 | 28 | # Use older ubuntu to maximise backward compatibility 29 | - {os: ubuntu-18.04, r: 'devel', http-user-agent: 'release'} 30 | - {os: ubuntu-18.04, r: 'release'} 31 | 32 | 33 | env: 34 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 35 | R_KEEP_PKG_SOURCE: yes 36 | TORCH_TEST: 1 37 | TORCH_INSTALL: 1 38 | 39 | steps: 40 | - uses: actions/checkout@v2 41 | 42 | - uses: r-lib/actions/setup-pandoc@v1 43 | 44 | - uses: r-lib/actions/setup-r@v2 45 | with: 46 | r-version: ${{ matrix.config.r }} 47 | 48 | - uses: r-lib/actions/setup-r-dependencies@v2 49 | with: 50 | extra-packages: any::rcmdcheck 51 | needs: check 52 | 53 | - uses: r-lib/actions/check-r-package@v2 54 | with: 55 | args: 'c("--no-multiarch", "--no-manual")' 56 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/master/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | release: 7 | types: [published] 8 | workflow_dispatch: 9 | 10 | name: pkgdown 11 | 12 | jobs: 13 | pkgdown: 14 | runs-on: ubuntu-latest 15 | env: 16 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 17 | TORCH_INSTALL: 1 18 | steps: 19 | - uses: actions/checkout@v2 20 | 21 | - uses: r-lib/actions/setup-pandoc@v2 22 | 23 | - uses: r-lib/actions/setup-r@v2 24 | with: 25 | use-public-rspm: true 26 | 27 | - uses: r-lib/actions/setup-r-dependencies@v2 28 | with: 29 | extra-packages: any::pkgdown 30 | needs: website 31 | 32 | - name: Deploy package 33 | run: | 34 | git config --local user.name "$GITHUB_ACTOR" 35 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" 36 | Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)' 37 | -------------------------------------------------------------------------------- /.github/workflows/pr-commands.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/master/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | issue_comment: 5 | types: [created] 6 | 7 | name: Commands 8 | 9 | jobs: 10 | document: 11 | if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/document') }} 12 | name: document 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | steps: 17 | - uses: actions/checkout@v2 18 | 19 | - uses: r-lib/actions/pr-fetch@v1 20 | with: 21 | repo-token: ${{ secrets.GITHUB_TOKEN }} 22 | 23 | - uses: r-lib/actions/setup-r@v1 24 | with: 25 | use-public-rspm: true 26 | 27 | - uses: r-lib/actions/setup-r-dependencies@v1 28 | with: 29 | extra-packages: roxygen2 30 | 31 | - name: Document 32 | run: Rscript -e 'roxygen2::roxygenise()' 33 | 34 | - name: commit 35 | run: | 36 | git config --local user.name "$GITHUB_ACTOR" 37 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" 38 | git add man/\* NAMESPACE 39 | git commit -m 'Document' 40 | 41 | - uses: r-lib/actions/pr-push@v1 42 | with: 43 | repo-token: ${{ secrets.GITHUB_TOKEN }} 44 | 45 | style: 46 | if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/style') }} 47 | name: style 48 | runs-on: ubuntu-latest 49 | env: 50 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 51 | steps: 52 | - uses: actions/checkout@v2 53 | 54 | - uses: r-lib/actions/pr-fetch@v1 55 | with: 56 | repo-token: ${{ secrets.GITHUB_TOKEN }} 57 | 58 | - uses: r-lib/actions/setup-r@v1 59 | 60 | - name: Install dependencies 61 | run: Rscript -e 'install.packages("styler")' 62 | 63 | - name: Style 64 | run: Rscript -e 'styler::style_pkg()' 65 | 66 | - name: commit 67 | run: | 68 | git config --local user.name "$GITHUB_ACTOR" 69 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" 70 | git add \*.R 71 | git commit -m 'Style' 72 | 73 | - uses: r-lib/actions/pr-push@v1 74 | with: 75 | repo-token: ${{ secrets.GITHUB_TOKEN }} 76 | -------------------------------------------------------------------------------- /.github/workflows/test-coverage.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/master/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: test-coverage 10 | 11 | jobs: 12 | test-coverage: 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | TORCH_INSTALL: 1 17 | TORCH_TEST: 1 18 | 19 | steps: 20 | - uses: actions/checkout@v2 21 | 22 | - uses: r-lib/actions/setup-r@v2 23 | 24 | - uses: r-lib/actions/setup-r-dependencies@v2 25 | with: 26 | extra-packages: any::covr 27 | 28 | - name: Test coverage 29 | run: covr::codecov() 30 | shell: Rscript {0} 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Session Data files 6 | .RData 7 | 8 | # User-specific files 9 | .Ruserdata 10 | 11 | # Example code in package build process 12 | *-Ex.R 13 | 14 | # Output files from R CMD build 15 | /*.tar.gz 16 | 17 | # Output files from R CMD check 18 | /*.Rcheck/ 19 | 20 | # RStudio files 21 | .Rproj.user/ 22 | .Rprofile 23 | 24 | # produced vignettes 25 | vignettes/*.html 26 | vignettes/*.pdf 27 | 28 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 29 | .httr-oauth 30 | 31 | # knitr and R markdown default cache directories 32 | *_cache/ 33 | /cache/ 34 | README_cache/* 35 | README_files/* 36 | 37 | # Temporary files created by R markdown 38 | *.utf8.md 39 | *.knit.md 40 | 41 | # R Environment Variables 42 | .Renviron 43 | 44 | # CRAN comments 45 | cran-comments.md 46 | CRAN-SUBMISSION 47 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity and 10 | orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the overall 26 | community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards 42 | of acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies 54 | when an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail 56 | address, posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at rolf.simoes@inpe.br. 63 | All complaints will be reviewed and investigated promptly and fairly. 64 | 65 | All community leaders are obligated to respect the privacy and security of the 66 | reporter of any incident. 67 | 68 | ## Enforcement Guidelines 69 | 70 | Community leaders will follow these Community Impact Guidelines in determining 71 | the consequences for any action they deem in violation of this Code of Conduct: 72 | 73 | ### 1. Correction 74 | 75 | **Community Impact**: Use of inappropriate language or other behavior deemed 76 | unprofessional or unwelcome in the community. 77 | 78 | **Consequence**: A private, written warning from community leaders, providing 79 | clarity around the nature of the violation and an explanation of why the 80 | behavior was inappropriate. A public apology may be requested. 81 | 82 | ### 2. Warning 83 | 84 | **Community Impact**: A violation through a single incident or series of 85 | actions. 86 | 87 | **Consequence**: A warning with consequences for continued behavior. No 88 | interaction with the people involved, including unsolicited interaction with 89 | those enforcing the Code of Conduct, for a specified period of time. This 90 | includes avoiding interactions in community spaces as well as external channels 91 | like social media. 
Violating these terms may lead to a temporary or permanent 92 | ban. 93 | 94 | ### 3. Temporary Ban 95 | 96 | **Community Impact**: A serious violation of community standards, including 97 | sustained inappropriate behavior. 98 | 99 | **Consequence**: A temporary ban from any sort of interaction or public 100 | communication with the community for a specified period of time. No public or 101 | private interaction with the people involved, including unsolicited interaction 102 | with those enforcing the Code of Conduct, is allowed during this period. 103 | Violating these terms may lead to a permanent ban. 104 | 105 | ### 4. Permanent Ban 106 | 107 | **Community Impact**: Demonstrating a pattern of violation of community 108 | standards, including sustained inappropriate behavior, harassment of an 109 | individual, or aggression toward or disparagement of classes of individuals. 110 | 111 | **Consequence**: A permanent ban from any sort of public interaction within the 112 | community. 113 | 114 | ## Attribution 115 | 116 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 117 | version 2.0, 118 | available at . 119 | 120 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 121 | enforcement ladder](https://github.com/mozilla/diversity). 122 | 123 | [homepage]: https://www.contributor-covenant.org 124 | 125 | For answers to common questions about this code of conduct, see the FAQ at 126 | . Translations are available at . 127 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Type: Package 2 | Package: torchopt 3 | Title: Advanced Optimizers for Torch 4 | Version: 0.1.4 5 | Authors@R: c( 6 | person("Gilberto", "Camara", , "gilberto.camara.inpe@gmail.com", role = c("aut", "cre")), 7 | person("Rolf", "Simoes", , "rolf.simoes@inpe.br", role = "aut"), 8 | person("Daniel", "Falbel", , "daniel.falbel@gmail.com", role = "aut"), 9 | person("Felipe", "Souza", , "felipe.carvalho@inpe.br", role = "aut") 10 | ) 11 | Maintainer: Gilberto Camara 12 | Description: Optimizers for 'torch' deep learning library. These 13 | functions include recent results published in the literature and are 14 | not part of the optimizers offered in 'torch'. Prospective users 15 | should test these optimizers with their data, since performance 16 | depends on the specific problem being solved. The packages includes 17 | the following optimizers: (a) 'adabelief' by Zhuang et al (2020), 18 | ; (b) 'adabound' by Luo et al.(2019), 19 | ; (c) 'adahessian' by Yao et al.(2021) 20 | ; (d) 'adamw' by Loshchilov & Hutter (2019), 21 | ; (e) 'madgrad' by Defazio and Jelassi (2021), 22 | ; (f) 'nadam' by Dozat (2019), 23 | ; (g) 'qhadam' by 24 | Ma and Yarats(2019), ; (h) 'radam' by Liu et al. 25 | (2019), ; (i) 'swats' by Shekar and Sochee (2018), 26 | ; (j) 'yogi' by Zaheer et al.(2019), 27 | . 
28 | License: Apache License (>= 2) 29 | URL: https://github.com/e-sensing/torchopt/ 30 | Depends: 31 | R (>= 4.0.0) 32 | Imports: 33 | graphics, 34 | grDevices, 35 | stats, 36 | torch 37 | Suggests: 38 | testthat 39 | ByteCompile: true 40 | Encoding: UTF-8 41 | Language: en-US 42 | Roxygen: list(markdown = TRUE) 43 | RoxygenNote: 7.2.0 44 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(optim_adabelief) 4 | export(optim_adabound) 5 | export(optim_adahessian) 6 | export(optim_adamw) 7 | export(optim_madgrad) 8 | export(optim_nadam) 9 | export(optim_qhadam) 10 | export(optim_radam) 11 | export(optim_swats) 12 | export(optim_yogi) 13 | export(test_optim) 14 | importFrom(grDevices,hcl.colors) 15 | importFrom(graphics,contour) 16 | importFrom(graphics,image) 17 | importFrom(graphics,lines) 18 | importFrom(graphics,points) 19 | importFrom(stats,runif) 20 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # R Implementation of Advanced Optimizers for Torch 2 | 3 | ### What's new in torchopt version 0.1.4 4 | 5 | * fixed errors in DESCRIPTION 6 | 7 | ### What's new in torchopt version 0.1.2 8 | 9 | * adahessian optimizer 10 | * nadam optimizer 11 | * radam optimizer 12 | * qhadam optimizer 13 | * swats optimizer 14 | 15 | ### What's new in torchopt version 0.1.1 16 | 17 | * adabelief optimizer 18 | * adabound optimizer 19 | * adamw optimizer 20 | * madgrad optimizer 21 | * yogi optimizer 22 | 23 | -------------------------------------------------------------------------------- /R/adabelief.R: -------------------------------------------------------------------------------- 1 | #' @title Adabelief optimizer 2 | #' 3 | #' @name optim_adabelief 4 | #' 5 | #' @author Gilberto Camara, \email{gilberto.camara@@inpe.br} 6 | #' @author Rolf Simoes, \email{rolf.simoes@@inpe.br} 7 | #' @author Felipe Souza, \email{lipecaso@@gmail.com} 8 | #' @author Alber Sanchez, \email{alber.ipia@@inpe.br} 9 | #' 10 | #' @description 11 | #' R implementation of the adabelief optimizer proposed 12 | #' by Zhuang et al (2020). We used the pytorch implementation 13 | #' developed by the authors which is available at 14 | #' https://github.com/jettify/pytorch-optimizer. 15 | #' Thanks to Nikolay Novik of his work on python optimizers. 16 | #' 17 | #' The original implementation is licensed using the Apache-2.0 software license. 18 | #' This implementation is also licensed using Apache-2.0 license. 19 | #' 20 | #' From the abstract by the paper by Zhuang et al (2021): 21 | #' We propose Adabelief to simultaneously achieve three goals: 22 | #' fast convergence as in adaptive methods, good generalization as in SGD, 23 | #' and training stability. The intuition for AdaBelief is to adapt 24 | #' the stepsize according to the "belief" in the current gradient direction. 25 | #' Viewing the exponential moving average of the noisy gradient 26 | #' as the prediction of the gradient at the next time step, 27 | #' if the observed gradient greatly deviates from the prediction, 28 | #' we distrust the current observation and take a small step; 29 | #' if the observed gradient is close to the prediction, 30 | #' we trust it and take a large step. 
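#'
#' In terms of the update implemented below, the second-moment estimate is an
#' exponential moving average of the squared prediction error (g_t - m_t)^2
#' rather than of g_t^2, and the bias-corrected first moment m_t is divided by
#' the square root of that estimate (plus eps); when `rectify = TRUE`, the
#' RAdam-style rectified step size is applied on top of this.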
31 | 32 | #' @references 33 | #' Juntang Zhuang, Tommy Tang, Yifan Ding, Sekhar Tatikonda, 34 | #' Nicha Dvornek, Xenophon Papademetris, James S. Duncan. 35 | #' "Adabelief Optimizer: Adapting Stepsizes by the Belief in Observed Gradients", 36 | #' 34th Conference on Neural Information Processing Systems (NeurIPS 2020), 37 | #' Vancouver, Canada. 38 | #' https://arxiv.org/abs/2010.07468 39 | #' 40 | #' @param params List of parameters to optimize. 41 | #' @param lr Learning rate (default: 1e-3) 42 | #' @param betas Coefficients for computing running averages 43 | #' of gradient and its square (default: (0.9, 0.999)) 44 | #' @param eps Term added to the denominator to improve numerical 45 | #' stability (default: 1e-16) 46 | #' @param weight_decay Weight decay (L2 penalty) (default: 0) 47 | #' @param weight_decouple Use decoupled weight decay as is done in AdamW? 48 | #' @param fixed_decay This is used when weight_decouple is set as True. 49 | #' When fixed_decay == True, weight decay is 50 | #' W_new = W_old - W_old * decay. 51 | #' When fixed_decay == False, the weight decay is 52 | #' W_new = W_old - W_old * decay * learning_rate. 53 | #' In this case, weight decay decreases with learning rate. 54 | #' @param rectify Perform the rectified update similar to RAdam? 55 | #' 56 | #' @returns 57 | #' A torch optimizer object implementing the `step` method. 58 | #' 59 | #' @examples 60 | #' if (torch::torch_is_installed()) { 61 | 62 | #' # function to demonstrate optimization 63 | #' beale <- function(x, y) { 64 | #' log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 65 | #' } 66 | #' # define optimizer 67 | #' optim <- torchopt::optim_adabelief 68 | #' # define hyperparams 69 | #' opt_hparams <- list(lr = 0.01) 70 | #' 71 | #' # starting point 72 | #' x0 <- 3 73 | #' y0 <- 3 74 | #' # create tensor 75 | #' x <- torch::torch_tensor(x0, requires_grad = TRUE) 76 | #' y <- torch::torch_tensor(y0, requires_grad = TRUE) 77 | #' # instantiate optimizer 78 | #' optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 79 | #' # run optimizer 80 | #' steps <- 400 81 | #' x_steps <- numeric(steps) 82 | #' y_steps <- numeric(steps) 83 | #' for (i in seq_len(steps)) { 84 | #' x_steps[i] <- as.numeric(x) 85 | #' y_steps[i] <- as.numeric(y) 86 | #' optim$zero_grad() 87 | #' z <- beale(x, y) 88 | #' z$backward() 89 | #' optim$step() 90 | #' } 91 | #' print(paste0("starting value = ", beale(x0, y0))) 92 | #' print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 93 | #' } 94 | #' @export 95 | optim_adabelief <- torch::optimizer( 96 | "optim_adabelief", 97 | initialize = function(params, 98 | lr = 0.001, 99 | betas = c(0.9, 0.999), 100 | eps = 1.0e-08, 101 | weight_decay = 1.0e-06, 102 | weight_decouple = TRUE, 103 | fixed_decay = FALSE, 104 | rectify = TRUE) { 105 | if (lr <= 0.0) 106 | stop("Learning rate must be positive.", call. = FALSE) 107 | if (eps < 0.0) 108 | stop("eps must be non-negative.", call. = FALSE) 109 | if (betas[1] > 1.0 | betas[1] <= 0.0) 110 | stop("Invalid beta parameter.", call. = FALSE) 111 | if (betas[2] > 1.0 | betas[1] <= 0.0) 112 | stop("Invalid beta parameter.", call. = FALSE) 113 | if (weight_decay < 0) 114 | stop("Invalid weight_decay value.", call. 
= FALSE) 115 | 116 | 117 | defaults = list( 118 | lr = lr, 119 | betas = betas, 120 | eps = eps, 121 | weight_decay = weight_decay 122 | ) 123 | super$initialize(params, defaults) 124 | 125 | self$weight_decouple <- weight_decouple 126 | self$rectify <- rectify 127 | self$fixed_decay <- fixed_decay 128 | }, 129 | step = function(closure = NULL){ 130 | loop_fun <- function(group, param, g, p) { 131 | if (is.null(param$grad)) 132 | next 133 | grad <- param$grad 134 | 135 | # Variable initialization 136 | beta1 <- group[['betas']][[1]] 137 | beta2 <- group[['betas']][[2]] 138 | weight_decay <- group[['weight_decay']] 139 | eps <- group[["eps"]] 140 | lr <- group[['lr']] 141 | 142 | # State initialization 143 | if (length(state(param)) == 0) { 144 | state(param) <- list() 145 | state(param)[["rho_inf"]] <- 2.0 / (1.0 - beta2) - 1.0 146 | state(param)[["step"]] <- 0 147 | # Exponential moving average of gradient values 148 | state(param)[["exp_avg"]] <- torch::torch_zeros_like(param) 149 | # Exponential moving average of squared gradient values 150 | state(param)[["exp_avg_var"]] <- torch::torch_zeros_like(param) 151 | } 152 | # Define variables for optimization function 153 | exp_avg <- state(param)[["exp_avg"]] 154 | exp_avg_var <- state(param)[["exp_avg_var"]] 155 | 156 | 157 | # take one step 158 | state(param)[["step"]] <- state(param)[["step"]] + 1 159 | # bias correction 160 | bias_correction1 <- 1 - beta1^state(param)[['step']] 161 | bias_correction2 <- 1 - beta2^state(param)[['step']] 162 | 163 | # perform weight decay, check if decoupled weight decay 164 | if (self$weight_decouple) { 165 | if (!self$fixed_decay) 166 | param$mul_(1.0 - lr * weight_decay) 167 | else 168 | param$mul_(1.0 - weight_decay) 169 | } else { 170 | if (weight_decay != 0) 171 | grad$add_(param, alpha = weight_decay) 172 | } 173 | # update the first moment 174 | exp_avg$mul_(beta1)$add_(grad, alpha = 1 - beta1) 175 | grad_residual <- grad - exp_avg 176 | # Decay the second moment 177 | exp_avg_var$mul_(beta2)$addcmul_(grad_residual, 178 | grad_residual, 179 | value = (1 - beta2)) 180 | 181 | # calculate denominator 182 | denom <- (exp_avg_var$add_(eps)$sqrt()/sqrt(bias_correction2))$add_(eps) 183 | 184 | if (!self$rectify) { 185 | # calculate step size 186 | step_size <- lr / bias_correction1 187 | param$addcdiv_(exp_avg, denom, value = -step_size) 188 | } else { 189 | # calculate rho_t 190 | rho_inf <- state(param)[["rho_inf"]] 191 | step <- state(param)[["step"]] 192 | state(param)[["rho_t"]] <- rho_inf - 193 | (2 * step * beta2 ^ step) / 194 | (1.0 - beta2 ^ step) 195 | rho_t <- state(param)[["rho_t"]] 196 | 197 | # more conservative since it's an approximated value 198 | if (rho_t > 4) { 199 | # perform Adam style update if variance is small 200 | rt = ( 201 | (rho_t - 4.0) * (rho_t - 2.0) * rho_inf 202 | / (rho_inf - 4.0) 203 | / (rho_inf - 2.0) 204 | / rho_t 205 | ) 206 | rt = sqrt(rt) 207 | step_size <- rt * lr / bias_correction1 208 | param$addcdiv_(exp_avg, 209 | denom, 210 | value = -step_size 211 | ) 212 | } else 213 | # perform SGD style update 214 | param$add_(exp_avg, alpha = -lr) 215 | } 216 | } 217 | private$step_helper(closure, loop_fun) 218 | } 219 | ) 220 | -------------------------------------------------------------------------------- /R/adabound.R: -------------------------------------------------------------------------------- 1 | #' @title Adabound optimizer 2 | #' 3 | #' @name optim_adabound 4 | #' 5 | #' @author Rolf Simoes, \email{rolf.simoes@@inpe.br} 6 | #' @author Felipe Souza, 
\email{lipecaso@@gmail.com} 7 | #' @author Alber Sanchez, \email{alber.ipia@@inpe.br} 8 | #' @author Gilberto Camara, \email{gilberto.camara@@inpe.br} 9 | #' 10 | #' @description 11 | #' R implementation of the AdaBound optimizer proposed 12 | #' by Luo et al.(2019). We used the implementation available at 13 | #' https://github.com/jettify/pytorch-optimizer/blob/master/torch_optimizer/yogi.py. 14 | #' Thanks to Nikolay Novik for providing the pytorch code. 15 | #' 16 | #' The original implementation is licensed using the Apache-2.0 software license. 17 | #' This implementation is also licensed using Apache-2.0 license. 18 | #' 19 | #' AdaBound is a variant of the Adam stochastic optimizer which is 20 | #' designed to be more robust to extreme learning rates. 21 | #' Dynamic bounds are employed on learning rates, 22 | #' where the lower and upper bound are initialized as zero and 23 | #' infinity respectively, and they both smoothly converge to a 24 | #' constant final step size. AdaBound can be regarded as an adaptive 25 | #' method at the beginning of training, and thereafter it gradually and 26 | #' smoothly transforms to SGD (or with momentum) as the time step increases. 27 | #' 28 | #' @references 29 | #' Liangchen Luo, Yuanhao Xiong, Yan Liu, Xu Sun, 30 | #' "Adaptive Gradient Methods with Dynamic Bound of Learning Rate", 31 | #' International Conference on Learning Representations (ICLR), 2019. 32 | #' https://arxiv.org/abs/1902.09843 33 | #' 34 | #' @param params List of parameters to optimize. 35 | #' @param lr Learning rate (default: 1e-3) 36 | #' @param betas Coefficients computing running averages of gradient 37 | #' and its square (default: (0.9, 0.999)) 38 | #' @param final_lr Final (SGD) learning rate (default: 0.1) 39 | #' @param gamma Convergence speed of the bound functions 40 | #' (default: 1e-3) 41 | #' @param eps Term added to the denominator to improve numerical 42 | #' stability (default: 1e-8) 43 | #' @param weight_decay Weight decay (L2 penalty) (default: 0) 44 | #' 45 | #' @returns 46 | #' A torch optimizer object implementing the `step` method. 
47 | #' @examples 48 | #' if (torch::torch_is_installed()) { 49 | 50 | #' # function to demonstrate optimization 51 | #' beale <- function(x, y) { 52 | #' log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 53 | #' } 54 | #' # define optimizer 55 | #' optim <- torchopt::optim_adabound 56 | #' # define hyperparams 57 | #' opt_hparams <- list(lr = 0.01) 58 | #' 59 | #' # starting point 60 | #' x0 <- 3 61 | #' y0 <- 3 62 | #' # create tensor 63 | #' x <- torch::torch_tensor(x0, requires_grad = TRUE) 64 | #' y <- torch::torch_tensor(y0, requires_grad = TRUE) 65 | #' # instantiate optimizer 66 | #' optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 67 | #' # run optimizer 68 | #' steps <- 400 69 | #' x_steps <- numeric(steps) 70 | #' y_steps <- numeric(steps) 71 | #' for (i in seq_len(steps)) { 72 | #' x_steps[i] <- as.numeric(x) 73 | #' y_steps[i] <- as.numeric(y) 74 | #' optim$zero_grad() 75 | #' z <- beale(x, y) 76 | #' z$backward() 77 | #' optim$step() 78 | #' } 79 | #' print(paste0("starting value = ", beale(x0, y0))) 80 | #' print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 81 | #' } 82 | #' @export 83 | optim_adabound <- torch::optimizer( 84 | "optim_adabound", 85 | initialize = function(params, 86 | lr = 1e-3, 87 | betas = c(0.9, 0.999), 88 | final_lr = 0.1, 89 | gamma = 1e-3, 90 | eps = 1e-8, 91 | weight_decay = 0) { 92 | if (lr <= 0.0) 93 | stop("Learning rate must be positive.", call. = FALSE) 94 | if (eps < 0.0) 95 | stop("eps must be non-negative.", call. = FALSE) 96 | if (betas[1] > 1.0 | betas[1] <= 0.0) 97 | stop("Invalid beta parameter.", call. = FALSE) 98 | if (betas[2] > 1.0 | betas[1] <= 0.0) 99 | stop("Invalid beta parameter.", call. = FALSE) 100 | if (final_lr < 0.0) 101 | stop("Learning rate must be positive.", call. = FALSE) 102 | if (gamma > 1.0 | gamma <= 0.0) 103 | stop("Invalid gamma parameter.", call. = FALSE) 104 | if (weight_decay < 0) 105 | stop("Invalid weight_decay value.", call. 
= FALSE) 106 | 107 | defaults = list( 108 | lr = lr, 109 | betas = betas, 110 | final_lr = final_lr, 111 | gamma = gamma, 112 | eps = eps, 113 | weight_decay = weight_decay 114 | ) 115 | 116 | self$base_lr <- lr 117 | super$initialize(params, defaults) 118 | }, 119 | step = function(closure = NULL) { 120 | loop_fun <- function(group, param, g, p) { 121 | if (is.null(param$grad)) 122 | next 123 | grad <- param$grad 124 | 125 | # State initialization 126 | if (length(state(param)) == 0) { 127 | state(param) <- list() 128 | state(param)[["step"]] <- 0 129 | # Exponential moving average of gradient values 130 | state(param)[["exp_avg"]] <- torch::torch_zeros_like( 131 | param, 132 | memory_format = torch::torch_preserve_format() 133 | ) 134 | # Exponential moving average of squared gradient values 135 | state(param)[["exp_avg_sq"]] <- torch::torch_zeros_like( 136 | param, 137 | memory_format = torch::torch_preserve_format() 138 | ) 139 | } 140 | exp_avg <- state(param)[["exp_avg"]] 141 | exp_avg_sq <- state(param)[["exp_avg_sq"]] 142 | beta1 <- group[['betas']][[1]] 143 | beta2 <- group[['betas']][[2]] 144 | 145 | state(param)[["step"]] <- state(param)[["step"]] + 1 146 | 147 | if (group[['weight_decay']] != 0) 148 | grad <- grad$add(param, alpha = group[['weight_decay']]) 149 | 150 | # Decay the first and second moment 151 | # running average coefficient 152 | exp_avg$mul_(beta1)$add_(grad, alpha = 1 - beta1) 153 | exp_avg_sq$mul_(beta2)$addcmul_(grad, grad, value = 1 - beta2) 154 | 155 | # bias correction 156 | bias_correction1 <- 1 - beta1^state(param)[['step']] 157 | bias_correction2 <- 1 - beta2^state(param)[['step']] 158 | step_size <- group[['lr']] * 159 | sqrt(bias_correction2) / bias_correction1 160 | 161 | # Applies bounds on actual learning rate 162 | # lr_scheduler cannot affect final_lr, this is a workaround to 163 | # apply lr decay 164 | final_lr <- group[['final_lr']] * group[['lr']] / self$base_lr 165 | lower_bound <- final_lr * 166 | (1 - 1 / (group[['gamma']] * state(param)[['step']] + 1)) 167 | upper_bound <- final_lr * 168 | (1 + 1 / (group[['gamma']] * state(param)[['step']])) 169 | 170 | # calculate denominator 171 | denom = exp_avg_sq$sqrt()$add_(group[['eps']]) 172 | 173 | step_size <- torch::torch_full_like( 174 | input = denom, 175 | fill_value = step_size) 176 | step_size$div_(denom)$clamp_(lower_bound, upper_bound)$mul_(exp_avg) 177 | 178 | param$add_(-step_size) 179 | } 180 | 181 | private$step_helper(closure, loop_fun) 182 | } 183 | ) 184 | -------------------------------------------------------------------------------- /R/adahessian.R: -------------------------------------------------------------------------------- 1 | #'@title Adahessian optimizer 2 | #' 3 | #'@name optim_adahessian 4 | #' 5 | #'@author Rolf Simoes, \email{rolf.simoes@@inpe.br} 6 | #'@author Felipe Souza, \email{lipecaso@@gmail.com} 7 | #'@author Alber Sanchez, \email{alber.ipia@@inpe.br} 8 | #'@author Gilberto Camara, \email{gilberto.camara@@inpe.br} 9 | #' 10 | #'@description R implementation of the Adahessian optimizer proposed 11 | #' by Yao et al.(2020). The original implementation is available at 12 | #' https://github.com/amirgholami/adahessian. 13 | #' 14 | #' @references 15 | #' Yao, Z., Gholami, A., Shen, S., Mustafa, M., Keutzer, K., 16 | #' & Mahoney, M. (2021). 17 | #' ADAHESSIAN: An Adaptive Second Order Optimizer for Machine Learning. 18 | #' Proceedings of the AAAI Conference on Artificial Intelligence, 35(12), 19 | #' 10665-10673. 
20 | #' https://arxiv.org/abs/2006.00719 21 | #' 22 | #' @param params Iterable of parameters to optimize. 23 | #' @param lr Learning rate (default: 0.15). 24 | #' @param betas Coefficients for computing 25 | #' running averages of gradient 26 | #' and is square(default: (0.9, 0.999)). 27 | #' @param eps Term added to the denominator to improve 28 | #' numerical stability (default: 1e-4). 29 | #' @param weight_decay L2 penalty (default: 0). 30 | #' @param hessian_power Hessian power (default: 1.0). 31 | #' 32 | #' 33 | #' @returns 34 | #' An optimizer object implementing the `step` and `zero_grad` methods. 35 | #' @export 36 | optim_adahessian <- torch::optimizer( 37 | "optim_adahessian", 38 | initialize = function( 39 | params, 40 | lr = 0.15, 41 | betas = c(0.9, 0.999), 42 | eps = 1e-4, 43 | weight_decay = 0, 44 | hessian_power = 0.5 45 | ) { 46 | if (lr <= 0.0) 47 | rlang::abort("Learning rate must be positive.") 48 | if (eps <= 0.0) 49 | rlang::abort("eps must be non-negative.") 50 | if (betas[1] > 1.0 | betas[1] <= 0.0) 51 | rlang::abort("Invalid beta1 parameter.") 52 | if (betas[2] > 1.0 | betas[2] <= 0.0) 53 | rlang::abort("Invalid beta2 parameter.") 54 | if (hessian_power > 1.0 | hessian_power <= 0.0) 55 | rlang::abort("Invalid hessian power parameter.") 56 | if (weight_decay < 0) 57 | rlang::abort("Invalid weight_decay value") 58 | 59 | torch::torch_manual_seed(sample.int(10^5, 1)) 60 | 61 | defaults = list( 62 | lr = lr, 63 | betas = betas, 64 | eps = eps, 65 | hessian_power = hessian_power, 66 | weight_decay = weight_decay 67 | ) 68 | super$initialize(params, defaults) 69 | }, 70 | # Get an estimate of Hessian Trace. 71 | # This is done by computing the Hessian vector product with a random 72 | # vector v at the current gradient point, to estimate Hessian trace by 73 | # computing the gradient of . 74 | get_trace = function(params, grads){ 75 | # Check backward was called with create_graph set to True 76 | purrr::map(grads, function(g) { 77 | if (purrr::is_null(g$grad_fn)) { 78 | msg <- paste("Gradient tensor does not have grad_fn", 79 | "When calling loss.backward(), set create_graph to True.") 80 | rlang::abort(msg) 81 | } 82 | }) 83 | # list of random tensors [-1, 1] to estimate Hessian matrix diagonal 84 | v <- purrr::map(params, function(p){ 85 | return(2 * torch::torch_randint_like(input = p, 86 | low = 0, 87 | high = 2) - 1) 88 | }) 89 | # Computes the sum of gradients of outputs w.r.t. the inputs. 
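        # With Rademacher vectors v (entries +1 or -1), differentiating
        # sum(grad * v) w.r.t. the parameters yields the Hessian-vector
        # product H v, and E[v * (H v)] is the Hessian diagonal
        # (Hutchinson's estimator); because the entries of v are +1/-1,
        # abs(H v) used below equals abs(v * H v).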
90 | hvs <- torch::autograd_grad( 91 | outputs = grads, 92 | inputs = params, 93 | grad_outputs = v, 94 | retain_graph = TRUE, 95 | create_graph = TRUE 96 | ) 97 | 98 | # calculate hutchinson trace 99 | # approximation of hessian diagonal 100 | hutchinson_trace <- purrr::map(seq_along(hvs), function(hv_ind){ 101 | hv <- hvs[[hv_ind]] 102 | param_size <- hv$size() 103 | hv_abs <- hv$abs() 104 | if (length(param_size) <= 2) { 105 | return(hv_abs) 106 | } else if (length(param_size) == 3) { 107 | return(torch::torch_mean(hv_abs, dim = 1, keepdim = TRUE)) 108 | } else if (length(param_size) == 4) { 109 | return(torch::torch_mean(hv_abs, dim = c(2, 3), keepdim = TRUE)) 110 | } else 111 | rlang::abort("Only 1D to 4D tensors are supported.") 112 | }) 113 | return(hutchinson_trace) 114 | }, 115 | step = function(closure = NULL) { 116 | 117 | # # Flatten params and grads into lists 118 | groups <- self$param_groups[[1]] 119 | params <- purrr::map(groups$params, function(pg){ 120 | return(pg) 121 | }) 122 | grads <- purrr::map(params, function(p) { 123 | if (!is.null(p$grad)) 124 | return(p$grad) 125 | }) 126 | # Get the Hessian diagonal 127 | self$hut_traces <- self$get_trace(params, grads) 128 | 129 | loop_fun <- function(group, param, g, p) { 130 | 131 | # state initialization 132 | if (length(state(param)) == 0) { 133 | state(param) <- list() 134 | state(param)[["step"]] <- 0 135 | state(param)[["exp_avg"]] <- torch::torch_zeros_like(param) 136 | state(param)[["exp_hessian_diag_sq"]] <- torch::torch_zeros_like(param) 137 | } 138 | # Perform correct stepweight decay as in AdamW 139 | # param$mul_(1 - group[['lr']] * group[['weight_decay']]) 140 | 141 | exp_avg <- state(param)[["exp_avg"]] 142 | exp_hessian_diag_sq <- state(param)[["exp_hessian_diag_sq"]] 143 | 144 | # increase step 145 | state(param)[["step"]] <- state(param)[["step"]] + 1 146 | 147 | # parameters for optimizer 148 | beta1 <- group[['betas']][[1]] 149 | beta2 <- group[['betas']][[2]] 150 | lr <- group[['lr']] 151 | eps <- group[['eps']] 152 | wd <- group[['weight_decay']] 153 | k <- group[['hessian_power']] 154 | step <- state(param)[["step"]] 155 | 156 | 157 | # Decay the first and second moment 158 | # running average coefficient 159 | exp_avg$mul_(beta1)$add_(param$grad, alpha = 1 - beta1) 160 | exp_hessian_diag_sq$mul_(beta2)$addcmul_( 161 | self$hut_traces[[p]], 162 | self$hut_traces[[p]], 163 | value = 1 - beta2 164 | ) 165 | 166 | # bias correction 167 | bias_correction1 <- 1 - beta1 ^ step 168 | bias_correction2 <- 1 - beta2 ^ step 169 | sqrt_bc2 <- sqrt(bias_correction2) 170 | 171 | 172 | # make the square root, and the Hessian power 173 | denom <- ((exp_hessian_diag_sq$sqrt() ^ k) / (sqrt_bc2 ^ k))$add_(eps) 174 | 175 | # update 176 | param$sub_(lr * (exp_avg / bias_correction1 / denom 177 | + wd * param)) 178 | } 179 | private$step_helper(closure, loop_fun) 180 | } 181 | ) 182 | -------------------------------------------------------------------------------- /R/adamw.R: -------------------------------------------------------------------------------- 1 | #' @title AdamW optimizer 2 | #' 3 | #' @name optim_adamw 4 | #' 5 | #' @author Gilberto Camara, \email{gilberto.camara@@inpe.br} 6 | #' @author Rolf Simoes, \email{rolf.simoes@@inpe.br} 7 | #' @author Felipe Souza, \email{lipecaso@@gmail.com} 8 | #' @author Alber Sanchez, \email{alber.ipia@@inpe.br} 9 | #' 10 | #' @description 11 | #' R implementation of the AdamW optimizer proposed 12 | #' by Loshchilov & Hutter (2019). 
We used the pytorch implementation 13 | #' developed by Collin Donahue-Oponski available at: 14 | #' https://gist.github.com/colllin/0b146b154c4351f9a40f741a28bff1e3 15 | #' 16 | #' From the abstract by the paper by Loshchilov & Hutter (2019): 17 | #' L2 regularization and weight decay regularization are equivalent for standard 18 | #' stochastic gradient descent (when rescaled by the learning rate), 19 | #' but as we demonstrate this is not the case for adaptive gradient algorithms, 20 | #' such as Adam. While common implementations of these algorithms 21 | #' employ L2 regularization (often calling it “weight decay” 22 | #' in what may be misleading due to the inequivalence we expose), 23 | #' we propose a simple modification to recover the original formulation of 24 | #' weight decay regularization by decoupling the weight decay from the optimization 25 | #' steps taken w.r.t. the loss function 26 | #' 27 | #' @references 28 | #' Ilya Loshchilov, Frank Hutter, 29 | #' "Decoupled Weight Decay Regularization", 30 | #' International Conference on Learning Representations (ICLR) 2019. 31 | #' https://arxiv.org/abs/1711.05101 32 | #' 33 | #' @param params List of parameters to optimize. 34 | #' @param lr Learning rate (default: 1e-3) 35 | #' @param betas Coefficients computing running averages of gradient 36 | #' and its square (default: (0.9, 0.999)) 37 | #' @param eps Term added to the denominator to improve numerical 38 | #' stability (default: 1e-8) 39 | #' @param weight_decay Weight decay (L2 penalty) (default: 1e-6) 40 | #' 41 | #' @returns 42 | #' A torch optimizer object implementing the `step` method. 43 | #' @examples 44 | #' if (torch::torch_is_installed()) { 45 | 46 | #' # function to demonstrate optimization 47 | #' beale <- function(x, y) { 48 | #' log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 49 | #' } 50 | #' # define optimizer 51 | #' optim <- torchopt::optim_adamw 52 | #' # define hyperparams 53 | #' opt_hparams <- list(lr = 0.01) 54 | #' 55 | #' # starting point 56 | #' x0 <- 3 57 | #' y0 <- 3 58 | #' # create tensor 59 | #' x <- torch::torch_tensor(x0, requires_grad = TRUE) 60 | #' y <- torch::torch_tensor(y0, requires_grad = TRUE) 61 | #' # instantiate optimizer 62 | #' optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 63 | #' # run optimizer 64 | #' steps <- 400 65 | #' x_steps <- numeric(steps) 66 | #' y_steps <- numeric(steps) 67 | #' for (i in seq_len(steps)) { 68 | #' x_steps[i] <- as.numeric(x) 69 | #' y_steps[i] <- as.numeric(y) 70 | #' optim$zero_grad() 71 | #' z <- beale(x, y) 72 | #' z$backward() 73 | #' optim$step() 74 | #' } 75 | #' print(paste0("starting value = ", beale(x0, y0))) 76 | #' print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 77 | #' } 78 | #' @export 79 | optim_adamw <- torch::optimizer( 80 | "optim_adamw", 81 | initialize = function(params, 82 | lr = 0.01, 83 | betas = c(0.9, 0.999), 84 | eps = 1e-8, 85 | weight_decay = 1e-6) { 86 | if (lr <= 0.0) 87 | stop("Learning rate must be positive.", call. = FALSE) 88 | if (eps < 0.0) 89 | stop("eps must be non-negative.", call. = FALSE) 90 | if (betas[1] > 1.0 | betas[1] <= 0.0) 91 | stop("Invalid beta parameter.", call. = FALSE) 92 | if (betas[2] > 1.0 | betas[1] <= 0.0) 93 | stop("Invalid beta parameter.", call. = FALSE) 94 | if (weight_decay < 0) 95 | stop("Invalid weight_decay value.", call. 
= FALSE) 96 | 97 | defaults = list( 98 | lr = lr, 99 | betas = betas, 100 | eps = eps, 101 | weight_decay = weight_decay 102 | ) 103 | super$initialize(params, defaults) 104 | }, 105 | step = function(closure = NULL){ 106 | loop_fun <- function(group, param, g, p) { 107 | if (is.null(param$grad)) 108 | next 109 | grad <- param$grad 110 | 111 | # State initialization 112 | if (length(state(param)) == 0) { 113 | state(param) <- list() 114 | state(param)[["step"]] <- 0 115 | # Exponential moving average of gradient values 116 | state(param)[["exp_avg"]] <- torch::torch_zeros_like(param) 117 | # Exponential moving average of squared gradient values 118 | state(param)[["exp_avg_sq"]] <- torch::torch_zeros_like(param) 119 | } 120 | # Define variables for optimization function 121 | exp_avg <- state(param)[["exp_avg"]] 122 | exp_avg_sq <- state(param)[["exp_avg_sq"]] 123 | beta1 <- group[['betas']][[1]] 124 | beta2 <- group[['betas']][[2]] 125 | weight_decay <- group[['weight_decay']] 126 | eps <- group[["eps"]] 127 | lr <- group[['lr']] 128 | 129 | # take one step 130 | state(param)[["step"]] <- state(param)[["step"]] + 1 131 | 132 | # Decay the first moment 133 | exp_avg$mul_(beta1)$add_(grad, alpha = 1 - beta1) 134 | # Decay the second moment 135 | exp_avg_sq$mul_(beta2)$addcmul_(grad, grad, value = (1 - beta2)) 136 | 137 | # calculate denominator 138 | denom = exp_avg_sq$sqrt()$add_(eps) 139 | 140 | # bias correction 141 | bias_correction1 <- 1 - beta1^state(param)[['step']] 142 | bias_correction2 <- 1 - beta2^state(param)[['step']] 143 | # calculate step size 144 | step_size <- lr * sqrt(bias_correction2) / bias_correction1 145 | 146 | # L2 correction (different from adam) 147 | if (weight_decay != 0) 148 | param$add_(param, -weight_decay * lr) 149 | # go to next step 150 | param$addcdiv_(exp_avg, denom, value = -step_size) 151 | } 152 | private$step_helper(closure, loop_fun) 153 | } 154 | ) 155 | -------------------------------------------------------------------------------- /R/madgrad.R: -------------------------------------------------------------------------------- 1 | #' @title MADGRAD optimizer 2 | #' 3 | #' @name optim_madgrad 4 | #' 5 | #' @author Daniel Falbel, \email{dfalbel@@gmail.com} 6 | #' 7 | #' @description 8 | #' A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic 9 | #' Optimization (MADGRAD) is a general purpose optimizer that 10 | #' can be used in place of SGD or Adam may converge faster and generalize 11 | #' better. Currently GPU-only. Typically, the same learning rate schedule 12 | #' that is used for SGD or Adam may be used. The overall learning rate is 13 | #' not comparable to either method and should be determined by a 14 | #' hyper-parameter sweep. 15 | #' 16 | #' MADGRAD requires less weight decay than other methods, often as little as 17 | #' zero. Momentum values used for SGD or Adam's beta1 should work here also. 18 | #' 19 | #' On sparse problems both weight_decay and momentum should be set to 0. 20 | #' (not yet supported in the R implementation). 21 | #' 22 | #' 23 | #' @references 24 | #' Aaron Defazio, Samy Jelassi, 25 | #' "Adaptivity without Compromise: A Momentumized, Adaptive, Dual 26 | #' Averaged Gradient Method for Stochastic Optimization". 27 | #' https://arxiv.org/abs/2101.11075 28 | #' 29 | #' @param params List of parameters to optimize. 30 | #' @param lr Learning rate (default: 1e-2). 31 | #' @param momentum Momentum value in the range [0,1) (default: 0.9). 32 | #' @param weight_decay Weight decay, i.e. 
a L2 penalty (default: 0). 33 | #' @param eps Term added to the denominator outside of 34 | #' the root operation to improve numerical stability 35 | #' (default: 1e-6). 36 | #' 37 | #' @returns 38 | #' A torch optimizer object implementing the `step` method. 39 | #' @examples 40 | #' if (torch::torch_is_installed()) { 41 | 42 | #' # function to demonstrate optimization 43 | #' beale <- function(x, y) { 44 | #' log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 45 | #' } 46 | #' # define optimizer 47 | #' optim <- torchopt::optim_madgrad 48 | #' # define hyperparams 49 | #' opt_hparams <- list(lr = 0.01) 50 | #' 51 | #' # starting point 52 | #' x0 <- 3 53 | #' y0 <- 3 54 | #' # create tensor 55 | #' x <- torch::torch_tensor(x0, requires_grad = TRUE) 56 | #' y <- torch::torch_tensor(y0, requires_grad = TRUE) 57 | #' # instantiate optimizer 58 | #' optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 59 | #' # run optimizer 60 | #' steps <- 400 61 | #' x_steps <- numeric(steps) 62 | #' y_steps <- numeric(steps) 63 | #' for (i in seq_len(steps)) { 64 | #' x_steps[i] <- as.numeric(x) 65 | #' y_steps[i] <- as.numeric(y) 66 | #' optim$zero_grad() 67 | #' z <- beale(x, y) 68 | #' z$backward() 69 | #' optim$step() 70 | #' } 71 | #' print(paste0("starting value = ", beale(x0, y0))) 72 | #' print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 73 | #' } 74 | #' @export 75 | optim_madgrad <- torch::optimizer( 76 | "optim_madgrad", 77 | initialize = function(params, 78 | lr = 1e-2, 79 | momentum = 0.9, 80 | weight_decay = 0, 81 | eps = 1e-6) { 82 | 83 | if (momentum < 0 || momentum >= 1) 84 | stop("Momentum must be in the range [0,1].") 85 | 86 | if (lr <= 0) 87 | stop("Learning rate must be positive.") 88 | 89 | if (weight_decay < 0) 90 | stop("Weight decay must be non-negative.") 91 | 92 | if (eps < 0) 93 | stop("Eps must be non-negative.") 94 | 95 | defaults <- list(lr = lr, 96 | eps = eps, 97 | momentum = momentum, 98 | weight_decay = weight_decay) 99 | 100 | super$initialize(params, defaults) 101 | }, 102 | step = function(closure = NULL) { 103 | if (is.null(self$k)) 104 | self$k <- 0 105 | loss <- super$step_helper( 106 | closure = closure, 107 | loop_fun = function(group, param, ...) 
{ 108 | eps <- group$eps 109 | lr <- group$lr + eps 110 | decay <- group$weight_decay 111 | momentum <- group$momentum 112 | 113 | ck <- 1 - momentum 114 | lamb <- lr * (self$k + 1)^0.5 115 | 116 | grad <- param$grad 117 | 118 | if (is.null(state(param))) { 119 | state(param) <- list() 120 | state(param)[["grad_sum_sq"]] <- torch::torch_zeros_like(param)$detach() 121 | state(param)[["s"]] <- torch::torch_zeros_like(param)$detach() 122 | if (momentum != 0) 123 | state(param)[["x0"]] <- param$clone() 124 | } 125 | 126 | if (decay != 0) { 127 | grad$add_(param, alpha = decay) 128 | } 129 | 130 | if (momentum == 0) { 131 | # Compute x_0 from other known quantities 132 | rms <- state(param)[["grad_sum_sq"]]$pow(1 / 3)$add_(eps) 133 | x0 <- param$addcdiv(state(param)[["s"]], rms, value = 1) 134 | } else { 135 | x0 <- state(param)[["x0"]] 136 | } 137 | 138 | # Accumulate second moments 139 | state(param)[["grad_sum_sq"]]$addcmul_(grad, grad, value = lamb) 140 | rms <- state(param)[["grad_sum_sq"]]$pow(1 / 3)$add_(eps) 141 | 142 | # Update s 143 | state(param)[["s"]]$add_(grad, alpha = lamb) 144 | 145 | # Step 146 | if (momentum == 0) { 147 | param$copy_(x0$addcdiv(state(param)[["s"]], rms, value = -1)) 148 | } else { 149 | z <- x0$addcdiv(state(param)[["s"]], rms, value = -1) 150 | } 151 | 152 | # p is a moving average of z 153 | param$mul_(1 - ck)$add_(z, alpha = ck) 154 | 155 | }) 156 | self$k <- self$k + 1 157 | loss 158 | } 159 | ) 160 | 161 | 162 | state <- function(self) { 163 | attr(self, "state") 164 | } 165 | 166 | `state<-` <- function(self, value) { 167 | attr(self, "state") <- value 168 | self 169 | } 170 | 171 | -------------------------------------------------------------------------------- /R/nadam.R: -------------------------------------------------------------------------------- 1 | #' @title Nadam optimizer 2 | #' 3 | #' @name optim_nadam 4 | #' 5 | #' @author Gilberto Camara, \email{gilberto.camara@@inpe.br} 6 | #' @author Rolf Simoes, \email{rolf.simoes@@inpe.br} 7 | #' @author Felipe Souza, \email{lipecaso@@gmail.com} 8 | #' @author Alber Sanchez, \email{alber.ipia@@inpe.br} 9 | #' 10 | #' @description 11 | #' R implementation of the Nadam optimizer proposed 12 | #' by Dazat (2016). 13 | #' 14 | #' From the abstract by the paper by Dozat (2016): 15 | #' This work aims to improve upon the recently proposed and 16 | #' rapidly popularized optimization algorithm Adam (Kingma & Ba, 2014). 17 | #' Adam has two main components—a momentum component and an adaptive 18 | #' learning rate component. However, regular momentum can be shown conceptually 19 | #' and empirically to be inferior to a similar algorithm known as 20 | #' Nesterov’s accelerated gradient (NAG). 21 | #' 22 | #' @references 23 | #' Timothy Dozat, 24 | #' "Incorporating Nesterov Momentum into Adam", 25 | #' International Conference on Learning Representations (ICLR) 2016. 26 | #' https://openreview.net/pdf/OM0jvwB8jIp57ZJjtNEZ.pdf 27 | #' 28 | #' @param params List of parameters to optimize. 29 | #' @param lr Learning rate (default: 1e-3) 30 | #' @param betas Coefficients computing running averages of gradient 31 | #' and its square (default: (0.9, 0.999)). 32 | #' @param eps Term added to the denominator to improve numerical 33 | #' stability (default: 1e-8). 34 | #' @param weight_decay Weight decay (L2 penalty) (default: 0). 35 | #' @param momentum_decay Momentum_decay (default: 4e-3). 36 | #' 37 | #' 38 | #' @returns 39 | #' A torch optimizer object implementing the `step` method. 
40 | #' @examples 41 | #' if (torch::torch_is_installed()) { 42 | 43 | #' # function to demonstrate optimization 44 | #' beale <- function(x, y) { 45 | #' log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 46 | #' } 47 | #' # define optimizer 48 | #' optim <- torchopt::optim_nadam 49 | #' # define hyperparams 50 | #' opt_hparams <- list(lr = 0.01) 51 | #' 52 | #' # starting point 53 | #' x0 <- 3 54 | #' y0 <- 3 55 | #' # create tensor 56 | #' x <- torch::torch_tensor(x0, requires_grad = TRUE) 57 | #' y <- torch::torch_tensor(y0, requires_grad = TRUE) 58 | #' # instantiate optimizer 59 | #' optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 60 | #' # run optimizer 61 | #' steps <- 400 62 | #' x_steps <- numeric(steps) 63 | #' y_steps <- numeric(steps) 64 | #' for (i in seq_len(steps)) { 65 | #' x_steps[i] <- as.numeric(x) 66 | #' y_steps[i] <- as.numeric(y) 67 | #' optim$zero_grad() 68 | #' z <- beale(x, y) 69 | #' z$backward() 70 | #' optim$step() 71 | #' } 72 | #' print(paste0("starting value = ", beale(x0, y0))) 73 | #' print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 74 | #' } 75 | #' @export 76 | optim_nadam <- torch::optimizer( 77 | "optim_nadam", 78 | initialize = function(params, 79 | lr = 0.002, 80 | betas = c(0.9, 0.999), 81 | eps = 1e-8, 82 | weight_decay = 0, 83 | momentum_decay = 4.0e-03) { 84 | if (lr <= 0.0) 85 | stop("Learning rate must be positive.", call. = FALSE) 86 | if (eps < 0.0) 87 | stop("eps must be non-negative.", call. = FALSE) 88 | if (betas[1] > 1.0 | betas[1] <= 0.0) 89 | stop("Invalid beta parameter.", call. = FALSE) 90 | if (betas[2] > 1.0 | betas[1] <= 0.0) 91 | stop("Invalid beta parameter.", call. = FALSE) 92 | if (weight_decay < 0) 93 | stop("Invalid weight_decay value.", call. = FALSE) 94 | if (momentum_decay < 0) 95 | stop("Invalid momentum_decay value.", call. = FALSE) 96 | 97 | defaults = list( 98 | lr = lr, 99 | betas = betas, 100 | eps = eps, 101 | weight_decay = weight_decay, 102 | momentum_decay = momentum_decay 103 | ) 104 | super$initialize(params, defaults) 105 | }, 106 | step = function(closure = NULL){ 107 | loop_fun <- function(group, param, g, p) { 108 | if (is.null(param$grad)) 109 | next 110 | grad <- param$grad 111 | 112 | # State initialization 113 | if (length(state(param)) == 0) { 114 | state(param) <- list() 115 | state(param)[["step"]] <- torch::torch_tensor(0) 116 | # momentum product 117 | state(param)[["mu_product"]] <- torch::torch_tensor(1.) 
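          # mu_product tracks the running product of the momentum caches mu_t;
          # the Nesterov-style update at the end of this step uses
          # (1 - mu_product) and (1 - mu_product_next) as its bias-correction
          # denominators.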
118 | # Exponential moving average of gradient values 119 | state(param)[["exp_avg"]] <- torch::torch_zeros_like(param) 120 | # Exponential moving average of squared gradient values 121 | state(param)[["exp_avg_sq"]] <- torch::torch_zeros_like(param) 122 | } 123 | # Define variables for optimization function 124 | exp_avg <- state(param)[["exp_avg"]] 125 | exp_avg_sq <- state(param)[["exp_avg_sq"]] 126 | step <- state(param)[["step"]] 127 | mu_product <- state(param)[["mu_product"]] 128 | beta1 <- group[['betas']][[1]] 129 | beta2 <- group[['betas']][[2]] 130 | weight_decay <- group[['weight_decay']] 131 | eps <- group[["eps"]] 132 | lr <- group[['lr']] 133 | momentum_decay <- group[["momentum_decay"]] 134 | 135 | # take one step 136 | state(param)[["step"]] <- state(param)[["step"]] + 1 137 | 138 | # bias correction 139 | bias_correction2 <- 1 - beta2^state(param)[['step']] 140 | 141 | # weight_decay 142 | if (weight_decay != 0) 143 | grad = grad$add(param, alpha = weight_decay) 144 | 145 | # calculate the momentum cache \mu^{t} and \mu^{t+1} 146 | mu = beta1 * (1. - 0.5 * (0.96 ^ (step * momentum_decay))) 147 | mu_next = beta1 * (1. - 0.5 * (0.96 ^ ((step + 1) * momentum_decay))) 148 | 149 | # update momentum 150 | mu_product <- mu_product * mu 151 | mu_product_next <- mu_product * mu * mu_next 152 | 153 | # decay the first and second moment running average coefficient 154 | exp_avg$mul_(beta1)$add_(grad, alpha = 1 - beta1) 155 | exp_avg_sq$mul_(beta2)$addcmul_(grad, grad, value = 1 - beta2) 156 | 157 | # calculate denominator 158 | denom = exp_avg_sq$div(bias_correction2)$sqrt()$add_(eps) 159 | 160 | # update objective function 161 | param$addcdiv_(grad, denom, 162 | value = -lr * (1. - mu) / (1. - mu_product$item())) 163 | param$addcdiv_(exp_avg, denom, 164 | value = -lr * mu_next / (1. - mu_product_next$item())) 165 | 166 | } 167 | private$step_helper(closure, loop_fun) 168 | } 169 | ) 170 | -------------------------------------------------------------------------------- /R/qhadam.R: -------------------------------------------------------------------------------- 1 | #' @title QHAdam optimization algorithm 2 | #' 3 | #' @name optim_qhadam 4 | #' 5 | #' @author Gilberto Camara, \email{gilberto.camara@@inpe.br} 6 | #' @author Daniel Falbel, \email{daniel.falble@@gmail.com} 7 | #' @author Rolf Simoes, \email{rolf.simoes@@inpe.br} 8 | #' @author Felipe Souza, \email{lipecaso@@gmail.com} 9 | #' @author Alber Sanchez, \email{alber.ipia@@inpe.br} 10 | #' 11 | #' @description 12 | #' R implementation of the QHAdam optimizer proposed 13 | #' by Ma and Yarats(2019). We used the implementation available at 14 | #' https://github.com/jettify/pytorch-optimizer/blob/master/torch_optimizer/qhadam.py. 15 | #' Thanks to Nikolay Novik for providing the pytorch code. 16 | #' 17 | #' The original implementation has been developed by Facebook AI 18 | #' and is licensed using the MIT license. 19 | #' 20 | #' From the the paper by Ma and Yarats(2019): 21 | #' QHAdam is a QH augmented version of Adam, where we 22 | #' replace both of Adam's moment estimators with quasi-hyperbolic terms. 23 | #' QHAdam decouples the momentum term from the current gradient when 24 | #' updating the weights, and decouples the mean squared gradients 25 | #' term from the current squared gradient when updating the weights. 26 | #' 27 | #' 28 | #' @references 29 | #' Jerry Ma, Denis Yarats, 30 | #' "Quasi-hyperbolic momentum and Adam for deep learning". 
31 | #' https://arxiv.org/abs/1810.06801 32 | #' 33 | #' @param params List of parameters to optimize. 34 | #' @param lr Learning rate (default: 1e-3) 35 | #' @param betas Coefficients computing running averages of gradient 36 | #' and its square (default: (0.9, 0.999)) 37 | #' @param nus Immediate discount factors used to 38 | #' estimate the gradient and its square 39 | #' (default: (1.0, 1.0)) 40 | #' @param eps Term added to the denominator to improve numerical 41 | #' stability (default: 1e-8) 42 | #' @param weight_decay Weight decay (L2 penalty) (default: 0) 43 | #' @param decouple_weight_decay Whether to decouple the weight 44 | #' decay from the gradient-based optimization step. 45 | #' 46 | #' @returns 47 | #' A torch optimizer object implementing the `step` method. 48 | #' @examples 49 | #' if (torch::torch_is_installed()) { 50 | 51 | #' # function to demonstrate optimization 52 | #' beale <- function(x, y) { 53 | #' log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 54 | #' } 55 | #' # define optimizer 56 | #' optim <- torchopt::optim_qhadam 57 | #' # define hyperparams 58 | #' opt_hparams <- list(lr = 0.01) 59 | #' 60 | #' # starting point 61 | #' x0 <- 3 62 | #' y0 <- 3 63 | #' # create tensor 64 | #' x <- torch::torch_tensor(x0, requires_grad = TRUE) 65 | #' y <- torch::torch_tensor(y0, requires_grad = TRUE) 66 | #' # instantiate optimizer 67 | #' optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 68 | #' # run optimizer 69 | #' steps <- 400 70 | #' x_steps <- numeric(steps) 71 | #' y_steps <- numeric(steps) 72 | #' for (i in seq_len(steps)) { 73 | #' x_steps[i] <- as.numeric(x) 74 | #' y_steps[i] <- as.numeric(y) 75 | #' optim$zero_grad() 76 | #' z <- beale(x, y) 77 | #' z$backward() 78 | #' optim$step() 79 | #' } 80 | #' print(paste0("starting value = ", beale(x0, y0))) 81 | #' print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 82 | #' } 83 | #' 84 | #' @export 85 | optim_qhadam <- torch::optimizer( 86 | "optim_qhadam", 87 | initialize = function(params, 88 | lr = 0.01, 89 | betas = c(0.9, 0.999), 90 | eps = 0.001, 91 | nus = c(1.0, 1.0), 92 | weight_decay = 0, 93 | decouple_weight_decay = FALSE) { 94 | if (lr <= 0.0) 95 | stop("Learning rate must be positive.", call. = FALSE) 96 | if (eps < 0.0) 97 | stop("eps must be non-negative.", call. = FALSE) 98 | if (betas[1] > 1.0 | betas[1] <= 0.0) 99 | stop("Invalid beta parameter.", call. = FALSE) 100 | if (betas[2] > 1.0 | betas[1] <= 0.0) 101 | stop("Invalid beta parameter.", call. = FALSE) 102 | if (weight_decay < 0) 103 | stop("Invalid weight_decay value.", call. 
= FALSE) 104 | 105 | defaults = list( 106 | lr = lr, 107 | betas = betas, 108 | eps = eps, 109 | nus = nus, 110 | weight_decay = weight_decay, 111 | decouple_weight_decay = decouple_weight_decay 112 | ) 113 | super$initialize(params, defaults) 114 | }, 115 | step = function(closure = NULL) { 116 | loop_fun <- function(group, param, g, p) { 117 | if (is.null(param$grad)) 118 | next 119 | 120 | # define parameters 121 | beta1 <- group[['betas']][[1]] 122 | beta2 <- group[['betas']][[2]] 123 | nu1 <- group[['nus']][[1]] 124 | nu2 <- group[['nus']][[2]] 125 | weight_decay <- group[['weight_decay']] 126 | decouple_weight_decay <- group[["decouple_weight_decay"]] 127 | eps <- group[["eps"]] 128 | lr <- group[['lr']] 129 | 130 | d_p <- param$grad 131 | 132 | if (weight_decay != 0) { 133 | if (decouple_weight_decay) 134 | param$mul_(1 - lr * weight_decay) 135 | else 136 | d_p$add_(param, alpha = weight_decay) 137 | } 138 | 139 | d_p_sq = d_p$mul(d_p) 140 | 141 | 142 | # State initialization 143 | 144 | if (length(state(param)) == 0) { 145 | state(param) <- list() 146 | 147 | state(param)[["beta1_weight"]] <- 0.0 148 | state(param)[["beta2_weight"]] <- 0.0 149 | # Exponential moving average of gradient values 150 | state(param)[["exp_avg"]] <- torch::torch_zeros_like(param) 151 | # Exponential moving average of squared gradient values 152 | state(param)[["exp_avg_sq"]] <- torch::torch_zeros_like(param) 153 | } 154 | # Define variables for optimization function 155 | state(param)[["beta1_weight"]] <- 1.0 + beta1 * state(param)[["beta1_weight"]] 156 | state(param)[["beta2_weight"]] <- 1.0 + beta2 * state(param)[["beta2_weight"]] 157 | 158 | beta1_weight <- state(param)[["beta1_weight"]] 159 | beta2_weight <- state(param)[["beta2_weight"]] 160 | 161 | exp_avg <- state(param)[["exp_avg"]] 162 | exp_avg_sq <- state(param)[["exp_avg_sq"]] 163 | 164 | beta1_adj <- 1.0 - (1.0 / beta1_weight) 165 | beta2_adj <- 1.0 - (1.0 / beta2_weight) 166 | exp_avg$mul_(beta1_adj)$add_(d_p, alpha = 1.0 - beta1_adj) 167 | exp_avg_sq$mul_(beta2_adj)$add_(d_p_sq, alpha = 1.0 - beta2_adj) 168 | 169 | avg_grad <- exp_avg$mul(nu1) 170 | if (nu1 != 1.0) 171 | avg_grad$add_(d_p, alpha = 1.0 - nu1) 172 | 173 | avg_grad_rms = exp_avg_sq$mul(nu2) 174 | if (nu2 != 1.0) 175 | avg_grad_rms$add_(d_p_sq, alpha = 1.0 - nu2) 176 | avg_grad_rms$sqrt_() 177 | if (eps != 0.0) 178 | avg_grad_rms$add_(eps) 179 | 180 | param$addcdiv_(avg_grad, avg_grad_rms, value = -lr) 181 | } 182 | private$step_helper(closure, loop_fun) 183 | } 184 | ) 185 | -------------------------------------------------------------------------------- /R/radam.R: -------------------------------------------------------------------------------- 1 | #' @title RAdam optimizer 2 | #' 3 | #' @name optim_radam 4 | #' 5 | #' @author Gilberto Camara, \email{gilberto.camara@@inpe.br} 6 | #' @author Daniel Falbel, \email{daniel.falble@@gmail.com} 7 | #' @author Rolf Simoes, \email{rolf.simoes@@inpe.br} 8 | #' @author Felipe Souza, \email{lipecaso@@gmail.com} 9 | #' @author Alber Sanchez, \email{alber.ipia@@inpe.br} 10 | #' 11 | #' @description 12 | #' R implementation of the RAdam optimizer proposed 13 | #' by Liu et al. (2019). 14 | #' We used the implementation in PyTorch as a basis for our 15 | #' implementation. 16 | #' 17 | #' From the abstract of the paper by Liu et al. 
(2019): 18 | #' The learning rate warmup heuristic achieves remarkable success 19 | #' in stabilizing training, accelerating convergence and improving 20 | #' generalization for adaptive stochastic optimization algorithms 21 | #' like RMSprop and Adam. Here, we study its mechanism in details. 22 | #' Pursuing the theory behind warmup, we identify a problem of the 23 | #' adaptive learning rate (i.e., it has problematically large variance 24 | #' in the early stage), suggest warmup works as a variance reduction 25 | #' technique, and provide both empirical and theoretical evidence to verify 26 | #' our hypothesis. We further propose RAdam, a new variant of Adam, 27 | #' by introducing a term to rectify the variance of the adaptive learning rate. 28 | #' Extensive experimental results on image classification, language modeling, 29 | #' and neural machine translation verify our intuition and demonstrate 30 | #' the effectiveness and robustness of our proposed method. 31 | #' 32 | #' @references 33 | #' Liyuan Liu, Haoming Jiang, Pengcheng He, Weizhu Chen, 34 | #' Xiaodong Liu, Jianfeng Gao, Jiawei Han, 35 | #' "On the Variance of the Adaptive Learning Rate and Beyond", 36 | #' International Conference on Learning Representations (ICLR) 2020. 37 | #' https://arxiv.org/abs/1908.03265 38 | #' 39 | #' @param params List of parameters to optimize. 40 | #' @param lr Learning rate (default: 0.01) 41 | #' @param betas Coefficients computing running averages of gradient 42 | #' and its square (default: (0.9, 0.999)) 43 | #' @param eps Term added to the denominator to improve numerical 44 | #' stability (default: 1e-8) 45 | #' @param weight_decay Weight decay (L2 penalty) (default: 0) 46 | #' 47 | #' @returns 48 | #' A torch optimizer object implementing the `step` method. 49 | #' @examples 50 | #' if (torch::torch_is_installed()) { 51 | #' 52 | #' # function to demonstrate optimization 53 | #' beale <- function(x, y) { 54 | #' log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 55 | #' } 56 | #' # define optimizer 57 | #' optim <- torchopt::optim_radam 58 | #' # define hyperparams 59 | #' opt_hparams <- list(lr = 0.01) 60 | #' 61 | #' # starting point 62 | #' x0 <- 3 63 | #' y0 <- 3 64 | #' # create tensor 65 | #' x <- torch::torch_tensor(x0, requires_grad = TRUE) 66 | #' y <- torch::torch_tensor(y0, requires_grad = TRUE) 67 | #' # instantiate optimizer 68 | #' optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 69 | #' # run optimizer 70 | #' steps <- 400 71 | #' x_steps <- numeric(steps) 72 | #' y_steps <- numeric(steps) 73 | #' for (i in seq_len(steps)) { 74 | #' x_steps[i] <- as.numeric(x) 75 | #' y_steps[i] <- as.numeric(y) 76 | #' optim$zero_grad() 77 | #' z <- beale(x, y) 78 | #' z$backward() 79 | #' optim$step() 80 | #' } 81 | #' print(paste0("starting value = ", beale(x0, y0))) 82 | #' print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 83 | #' } 84 | #' @export 85 | optim_radam <- torch::optimizer( 86 | "optim_radam", 87 | initialize = function(params, 88 | lr = 0.01, 89 | betas = c(0.9, 0.999), 90 | eps = 1e-8, 91 | weight_decay = 0) { 92 | if (lr <= 0.0) 93 | stop("Learning rate must be positive.", call. = FALSE) 94 | if (eps < 0.0) 95 | stop("eps must be non-negative.", call. = FALSE) 96 | if (betas[1] > 1.0 | betas[1] <= 0.0) 97 | stop("Invalid beta parameter.", call. = FALSE) 98 | if (betas[2] > 1.0 | betas[2] <= 0.0) 99 | stop("Invalid beta parameter.", call. 
= FALSE) 100 | if (weight_decay < 0) 101 | stop("Invalid weight_decay value.", call. = FALSE) 102 | 103 | defaults = list( 104 | lr = lr, 105 | betas = betas, 106 | eps = eps, 107 | weight_decay = weight_decay 108 | ) 109 | super$initialize(params, defaults) 110 | }, 111 | step = function(closure = NULL){ 112 | loop_fun <- function(group, param, g, p) { 113 | if (is.null(param$grad)) 114 | next 115 | grad <- param$grad 116 | 117 | # State initialization 118 | if (length(state(param)) == 0) { 119 | state(param) <- list() 120 | state(param)[["step"]] <- 0 121 | # Exponential moving average of gradient values 122 | state(param)[["exp_avg"]] <- torch::torch_zeros_like(param) 123 | # Exponential moving average of squared gradient values 124 | state(param)[["exp_avg_sq"]] <- torch::torch_zeros_like(param) 125 | } 126 | # Define variables for optimization function 127 | exp_avg <- state(param)[["exp_avg"]] 128 | exp_avg_sq <- state(param)[["exp_avg_sq"]] 129 | beta1 <- group[['betas']][[1]] 130 | beta2 <- group[['betas']][[2]] 131 | weight_decay <- group[['weight_decay']] 132 | eps <- group[["eps"]] 133 | lr <- group[['lr']] 134 | 135 | # take one step 136 | state(param)[["step"]] <- state(param)[["step"]] + 1 137 | step <- state(param)[["step"]] 138 | 139 | # bias correction 140 | bias_correction1 <- 1 - beta1^state(param)[['step']] 141 | bias_correction2 <- 1 - beta2^state(param)[['step']] 142 | 143 | # L2 correction 144 | if (weight_decay != 0) 145 | grad$add_(param, alpha = weight_decay) 146 | 147 | 148 | # Decay the first moment 149 | exp_avg$mul_(beta1)$add_(grad, alpha = 1 - beta1) 150 | # Decay the second moment 151 | exp_avg_sq$mul_(beta2)$addcmul_(grad, grad, value = (1 - beta2)) 152 | 153 | # correcting bias for the first moving moment 154 | bias_corrected_exp_avg <- exp_avg / bias_correction1 155 | 156 | # maximum length of the approximated SMA 157 | rho_inf <- 2 / (1 - beta2) - 1 158 | # compute the length of the approximated SMA 159 | rho_t <- rho_inf - 2 * step * (beta2^step) / bias_correction2 160 | # adjust learning rate 161 | if (rho_t > 5.0) { 162 | # Compute the variance rectification term and update parameters accordingly 163 | rect <- sqrt((rho_t - 4) * (rho_t - 2) * rho_inf / 164 | ((rho_inf - 4) * (rho_inf - 2) * rho_t)) 165 | adaptive_lr <- sqrt(bias_correction2) / exp_avg_sq$sqrt()$add_(eps) 166 | param$add_(bias_corrected_exp_avg * lr * adaptive_lr * rect, alpha = -1.0) 167 | } else 168 | param$add_(bias_corrected_exp_avg * lr, alpha = -1.0) 169 | } 170 | private$step_helper(closure, loop_fun) 171 | } 172 | ) 173 | -------------------------------------------------------------------------------- /R/swats.R: -------------------------------------------------------------------------------- 1 | #' @title SWATS optimizer 2 | #' 3 | #' @name optim_swats 4 | #' 5 | #' @author Gilberto Camara, \email{gilberto.camara@@inpe.br} 6 | #' @author Daniel Falbel, \email{daniel.falble@@gmail.com} 7 | #' @author Rolf Simoes, \email{rolf.simoes@@inpe.br} 8 | #' @author Felipe Souza, \email{lipecaso@@gmail.com} 9 | #' @author Alber Sanchez, \email{alber.ipia@@inpe.br} 10 | #' 11 | #' @description 12 | #' R implementation of the SWATS optimizer proposed 13 | #' by Keskar and Socher (2018). 14 | #' We used the implementation available at 15 | #' https://github.com/jettify/pytorch-optimizer/ 16 | #' Thanks to Nikolay Novik for providing the pytorch code. 
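#'
#' In this implementation, each Adam step `p_k` is projected onto the current
#' gradient `g_k`, and the scalar `gamma_k = (p_k . p_k) / (-p_k . g_k)`
#' (where `.` denotes the dot product) is tracked with a bias-corrected
#' exponential moving average; training switches to SGD, with the learning
#' rate set to that average, once the average is positive and close to the
#' current `gamma_k`.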
17 | #' 18 | #' From the abstract of the paper by Keskar and Socher (2018): 19 | #' Adaptive optimization methods such as Adam, Adagrad or RMSprop 20 | #' have been found to generalize poorly compared to 21 | #' Stochastic gradient descent (SGD). These methods tend to perform well 22 | #' in the initial portion of training but are outperformed by SGD at 23 | #' later stages of training. We investigate a hybrid strategy that begins 24 | #' training with an adaptive method and switches to SGD 25 | #' when a triggering condition is satisfied. 26 | #' The condition we propose relates to the projection of Adam 27 | #' steps on the gradient subspace. By design, the monitoring process 28 | #' for this condition adds very little overhead and does not increase 29 | #' the number of hyperparameters in the optimizer. 30 | #' 31 | #' @references 32 | #' Nitish Shirish Keskar, Richard Socher, 33 | #' "Improving Generalization Performance by Switching from Adam to SGD". 34 | #' International Conference on Learning Representations (ICLR) 2018. 35 | #' https://arxiv.org/abs/1712.07628 36 | #' 37 | #' @param params List of parameters to optimize. 38 | #' @param lr Learning rate (default: 0.01) 39 | #' @param betas Coefficients computing running averages of gradient 40 | #' and its square (default: (0.9, 0.999)). 41 | #' @param eps Term added to the denominator to improve numerical 42 | #' stability (default: 1e-8). 43 | #' @param weight_decay Weight decay (L2 penalty) (default: 0). 44 | #' @param nesterov Enables Nesterov momentum (default: FALSE). 45 | #' 46 | #' @returns 47 | #' A torch optimizer object implementing the `step` method. 48 | #' @examples 49 | #' if (torch::torch_is_installed()) { 50 | #' 51 | #' # function to demonstrate optimization 52 | #' beale <- function(x, y) { 53 | #' log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 54 | #' } 55 | #' # define optimizer 56 | #' optim <- torchopt::optim_swats 57 | #' # define hyperparams 58 | #' opt_hparams <- list(lr = 0.01) 59 | #' 60 | #' # starting point 61 | #' x0 <- 3 62 | #' y0 <- 3 63 | #' # create tensor 64 | #' x <- torch::torch_tensor(x0, requires_grad = TRUE) 65 | #' y <- torch::torch_tensor(y0, requires_grad = TRUE) 66 | #' # instantiate optimizer 67 | #' optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 68 | #' # run optimizer 69 | #' steps <- 400 70 | #' x_steps <- numeric(steps) 71 | #' y_steps <- numeric(steps) 72 | #' for (i in seq_len(steps)) { 73 | #' x_steps[i] <- as.numeric(x) 74 | #' y_steps[i] <- as.numeric(y) 75 | #' optim$zero_grad() 76 | #' z <- beale(x, y) 77 | #' z$backward() 78 | #' optim$step() 79 | #' } 80 | #' print(paste0("starting value = ", beale(x0, y0))) 81 | #' print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 82 | #' } 83 | #' @export 84 | optim_swats <- torch::optimizer( 85 | "optim_swats", 86 | initialize = function(params, 87 | lr = 0.01, 88 | betas = c(0.9, 0.999), 89 | eps = 1e-8, 90 | weight_decay = 0, 91 | nesterov = FALSE) { 92 | if (lr <= 0.0) 93 | stop("Learning rate must be positive.", call. = FALSE) 94 | if (eps < 0.0) 95 | stop("eps must be non-negative.", call. = FALSE) 96 | if (betas[1] > 1.0 | betas[1] <= 0.0) 97 | stop("Invalid beta parameter.", call. = FALSE) 98 | if (betas[2] > 1.0 | betas[2] <= 0.0) 99 | stop("Invalid beta parameter.", call. = FALSE) 100 | if (weight_decay < 0) 101 | stop("Invalid weight_decay value.", call. 
= FALSE) 102 | 103 | defaults = list( 104 | lr = lr, 105 | betas = betas, 106 | eps = eps, 107 | weight_decay = weight_decay, 108 | nesterov = nesterov, 109 | phase = "ADAM" 110 | ) 111 | super$initialize(params, defaults) 112 | }, 113 | step = function(closure = NULL){ 114 | loop_fun <- function(group, param, g, p) { 115 | if (is.null(param$grad)) 116 | next 117 | grad <- param$grad 118 | 119 | # State initialization 120 | if (length(state(param)) == 0) { 121 | state(param) <- list() 122 | state(param)[["step"]] <- 0 123 | # create momentum buffer 124 | state(param)[["momentum_buffer"]] <- NA 125 | # Exponential moving average of gradient values 126 | state(param)[["exp_avg"]] <- torch::torch_zeros_like(param) 127 | # Exponential moving average of squared gradient values 128 | state(param)[["exp_avg_sq"]] <- torch::torch_zeros_like(param) 129 | # moving average for the non-orthogonal projection scaling 130 | # state(param)[["exp_avg2"]] <- param$new(1)$fill_(0) 131 | state(param)[["exp_avg2"]] <- param$new_zeros(1) 132 | } 133 | # Define variables for optimization function 134 | exp_avg <- state(param)[["exp_avg"]] 135 | exp_avg_sq <- state(param)[["exp_avg_sq"]] 136 | exp_avg2 <- state(param)[["exp_avg2"]] 137 | beta1 <- group[['betas']][[1]] 138 | beta2 <- group[['betas']][[2]] 139 | weight_decay <- group[['weight_decay']] 140 | eps <- group[["eps"]] 141 | lr <- group[['lr']] 142 | phase <- group[["phase"]] 143 | nesterov <- group[["nesterov"]] 144 | 145 | # take one step 146 | state(param)[["step"]] <- state(param)[["step"]] + 1 147 | step <- state(param)[["step"]] 148 | 149 | # L2 correction 150 | if (weight_decay != 0) 151 | grad$add_(param, alpha = weight_decay) 152 | 153 | # if its SGD phase, take an SGD update and continue 154 | if (phase == 'SGD'){ 155 | if (is.na(state(param)[["momentum_buffer"]])) { 156 | state(param)[["momentum_buffer"]] <- 157 | torch::torch_clone(grad)$detach() 158 | buf <- state(param)[["momentum_buffer"]] 159 | } else { 160 | buf <- state(param)[["momentum_buffer"]] 161 | buf$mul_(beta1)$add_(grad) 162 | grad <- buf 163 | grad$mul_(1 - beta1) 164 | if (nesterov) 165 | grad$add_(buf, alpha = beta1) 166 | param$add_(grad, alpha = -lr) 167 | next 168 | } 169 | } 170 | 171 | # Decay the first moment 172 | exp_avg$mul_(beta1)$add_(grad, alpha = 1 - beta1) 173 | # Decay the second moment 174 | exp_avg_sq$mul_(beta2)$addcmul_(grad, grad, value = (1 - beta2)) 175 | # calculate denominator 176 | denom = exp_avg_sq$sqrt()$add_(eps) 177 | 178 | # bias correction 179 | bias_correction1 <- 1 - beta1^state(param)[['step']] 180 | bias_correction2 <- 1 - beta2^state(param)[['step']] 181 | 182 | # calculate step size 183 | step_size <- lr * (bias_correction2 ^ 0.5) / bias_correction1 184 | 185 | pf <- -step_size * (exp_avg / denom) 186 | param$add_(pf) 187 | 188 | p_view <- pf$view(-1) 189 | pg <- p_view$dot(grad$view(-1)) 190 | 191 | if (as.logical(pg != 0)) { 192 | # the non-orthognal scaling estimate 193 | scaling <- p_view$dot(p_view) / -pg 194 | exp_avg2$mul_(beta2)$add_(scaling, alpha = (1 - beta2)) 195 | 196 | # bias corrected exponential average 197 | corrected_exp_avg <- exp_avg2 / bias_correction2 198 | 199 | # checking criteria of switching to SGD training 200 | if (as.logical(state(param)[['step']] > 1) && 201 | as.logical(corrected_exp_avg$allclose(scaling, rtol = 1e-6)) && 202 | as.logical(corrected_exp_avg > 0) 203 | ) { 204 | group[['phase']] <- 'SGD' 205 | group[['lr']] <- corrected_exp_avg$item() 206 | } 207 | } 208 | } 209 | private$step_helper(closure, 
loop_fun) 210 | } 211 | ) 212 | -------------------------------------------------------------------------------- /R/torchopt-package.R: -------------------------------------------------------------------------------- 1 | #' @keywords internal 2 | "_PACKAGE" 3 | 4 | ## usethis namespace: start 5 | #' @importFrom graphics contour 6 | #' @importFrom graphics image 7 | #' @importFrom graphics lines 8 | #' @importFrom graphics points 9 | #' @importFrom grDevices hcl.colors 10 | #' @importFrom stats runif 11 | ## usethis namespace: end 12 | NULL 13 | 14 | # Include the following global variables 15 | utils::globalVariables(c("self", "super", "ctx", "private")) 16 | 17 | -------------------------------------------------------------------------------- /R/utils-state.R: -------------------------------------------------------------------------------- 1 | #' @title Imported function 2 | #' @author Daniel Falbel, \email{dfalbel@@gmail.com} 3 | #' @keywords internal 4 | #' @description Code lifted from a internal function of madgrad package. 5 | #' Get 'state' attribute of an object. 6 | state <- function(self) { 7 | attr(self, "state") 8 | } 9 | 10 | #' @title Imported function 11 | #' @author Daniel Falbel, \email{dfalbel@@gmail.com} 12 | #' @keywords internal 13 | #' @description Code lifted from a internal function of madgrad package. 14 | #' Set 'state' attribute of an object. 15 | `state<-` <- function(self, value) { 16 | attr(self, "state") <- value 17 | self 18 | } 19 | -------------------------------------------------------------------------------- /R/utils-testopt.R: -------------------------------------------------------------------------------- 1 | ackley <- function(x,y) { 2 | -20 * exp(-0.2*sqrt(0.5*(x^2 + y^2))) - exp(0.5*(cos(2*pi*x) + cos(2*pi*y))) + exp(1) + 20 3 | } 4 | domain_ackley <- function(){ 5 | x0 <- runif(1,-5, 5) 6 | y0 <- runif(1,-5, 5) 7 | return(c(x0 = x0, y0 = y0, xmax = 5, xmin = -5, ymax = 5, ymin = -5)) 8 | } 9 | 10 | beale <- function(x, y) { 11 | log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 12 | } 13 | domain_beale <- function(){ 14 | x0 <- runif(1,-4.5, 4.5) 15 | y0 <- runif(1,-4.5, 4.5) 16 | return(c(x0 = x0, y0 = y0, xmax = 4.5, xmin = -4.5, ymax = 4.5, ymin = -4.5)) 17 | } 18 | 19 | booth <- function(x, y) { 20 | log((x + 2 * y - 7)^2 + (2 * x + y - 5)^2) 21 | } 22 | domain_booth <- function(){ 23 | x0 <- runif(1,-10, 10) 24 | y0 <- runif(1,-10, 10) 25 | return(c(x0 = x0, y0 = y0, xmax = 10, xmin = -10, ymax = 10, ymin = -10)) 26 | } 27 | 28 | bukin_n6 <- function(x, y) { 29 | 100 * sqrt(abs(y - 0.01 * x^2)) + 0.01 * abs(x + 10) 30 | } 31 | domain_bukin_n6 <- function(){ 32 | x0 <- runif(1,-15, -5) 33 | y0 <- runif(1,-3, 3) 34 | return(c(x0 = x0, y0 = y0, xmax = -5, xmin = -15, ymax = -3, ymin = 3)) 35 | } 36 | 37 | easom <- function(x, y) { 38 | -cos(x) * cos(y) * exp(-(x - pi)^2 - (y - pi)^2) 39 | } 40 | domain_easom <- function(){ 41 | x0 <- runif(1,-1, 7) 42 | y0 <- runif(1,-1, 7) 43 | return(c(x0 = x0, y0 = y0, xmax = 7, xmin = -1, ymax = 7, ymin = -1)) 44 | } 45 | goldstein_price <- function(x, y) { 46 | log((1 + (x + y + 1)^2 * 47 | (19 - 14 * x + 3 * x^2 - 14 * y + 6 * x * y + 3 * y^2)) * 48 | (30 + (2 * x - 3 * y)^2 * (18 - 32 * x + 12 * x^2 + 48 * 49 | y - 36 * x * y + 27 * y^2))) 50 | } 51 | domain_goldstein_price <- function(){ 52 | x0 <- runif(1,-2, 2) 53 | y0 <- runif(1,-3, 1) 54 | return(c(x0 = x0, y0 = y0, xmax = 2, xmin = -2, ymax = -3, ymin = 1)) 55 | } 56 | himmelblau <- function(x, y) { 57 | log((x^2 + y - 
11)^2 + (x + y^2 - 7)^2) 58 | } 59 | domain_himmelblau <- function(){ 60 | x0 <- runif(1,-5, 5) 61 | y0 <- runif(1,-5, 5) 62 | return(c(x0 = x0, y0 = y0, xmax = 5, xmin = -5, ymax = 5, ymin = -5)) 63 | } 64 | levi_n13 <- function(x, y) { 65 | sin(3 * pi * x)^2 + (x - 1)^2 * (1 + sin(3 * pi * y)^2) + 66 | (y - 1)^2 * (1 + sin(2 * pi * y)^2) 67 | } 68 | domain_levi_n13 <- function(){ 69 | x0 <- runif(1,-5, 7) 70 | y0 <- runif(1,-5, 7) 71 | return(c(x0 = x0, y0 = y0, xmax = 7, xmin = -5, ymax = 7, ymin = -5)) 72 | } 73 | matyas <- function(x, y) { 74 | log(0.26 * (x^2 + y^2) - 0.48 * x * y) 75 | } 76 | domain_matyas <- function(){ 77 | x0 <- runif(1,-10, 10) 78 | y0 <- runif(1,-10, 10) 79 | return(c(x0 = x0, y0 = y0, xmax = 10, xmin = -10, ymax = 10, ymin = -10)) 80 | } 81 | rastrigin <- function(x, y) { 82 | 20 + (x^2 - 10 * cos(2 * pi * x)) + (y^2 - 10 * cos(2 * pi * y)) 83 | } 84 | domain_rastrigin <- function(){ 85 | x0 <- runif(1,-5.12, 5.12) 86 | y0 <- runif(1,-5.12, 5.12) 87 | return(c(x0 = x0, y0 = y0, xmax = 5.12, xmin = -5.12, ymax = 5.12, ymin = -5.12)) 88 | } 89 | rosenbrock <- function(x, y) { 90 | log(100 * (y - x^2)^2 + (1 - x)^2) 91 | } 92 | domain_rosenbrock <- function(){ 93 | x0 <- -2 94 | y0 <- 2 95 | return(c(x0 = x0, y0 = y0, xmax = 2, xmin = -2, ymax = 3, ymin = -1)) 96 | } 97 | sphere <- function(x, y) { 98 | x^2 + y^2 99 | } 100 | domain_sphere <- function(){ 101 | x0 <- runif(1,-2, 2) 102 | y0 <- runif(1,-2, 2) 103 | return(c(x0 = x0, y0 = y0, xmax = 2, xmin = -2, ymax = 2, ymin = -2)) 104 | } 105 | #' @title Test optimization function 106 | #' 107 | #' @name test_optim 108 | #' 109 | #' @author Rolf Simoes, \email{rolf.simoes@@inpe.br} 110 | #' 111 | #' @description 112 | #' `test_optim()` function is useful to visualize how optimizers solve the 113 | #' minimization problem by showing the convergence path using a test function. 114 | #' User can choose any test optimization 115 | #' [functions](https://en.wikipedia.org/wiki/Test_functions_for_optimization) 116 | #' provided by `torchopt`: 117 | #' 118 | #' `"beale"`, `"booth"`, `"bukin_n6"`, `"easom"`, `"goldstein_price"`, 119 | #' `"himmelblau"`, `"levi_n13"`, `"matyas"`, `"rastrigin"`, 120 | #' `"rosenbrock"`, and `"sphere"`. 121 | #' 122 | #' Besides these functions, users can pass any function that receives two 123 | #' numerical values and returns a scalar. 124 | #' 125 | #' Optimization functions are useful to evaluate characteristics of optimization 126 | #' algorithms, such as convergence rate, precision, robustness, and performance. 127 | #' These functions give an idea about the different situations that optimization 128 | #' algorithms can face. 129 | #' 130 | #' Function `test_function()` plot the 2D-space of a test optimization function. 131 | #' 132 | #' @param optim Torch optimizer function. 133 | #' @param ... Additional parameters (passed to `image` function). 134 | #' @param opt_hparams A list with optimizer initialization parameters (default: `list()`). 135 | #' If missing, for each optimizer its individual defaults will be used. 136 | #' @param test_fn A test function (default `"beale"`). You can also pass 137 | #' a list with 2 elements. The first should be a function that will be optimized 138 | #' and the second is a function that returns a named vector with `x0`, `y0` 139 | #' (the starting points) and `xmax`, `xmin`, `ymax` and `ymin` (the domain). 
140 | #' An example: `c(x0 = x0, y0 = y0, xmax = 5, xmin = -5, ymax = 5, ymin = -5)` 141 | #' @param steps Number of steps to run (default `200`). 142 | #' @param pt_start_color Starting point color (default `"#5050FF7F"`) 143 | #' @param pt_end_color Ending point color (default `"#FF5050FF"`) 144 | #' @param ln_color Line path color (default `"#FF0000FF"`) 145 | #' @param ln_weight Line path weight (default `2`) 146 | #' @param bg_xy_breaks Background X and Y resolution (default `100`) 147 | #' @param bg_z_breaks Background Z resolution (default `32`) 148 | #' @param bg_palette Background palette (default `"viridis"`) 149 | #' @param ct_levels Contour levels (default `10`) 150 | #' @param ct_labels Should show contour labels? (default `FALSE`) 151 | #' @param ct_color Contour color (default `"#FFFFFF7F"`) 152 | #' @param plot_each_step Should output each step? (default `FALSE`) 153 | #' 154 | #' @return No return value, called for producing animated gifs 155 | #' 156 | #' @export 157 | test_optim <- function(optim, ..., 158 | opt_hparams = list(), 159 | test_fn = "beale", 160 | steps = 200, 161 | pt_start_color = "#5050FF7F", 162 | pt_end_color = "#FF5050FF", 163 | ln_color = "#FF0000FF", 164 | ln_weight = 2, 165 | bg_xy_breaks = 100, 166 | bg_z_breaks = 32, 167 | bg_palette = "viridis", 168 | ct_levels = 10, 169 | ct_labels = FALSE, 170 | ct_color = "#FFFFFF7F", 171 | plot_each_step = FALSE) { 172 | 173 | # pre-conditions 174 | inherits_from <- if (utils::packageVersion("torch") > '0.7.2') "torch_optimizer_generator" else "function" 175 | if (!inherits(optim, inherits_from)) { 176 | 177 | stop("invalid 'optim' param.", call. = FALSE) 178 | } 179 | if (is.character(test_fn)) { 180 | if (!exists(test_fn, 181 | envir = asNamespace("torchopt"), 182 | inherits = FALSE)) { 183 | stop("invalid 'test_fn' param.", call. = FALSE) 184 | } 185 | # get starting points 186 | domain_fn <- get(paste0("domain_",test_fn), 187 | envir = asNamespace("torchopt"), 188 | inherits = FALSE) 189 | # get gradient function 190 | test_fn <- get(test_fn, 191 | envir = asNamespace("torchopt"), 192 | inherits = FALSE) 193 | } else if (is.list(test_fn)) { 194 | domain_fn <- test_fn[[2]] 195 | test_fn <- test_fn[[1]] 196 | } 197 | 198 | if (!is.function(test_fn)) { 199 | stop("invalid 'test_fn' param.", call. = FALSE) 200 | } 201 | if (!is.function(domain_fn)) { 202 | stop("missing domain param for function.", call. 
= FALSE) 203 | } 204 | # starting point 205 | dom <- domain_fn() 206 | x0 <- dom[["x0"]] 207 | y0 <- dom[["y0"]] 208 | # create tensor 209 | x <- torch::torch_tensor(x0, requires_grad = TRUE) 210 | y <- torch::torch_tensor(y0, requires_grad = TRUE) 211 | 212 | # instantiate optimizer 213 | optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 214 | grad_keep <- FALSE 215 | if (inherits(optim, "optim_adahessian")) { 216 | grad_keep <- TRUE 217 | # retain_graph is not exposed before torch 0.7.2 218 | if (!utils::packageVersion("torch") > '0.7.2') { 219 | stop("adahessian needs torch version > 0.7.2, got ", 220 | utils::packageVersion("torch")) 221 | } 222 | } 223 | # run optimizer 224 | x_steps <- numeric(steps) 225 | y_steps <- numeric(steps) 226 | for (i in seq_len(steps)) { 227 | x_steps[i] <- as.numeric(x) 228 | y_steps[i] <- as.numeric(y) 229 | optim$zero_grad() 230 | z <- test_fn(x, y) 231 | # retain_graph is not exposed before torch 0.7.2 232 | if (utils::packageVersion("torch") > '0.7.2') { 233 | z$backward(create_graph = grad_keep, retain_graph = grad_keep) 234 | } else { 235 | z$backward(create_graph = grad_keep) 236 | } 237 | optim$step() 238 | } 239 | 240 | # prepare plot 241 | # get xy limits 242 | 243 | xmax <- dom[["xmax"]] 244 | xmin <- dom[["xmin"]] 245 | ymax <- dom[["ymax"]] 246 | ymin <- dom[["ymin"]] 247 | 248 | # prepare data for gradient plot 249 | x <- seq(xmin, xmax, length.out = bg_xy_breaks) 250 | y <- seq(ymin, ymax, length.out = bg_xy_breaks) 251 | z <- outer(X = x, Y = y, FUN = function(x, y) as.numeric(test_fn(x, y))) 252 | 253 | plot_from_step <- steps 254 | if (plot_each_step) { 255 | plot_from_step <- 1 256 | } 257 | 258 | for (step in seq(plot_from_step, steps, 1)) { 259 | 260 | # plot background 261 | image( 262 | x = x, 263 | y = y, 264 | z = z, 265 | col = hcl.colors( 266 | n = bg_z_breaks, 267 | palette = bg_palette 268 | ), 269 | ... 270 | ) 271 | 272 | # plot contour 273 | if (ct_levels > 0) { 274 | contour( 275 | x = x, 276 | y = y, 277 | z = z, 278 | nlevels = ct_levels, 279 | drawlabels = ct_labels, 280 | col = ct_color, 281 | add = TRUE 282 | ) 283 | } 284 | 285 | # plot starting point 286 | points( 287 | x_steps[1], 288 | y_steps[1], 289 | pch = 21, 290 | bg = pt_start_color 291 | ) 292 | 293 | # plot path line 294 | lines( 295 | x_steps[seq_len(step)], 296 | y_steps[seq_len(step)], 297 | lwd = ln_weight, 298 | col = ln_color 299 | ) 300 | 301 | # plot end point 302 | points( 303 | x_steps[step], 304 | y_steps[step], 305 | pch = 21, 306 | bg = pt_end_color 307 | ) 308 | } 309 | } 310 | 311 | 312 | -------------------------------------------------------------------------------- /R/yogi.R: -------------------------------------------------------------------------------- 1 | #' @title Yogi optimizer 2 | #' 3 | #' @name optim_yogi 4 | #' 5 | #' @author Gilberto Camara, \email{gilberto.camara@@inpe.br} 6 | #' @author Rolf Simoes, \email{rolf.simoes@@inpe.br} 7 | #' @author Felipe Souza, \email{lipecaso@@gmail.com} 8 | #' @author Alber Sanchez, \email{alber.ipia@@inpe.br} 9 | #' 10 | #' @description 11 | #' R implementation of the Yogi optimizer proposed 12 | #' by Zaheer et al.(2019). We used the implementation available at 13 | #' https://github.com/jettify/pytorch-optimizer/blob/master/torch_optimizer/yogi.py. 14 | #' Thanks to Nikolay Novik for providing the pytorch code. 15 | #' 16 | #' The original implementation is licensed using the Apache-2.0 software license. 17 | #' This implementation is also licensed using Apache-2.0 license. 
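#'
#' The key difference from Adam in this implementation is the additive,
#' sign-based update of the second-moment estimate,
#' `v_t = v_{t-1} - (1 - beta2) * sign(v_{t-1} - g_t^2) * g_t^2`,
#' which limits how quickly the effective learning rate can change.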
18 | #' 19 | #' From the abstract by the paper by Zaheer et al.(2019): 20 | #' Adaptive gradient methods that rely on scaling gradients 21 | #' down by the square root of exponential moving averages 22 | #' of past squared gradients, such RMSProp, Adam, Adadelta have 23 | #' found wide application in optimizing the nonconvex problems 24 | #' that arise in deep learning. However, it has been recently 25 | #' demonstrated that such methods can fail to converge even 26 | #' in simple convex optimization settings. 27 | #' Yogi is a new adaptive optimization algorithm, 28 | #' which controls the increase in effective learning rate, 29 | #' leading to even better performance with similar theoretical 30 | #' guarantees on convergence. Extensive experiments show that 31 | #' Yogi with very little hyperparameter tuning outperforms 32 | #' methods such as Adam in several challenging machine learning tasks. 33 | #' 34 | #' 35 | #' @references 36 | #' Manzil Zaheer, Sashank Reddi, Devendra Sachan, Satyen Kale, Sanjiv Kumar, 37 | #' "Adaptive Methods for Nonconvex Optimization", 38 | #' Advances in Neural Information Processing Systems 31 (NeurIPS 2018). 39 | #' https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization 40 | #' 41 | #' @param params List of parameters to optimize. 42 | #' @param lr Learning rate (default: 1e-3) 43 | #' @param betas Coefficients computing running averages of gradient 44 | #' and its square (default: (0.9, 0.999)) 45 | #' @param eps Term added to the denominator to improve numerical 46 | #' stability (default: 1e-8) 47 | #' @param initial_accumulator Initial values for first and 48 | #' second moments. 49 | #' @param weight_decay Weight decay (L2 penalty) (default: 0) 50 | #' 51 | #' @returns 52 | #' A torch optimizer object implementing the `step` method. 53 | #' 54 | #' @examples 55 | #' if (torch::torch_is_installed()) { 56 | 57 | #' # function to demonstrate optimization 58 | #' beale <- function(x, y) { 59 | #' log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 60 | #' } 61 | #' # define optimizer 62 | #' optim <- torchopt::optim_yogi 63 | #' # define hyperparams 64 | #' opt_hparams <- list(lr = 0.01) 65 | #' 66 | #' # starting point 67 | #' x0 <- 3 68 | #' y0 <- 3 69 | #' # create tensor 70 | #' x <- torch::torch_tensor(x0, requires_grad = TRUE) 71 | #' y <- torch::torch_tensor(y0, requires_grad = TRUE) 72 | #' # instantiate optimizer 73 | #' optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 74 | #' # run optimizer 75 | #' steps <- 400 76 | #' x_steps <- numeric(steps) 77 | #' y_steps <- numeric(steps) 78 | #' for (i in seq_len(steps)) { 79 | #' x_steps[i] <- as.numeric(x) 80 | #' y_steps[i] <- as.numeric(y) 81 | #' optim$zero_grad() 82 | #' z <- beale(x, y) 83 | #' z$backward() 84 | #' optim$step() 85 | #' } 86 | #' print(paste0("starting value = ", beale(x0, y0))) 87 | #' print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 88 | #' } 89 | #' @export 90 | optim_yogi <- torch::optimizer( 91 | "optim_yogi", 92 | initialize = function(params, 93 | lr = 0.01, 94 | betas = c(0.9, 0.999), 95 | eps = 0.001, 96 | initial_accumulator = 1e-6, 97 | weight_decay = 0) { 98 | if (lr <= 0.0) 99 | stop("Learning rate must be positive.", call. = FALSE) 100 | if (eps < 0.0) 101 | stop("eps must be non-negative.", call. = FALSE) 102 | if (betas[1] > 1.0 | betas[1] <= 0.0) 103 | stop("Invalid beta parameter.", call. 
= FALSE) 104 | if (betas[2] > 1.0 | betas[1] <= 0.0) 105 | stop("Invalid beta parameter.", call. = FALSE) 106 | if (weight_decay < 0) 107 | stop("Invalid weight_decay value.", call. = FALSE) 108 | 109 | defaults = list( 110 | lr = lr, 111 | betas = betas, 112 | eps = eps, 113 | weight_decay = weight_decay, 114 | initial_accumulator = initial_accumulator 115 | ) 116 | super$initialize(params, defaults) 117 | }, 118 | step = function(closure = NULL) { 119 | loop_fun <- function(group, param, g, p) { 120 | if (is.null(param$grad)) 121 | next 122 | grad <- param$grad 123 | 124 | # get value of initial accumulator 125 | init_acc <- group[["initial_accumulator"]] 126 | 127 | # State initialization 128 | if (length(state(param)) == 0) { 129 | state(param) <- list() 130 | state(param)[["step"]] <- 0 131 | # Exponential moving average of gradient values 132 | state(param)[["exp_avg"]] <- torch::nn_init_constant_( 133 | torch::torch_empty_like( 134 | param, 135 | memory_format = torch::torch_preserve_format() 136 | ), 137 | init_acc 138 | ) 139 | # Exponential moving average of squared gradient values 140 | state(param)[["exp_avg_sq"]] <- torch::nn_init_constant_( 141 | torch::torch_empty_like( 142 | param, 143 | memory_format = torch::torch_preserve_format() 144 | ), 145 | init_acc 146 | ) 147 | } 148 | # Define variables for optimization function 149 | exp_avg <- state(param)[["exp_avg"]] 150 | exp_avg_sq <- state(param)[["exp_avg_sq"]] 151 | beta1 <- group[['betas']][[1]] 152 | beta2 <- group[['betas']][[2]] 153 | weight_decay <- group[['weight_decay']] 154 | eps <- group[["eps"]] 155 | lr <- group[['lr']] 156 | 157 | # take one step 158 | state(param)[["step"]] <- state(param)[["step"]] + 1 159 | # bias correction 160 | bias_correction1 <- 1 - beta1^state(param)[['step']] 161 | bias_correction2 <- 1 - beta2^state(param)[['step']] 162 | 163 | # L2 correction 164 | if (weight_decay != 0) 165 | grad <- grad$add(p, alpha = weight_decay) 166 | 167 | # Decay the first moment 168 | exp_avg$mul_(beta1)$add_(grad, alpha = 1 - beta1) 169 | # Decay the second moment 170 | grad_squared <- grad$mul(grad) 171 | exp_avg_sq$addcmul_( 172 | torch::torch_sign(exp_avg_sq - grad_squared), 173 | grad_squared, 174 | value = -(1 - beta2) 175 | ) 176 | 177 | # calculate denominator 178 | denom = (exp_avg_sq$sqrt() / sqrt(bias_correction2))$add_(eps) 179 | 180 | # calculate step size 181 | step_size <- lr / bias_correction1 182 | # go to next step 183 | param$addcdiv_(exp_avg, denom, value = -step_size) 184 | } 185 | 186 | private$step_helper(closure, loop_fun) 187 | } 188 | ) 189 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | editor_options: 4 | chunk_output_type: console 5 | markdown: 6 | wrap: 72 7 | --- 8 | 9 | 10 | 11 | ```{r, include = FALSE} 12 | knitr::opts_chunk$set( 13 | collapse = TRUE, 14 | comment = "#>", 15 | fig.path = "man/figures/README-", 16 | out.width = "100%" 17 | ) 18 | ``` 19 | 20 | # torchopt 21 | 22 | 23 | 24 | [![R-CMD-check](https://github.com/e-sensing/torchopt/workflows/R-CMD-check/badge.svg)](https://github.com/e-sensing/torchopt/actions) 25 | [![CRAN 26 | status](https://www.r-pkg.org/badges/version/torchopt)](https://cran.r-project.org/package=torchopt) 27 | [![Software Life 28 | Cycle](https://img.shields.io/badge/lifecycle-experimental-yellow.svg)](https://lifecycle.r-lib.org/articles/stages.html) 29 | [![Software 30 | 
License](https://img.shields.io/badge/license-Apache%202-2--green)](https://www.apache.org/licenses/LICENSE-2.0) 31 | 32 | 33 | 34 | The `torchopt` package provides R implementations of deep learning optimizers proposed in the literature. It is intended to support the use of the torch package in R. 35 | 36 | ## Installation 37 | 38 | Installing the CRAN (stable) version of `torchopt`: 39 | 40 | ```{r, eval = FALSE} 41 | install.packages("torchopt") 42 | ``` 43 | 44 | To install the development version of `torchopt`, do: 45 | 46 | ```{r, eval = FALSE} 47 | library(devtools) 48 | install_github("e-sensing/torchopt") 49 | ``` 50 | 51 | ```{r, echo = FALSE} 52 | library(torch) 53 | if (!torch::torch_is_installed()) 54 | torch::install_torch() 55 | library(torchopt) 56 | ``` 57 | 58 | ## Provided optimizers 59 | 60 | The `torchopt` package provides the following R implementations of torch 61 | optimizers: 62 | 63 | - `optim_adamw()`: AdamW optimizer proposed by Loshchilov & Hutter 64 | (2019). Converted from the `pytorch` code developed by Collin 65 | Donahue-Oponski available at 66 | 67 | 68 | - `optim_adabelief()`: Adabelief optimizer proposed by Zhuang et al 69 | (2020). Converted from the authors' PyTorch code: 70 | . 71 | 72 | - `optim_adabound()`: Adabound optimizer proposed by Luo et al.(2019). 73 | Converted from the authors' PyTorch code: 74 | . 75 | 76 | - `optim_adahessian()`: Adahessian optimizer proposed by Yao et al.(2021). 77 | Converted from the authors' PyTorch code: 78 | . 79 | 80 | - `optim_madgrad()`: Momentumized, Adaptive, Dual Averaged Gradient 81 | Method for Stochastic Optimization (MADGRAD) optimizer proposed by 82 | Defazio & Jelassi (2021). The function is imported from 83 | [madgrad](https://CRAN.R-project.org/package=madgrad) package and 84 | the source code is available at 85 | 86 | - `optim_nadam()`: Incorporation of Nesterov Momentum into Adam 87 | proposed by Dozat (2016). Converted from the PyTorch site 88 | . 89 | 90 | - `optim_qhadam()`: Quasi-hyperbolic version of Adam proposed by Ma 91 | and Yarats(2019). Converted from the code developed by Meta AI: 92 | . 93 | 94 | - `optim_radam()`: Rectified version of Adam proposed by Liu et al. 95 | (2019). Converted from the PyTorch code 96 | . 97 | 98 | - `optim_swats()`: Optimizer that switches from Adam to SGD proposed by 99 | Keskar and Socher(2018). 100 | Converted from the `pytorch` code developed by Patrik Purgai: 101 | 102 | 103 | - `optim_yogi()`: Yogi optimizer proposed by Zaheer et al.(2019). 104 | Converted from the `pytorch` code developed by Nikolay Novik: 105 | 106 | 107 | ## Optimization test functions 108 | 109 | You can also test optimizers using optimization [test 110 | functions](https://en.wikipedia.org/wiki/Test_functions_for_optimization) 111 | provided by `torchopt`, including `"ackley"`, `"beale"`, `"booth"`, 112 | `"bukin_n6"`, `"easom"`, `"goldstein_price"`, `"himmelblau"`, 113 | `"levi_n13"`, `"matyas"`, `"rastrigin"`, `"rosenbrock"`, `"sphere"`. 114 | Optimization functions are useful to evaluate characteristics of 115 | optimization algorithms, such as convergence rate, precision, 116 | robustness, and performance. These functions give an idea about the 117 | different situations that optimization algorithms can face. 118 | 119 | In what follows, we perform tests using the `"beale"` test function. To 120 | visualize an animated GIF, we set `plot_each_step=TRUE` and capture each 121 | step frame using [gifski](https://CRAN.R-project.org/package=gifski) 122 | package. 
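Besides the built-in test functions, `test_optim()` also accepts a custom
function passed as a list: the first element is the objective and the second
is a function returning the starting point and the domain. The chunk below is
a minimal sketch (not evaluated here); `quad_fn` and `quad_domain` are
illustrative names and are not part of the package.

```{r test_custom, eval = FALSE}
# a simple convex objective (illustrative only)
quad_fn <- function(x, y) {
  x^2 + 0.5 * y^2
}
# starting point and plotting domain for the custom function
quad_domain <- function() {
  c(x0 = 2, y0 = -2, xmax = 3, xmin = -3, ymax = 3, ymin = -3)
}
set.seed(1)
test_optim(
  optim = optim_adamw,
  opt_hparams = list(lr = 0.1),
  steps = 200,
  test_fn = list(quad_fn, quad_domain)
)
```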
123 | 124 | ### `optim_adamw()`: 125 | 126 | ```{r test_adamw, echo=TRUE, fig.show='animate', fig.height=8, fig.width=8, animation.hook='gifski', aniopts='loop', dpi=96, interval=0.1, out.height='50%', out.width='50%', cache=TRUE} 127 | 128 | # test optim adamw 129 | set.seed(12345) 130 | torchopt::test_optim( 131 | optim = torchopt::optim_adamw, 132 | test_fn = "beale", 133 | opt_hparams = list(lr = 0.1), 134 | steps = 500, 135 | plot_each_step = TRUE 136 | ) 137 | 138 | ``` 139 | 140 | ### `optim_adabelief()`: 141 | 142 | ```{r test_adabelief, echo=TRUE, fig.show='animate', fig.height=8, fig.width=8, animation.hook='gifski', aniopts='loop', dpi=96, interval=0.1, out.height='50%', out.width='50%', cache=TRUE} 143 | 144 | set.seed(42) 145 | test_optim( 146 | optim = optim_adabelief, 147 | opt_hparams = list(lr = 0.5), 148 | steps = 400, 149 | test_fn = "beale", 150 | plot_each_step = TRUE 151 | ) 152 | ``` 153 | 154 | ### `optim_adabound()`: 155 | 156 | ```{r test_adabound, echo=TRUE, fig.show='animate', fig.height=8, fig.width=8, animation.hook='gifski', aniopts='loop', dpi=96, interval=0.1, out.height='50%', out.width='50%', cache=TRUE} 157 | 158 | # set manual seed 159 | set.seed(22) 160 | test_optim( 161 | optim = optim_adabound, 162 | opt_hparams = list(lr = 0.5), 163 | steps = 400, 164 | test_fn = "beale", 165 | plot_each_step = TRUE 166 | ) 167 | 168 | ``` 169 | 170 | ### `optim_adahessian()`: 171 | 172 | ```{r test_adahessian, echo=TRUE, fig.show='animate', fig.height=8, fig.width=8, animation.hook='gifski', aniopts='loop', dpi=96, interval=0.1, out.height='50%', out.width='50%', cache=TRUE} 173 | 174 | # set manual seed 175 | set.seed(290356) 176 | test_optim( 177 | optim = optim_adahessian, 178 | opt_hparams = list(lr = 0.2), 179 | steps = 500, 180 | test_fn = "beale", 181 | plot_each_step = TRUE 182 | ) 183 | 184 | ``` 185 | 186 | ### `optim_madgrad()`: 187 | 188 | ```{r test_madgrad, echo=TRUE, fig.show='animate', fig.height=8, fig.width=8, animation.hook='gifski', aniopts='loop', dpi=96, interval=0.1, out.height='50%', out.width='50%', cache=TRUE} 189 | 190 | set.seed(256) 191 | test_optim( 192 | optim = optim_madgrad, 193 | opt_hparams = list(lr = 0.05), 194 | steps = 400, 195 | test_fn = "beale", 196 | plot_each_step = TRUE 197 | ) 198 | 199 | ``` 200 | 201 | ### `optim_nadam()`: 202 | 203 | ```{r test_nadam, echo=TRUE, fig.show='animate', fig.height=8, fig.width=8, animation.hook='gifski', aniopts='loop', dpi=96, interval=0.1, out.height='50%', out.width='50%', cache=TRUE} 204 | 205 | set.seed(2903) 206 | test_optim( 207 | optim = optim_nadam, 208 | opt_hparams = list(lr = 0.5, weight_decay = 0), 209 | steps = 500, 210 | test_fn = "beale", 211 | plot_each_step = TRUE 212 | ) 213 | 214 | ``` 215 | 216 | ### `optim_qhadam()`: 217 | 218 | ```{r test_qhadam, echo=TRUE, fig.show='animate', fig.height=8, fig.width=8, animation.hook='gifski', aniopts='loop', dpi=96, interval=0.1, out.height='50%', out.width='50%', cache=TRUE} 219 | 220 | set.seed(1024) 221 | test_optim( 222 | optim = optim_qhadam, 223 | opt_hparams = list(lr = 0.1), 224 | steps = 500, 225 | test_fn = "beale", 226 | plot_each_step = TRUE 227 | ) 228 | 229 | ``` 230 | 231 | 232 | ### `optim_radam()`: 233 | 234 | ```{r test_radam, echo=TRUE, fig.show='animate', fig.height=8, fig.width=8, animation.hook='gifski', aniopts='loop', dpi=96, interval=0.1, out.height='50%', out.width='50%', cache=TRUE} 235 | 236 | set.seed(1024) 237 | test_optim( 238 | optim = optim_radam, 239 | opt_hparams = list(lr = 1.0), 240 | steps = 
500, 241 | test_fn = "beale", 242 | plot_each_step = TRUE 243 | ) 244 | 245 | ``` 246 | 247 | 248 | ### `optim_swats()`: 249 | 250 | ```{r test_swats, echo=TRUE, fig.show='animate', fig.height=8, fig.width=8, animation.hook='gifski', aniopts='loop', dpi=96, interval=0.1, out.height='50%', out.width='50%', cache=TRUE} 251 | 252 | set.seed(234) 253 | test_optim( 254 | optim = optim_swats, 255 | opt_hparams = list(lr = 0.5), 256 | steps = 500, 257 | test_fn = "beale", 258 | plot_each_step = TRUE 259 | ) 260 | 261 | ``` 262 | 263 | ### `optim_yogi()`: 264 | 265 | ```{r test_yogi, echo=TRUE, fig.show='animate', fig.height=8, fig.width=8, animation.hook='gifski', aniopts='loop', dpi=96, interval=0.1, out.height='50%', out.width='50%', cache=TRUE} 266 | 267 | # set manual seed 268 | set.seed(66) 269 | test_optim( 270 | optim = optim_yogi, 271 | opt_hparams = list(lr = 0.1), 272 | steps = 500, 273 | test_fn = "beale", 274 | plot_each_step = TRUE 275 | ) 276 | 277 | ``` 278 | 279 | ## Acknowledgements 280 | 281 | We are thankful to Collin Donahue-Oponski , 282 | Amir Gholami , 283 | Liangchen Luo , Liyuan Liu 284 | , Nikolay Novik , Patrik Purgai Juntang Zhuang and the PyTorch team for providing pytorch code for the optimizers implemented in this package. We also thank Daniel Falbel for providing support 285 | for the R version of PyTorch. 286 | 287 | ## Code of Conduct 288 | 289 | The torchopt project is released with a [Contributor 290 | Code of Conduct](https://contributor-covenant.org/version/2/0/CODE_OF_CONDUCT.html). 291 | By contributing to this project, you agree to abide by its terms. 292 | 293 | ## References 294 | 295 | - ADABELIEF: Juntang Zhuang, Tommy Tang, Yifan Ding, Sekhar Tatikonda, Nicha 296 | Dvornek, Xenophon Papademetris, James S. Duncan. "Adabelief 297 | Optimizer: Adapting Stepsizes by the Belief in Observed Gradients", 298 | 34th Conference on Neural Information Processing Systems (NeurIPS 299 | 2020), . 300 | 301 | - ADABOUND: Liangchen Luo, Yuanhao Xiong, Yan Liu, Xu Sun, "Adaptive Gradient 302 | Methods with Dynamic Bound of Learning Rate", International 303 | Conference on Learning Representations (ICLR), 2019. 304 | . 305 | 306 | - ADAHESSIAN: Zhewei Yao, Amir Gholami, Sheng Shen, Mustafa Mustafa, Kurt Keutzer, 307 | Michael W. Mahoney. "Adahessian: An Adaptive Second Order Optimizer 308 | for Machine Learning", AAAI Conference on Artificial Intelligence, 35(12), 309 | 10665-10673, 2021. . 310 | 311 | - ADAMW: Ilya Loshchilov, Frank Hutter, "Decoupled Weight Decay 312 | Regularization", International Conference on Learning 313 | Representations (ICLR) 2019. 314 | . 315 | 316 | - MADGRAD: Aaron Defazio, Samy Jelassi, "Adaptivity without Compromise: A 317 | Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic 318 | Optimization", arXiv preprint arXiv:2101.11075, 2021. 319 | 320 | 321 | - NADAM: Timothy Dazat, "Incorporating Nesterov Momentum into Adam", 322 | International Conference on Learning Representations (ICLR), 2019. 323 | 324 | 325 | - QHADAM: Jerry Ma, Denis Yarats, "Quasi-hyperbolic momentum and Adam 326 | for deep learning". 327 | 328 | - RADAM: Liyuan Liu, Haoming Jiang, Pengcheng He, Weizhu Chen, Xiaodong Liu, 329 | Jianfeng Gao, Jiawei Han, "On the Variance of the Adaptive Learning 330 | Rate and Beyond", International Conference on Learning 331 | Representations (ICLR) 2020. . 332 | 333 | - SWATS: Nitish Keskar, Richard Socher, "Improving Generalization Performance 334 | by Switching from Adam to SGD". 
335 | International Conference on Learning Representations (ICLR), 2018. 336 | . 337 | 338 | - YOGI: Manzil Zaheer, Sashank Reddi, Devendra Sachan, Satyen Kale, Sanjiv 339 | Kumar, "Adaptive Methods for Nonconvex Optimization", Advances in 340 | Neural Information Processing Systems 31 (NeurIPS 2018). 341 | 342 | 343 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # torchopt 5 | 6 | 7 | 8 | [![R-CMD-check](https://github.com/e-sensing/torchopt/workflows/R-CMD-check/badge.svg)](https://github.com/e-sensing/torchopt/actions) 9 | [![CRAN 10 | status](https://www.r-pkg.org/badges/version/torchopt)](https://cran.r-project.org/package=torchopt) 11 | [![Software Life 12 | Cycle](https://img.shields.io/badge/lifecycle-experimental-yellow.svg)](https://lifecycle.r-lib.org/articles/stages.html) 13 | [![Software 14 | License](https://img.shields.io/badge/license-Apache%202-2--green)](https://www.apache.org/licenses/LICENSE-2.0) 15 | 16 | 17 | 18 | The `torchopt` package provides R implementation of deep learning 19 | optimizers proposed in the literature. It is intended to support the use 20 | of the torch package in R. 21 | 22 | ## Installation 23 | 24 | Installing the CRAN (stable) version of `torchopt`: 25 | 26 | ``` r 27 | install.packages("torchopt") 28 | ``` 29 | 30 | Installing the development version of `torchopt` do as : 31 | 32 | ``` r 33 | library(devtools) 34 | install_github("e-sensing/torchopt") 35 | ``` 36 | 37 | #> Warning: package 'torch' was built under R version 4.1.3 38 | 39 | ## Provided optimizers 40 | 41 | `torchopt` package provides the following R implementations of torch 42 | optimizers: 43 | 44 | - `optim_adamw()`: AdamW optimizer proposed by Loshchilov & Hutter 45 | (2019). Converted from the `pytorch` code developed by Collin 46 | Donahue-Oponski available at 47 | 48 | 49 | - `optim_adabelief()`: Adabelief optimizer proposed by Zhuang et al 50 | (2020). Converted from the authors’ PyTorch code: 51 | . 52 | 53 | - `optim_adabound()`: Adabound optimizer proposed by Luo et al.(2019). 54 | Converted from the authors’ PyTorch code: 55 | . 56 | 57 | - `optim_adahessian()`: Adahessian optimizer proposed by Yao et 58 | al.(2021). Converted from the authors’ PyTorch code: 59 | . 60 | 61 | - `optim_madgrad()`: Momentumized, Adaptive, Dual Averaged Gradient 62 | Method for Stochastic Optimization (MADGRAD) optimizer proposed by 63 | Defazio & Jelassi (2021). The function is imported from 64 | [madgrad](https://CRAN.R-project.org/package=madgrad) package and 65 | the source code is available at 66 | 67 | - `optim_nadam()`: Incorporation of Nesterov Momentum into Adam 68 | proposed by Dozat (2016). Converted from the PyTorch site 69 | . 70 | 71 | - `optim_qhadam()`: Quasi-hyperbolic version of Adam proposed by Ma 72 | and Yarats(2019). Converted from the code developed by Meta AI: 73 | . 74 | 75 | - `optim_radam()`: Rectified verison of Adam proposed by Liu et al. 76 | (2019). Converted from the PyTorch code 77 | . 78 | 79 | - `optim_swats()`: Optimizer that switches from Adam to SGD proposed 80 | by Keskar and Socher(2018). Converted from the `pytorch` code 81 | developed by Patrik Purgai: 82 | 83 | - `optim_yogi()`: Yogi optimizer proposed by Zaheer et al.(2019). 
84 | Converted from the `pytorch` code developed by Nikolay Novik: 85 | 86 | 87 | ## Optimization test functions 88 | 89 | You can also test optimizers using optimization [test 90 | functions](https://en.wikipedia.org/wiki/Test_functions_for_optimization) 91 | provided by `torchopt` including `"ackley"`, `"beale"`, `"booth"`, 92 | `"bukin_n6"`, `"easom"`, `"goldstein_price"`, `"himmelblau"`, 93 | `"levi_n13"`, `"matyas"`, `"rastrigin"`, `"rosenbrock"`, `"sphere"`. 94 | Optimization functions are useful to evaluate characteristics of 95 | optimization algorithms, such as convergence rate, precision, 96 | robustness, and performance. These functions give an idea about the 97 | different situations that optimization algorithms can face. 98 | 99 | In what follows, we perform tests using `"beale"` test function. To 100 | visualize an animated GIF, we set `plot_each_step=TRUE` and capture each 101 | step frame using [gifski](https://CRAN.R-project.org/package=gifski) 102 | package. 103 | 104 | ### `optim_adamw()`: 105 | 106 | ``` r 107 | # test optim adamw 108 | set.seed(12345) 109 | torchopt::test_optim( 110 | optim = torchopt::optim_adamw, 111 | test_fn = "beale", 112 | opt_hparams = list(lr = 0.1), 113 | steps = 500, 114 | plot_each_step = TRUE 115 | ) 116 | ``` 117 | 118 | 119 | 120 | ### `optim_adabelief()`: 121 | 122 | ``` r 123 | set.seed(42) 124 | test_optim( 125 | optim = optim_adabelief, 126 | opt_hparams = list(lr = 0.5), 127 | steps = 400, 128 | test_fn = "beale", 129 | plot_each_step = TRUE 130 | ) 131 | ``` 132 | 133 | 134 | 135 | ### `optim_adabound()`: 136 | 137 | ``` r 138 | # set manual seed 139 | set.seed(22) 140 | test_optim( 141 | optim = optim_adabound, 142 | opt_hparams = list(lr = 0.5), 143 | steps = 400, 144 | test_fn = "beale", 145 | plot_each_step = TRUE 146 | ) 147 | ``` 148 | 149 | 150 | 151 | ### `optim_adahessian()`: 152 | 153 | ``` r 154 | # set manual seed 155 | set.seed(290356) 156 | test_optim( 157 | optim = optim_adahessian, 158 | opt_hparams = list(lr = 0.2), 159 | steps = 500, 160 | test_fn = "beale", 161 | plot_each_step = TRUE 162 | ) 163 | ``` 164 | 165 | 166 | 167 | ### `optim_madgrad()`: 168 | 169 | ``` r 170 | set.seed(256) 171 | test_optim( 172 | optim = optim_madgrad, 173 | opt_hparams = list(lr = 0.05), 174 | steps = 400, 175 | test_fn = "beale", 176 | plot_each_step = TRUE 177 | ) 178 | ``` 179 | 180 | 181 | 182 | ### `optim_nadam()`: 183 | 184 | ``` r 185 | set.seed(2903) 186 | test_optim( 187 | optim = optim_nadam, 188 | opt_hparams = list(lr = 0.5, weight_decay = 0), 189 | steps = 500, 190 | test_fn = "beale", 191 | plot_each_step = TRUE 192 | ) 193 | ``` 194 | 195 | 196 | 197 | ### `optim_qhadam()`: 198 | 199 | ``` r 200 | set.seed(1024) 201 | test_optim( 202 | optim = optim_qhadam, 203 | opt_hparams = list(lr = 0.1), 204 | steps = 500, 205 | test_fn = "beale", 206 | plot_each_step = TRUE 207 | ) 208 | ``` 209 | 210 | 211 | 212 | ### `optim_radam()`: 213 | 214 | ``` r 215 | set.seed(1024) 216 | test_optim( 217 | optim = optim_radam, 218 | opt_hparams = list(lr = 1.0), 219 | steps = 500, 220 | test_fn = "beale", 221 | plot_each_step = TRUE 222 | ) 223 | ``` 224 | 225 | 226 | 227 | ### `optim_swats()`: 228 | 229 | ``` r 230 | set.seed(234) 231 | test_optim( 232 | optim = optim_swats, 233 | opt_hparams = list(lr = 0.5), 234 | steps = 500, 235 | test_fn = "beale", 236 | plot_each_step = TRUE 237 | ) 238 | ``` 239 | 240 | 241 | 242 | ### `optim_yogi()`: 243 | 244 | ``` r 245 | # set manual seed 246 | set.seed(66) 247 | test_optim( 248 | optim = 
optim_yogi, 249 | opt_hparams = list(lr = 0.1), 250 | steps = 500, 251 | test_fn = "beale", 252 | plot_each_step = TRUE 253 | ) 254 | ``` 255 | 256 | 257 | 258 | ## Acknowledgements 259 | 260 | We are thankful to Collin Donahue-Oponski , 261 | Amir Gholami , Liangchen Luo 262 | , Liyuan Liu 263 | , Nikolay Novik 264 | , Patrik Purgai, 265 | Juntang Zhuang, 266 | and the PyTorch team 267 | for providing pytorch code for the 268 | optimizers implemented in this package. We also thank Daniel Falbel 269 | for providing support for the R version of 270 | PyTorch. 271 | 272 | ## Code of Conduct 273 | 274 | The torchopt project is released with a [Contributor Code of 275 | Conduct](https://contributor-covenant.org/version/2/0/CODE_OF_CONDUCT.html). 276 | By contributing to this project, you agree to abide by its terms. 277 | 278 | ## References 279 | 280 | - ADABELIEF: Juntang Zhuang, Tommy Tang, Yifan Ding, Sekhar Tatikonda, 281 | Nicha Dvornek, Xenophon Papademetris, James S. Duncan. “Adabelief 282 | Optimizer: Adapting Stepsizes by the Belief in Observed Gradients”, 283 | 34th Conference on Neural Information Processing Systems (NeurIPS 284 | 2020), . 285 | 286 | - ADABOUND: Liangchen Luo, Yuanhao Xiong, Yan Liu, Xu Sun, “Adaptive 287 | Gradient Methods with Dynamic Bound of Learning Rate”, International 288 | Conference on Learning Representations (ICLR), 2019. 289 | . 290 | 291 | - ADAHESSIAN: Zhewei Yao, Amir Gholami, Sheng Shen, Mustafa Mustafa, 292 | Kurt Keutzer, Michael W. Mahoney. “Adahessian: An Adaptive Second 293 | Order Optimizer for Machine Learning”, AAAI Conference on Artificial 294 | Intelligence, 35(12), 10665-10673, 2021. 295 | . 296 | 297 | - ADAMW: Ilya Loshchilov, Frank Hutter, “Decoupled Weight Decay 298 | Regularization”, International Conference on Learning 299 | Representations (ICLR) 2019. 300 | . 301 | 302 | - MADGRAD: Aaron Defazio, Samy Jelassi, “Adaptivity without 303 | Compromise: A Momentumized, Adaptive, Dual Averaged Gradient Method 304 | for Stochastic Optimization”, arXiv preprint arXiv:2101.11075, 2021. 305 | 306 | 307 | - NADAM: Timothy Dozat, “Incorporating Nesterov Momentum into Adam”, 308 | International Conference on Learning Representations (ICLR), 2016. 309 | 310 | 311 | - QHADAM: Jerry Ma, Denis Yarats, “Quasi-hyperbolic momentum and Adam 312 | for deep learning”. 313 | 314 | - RADAM: Liyuan Liu, Haoming Jiang, Pengcheng He, Weizhu Chen, 315 | Xiaodong Liu, Jianfeng Gao, Jiawei Han, “On the Variance of the 316 | Adaptive Learning Rate and Beyond”, International Conference on 317 | Learning Representations (ICLR) 2020. 318 | . 319 | 320 | - SWATS: Nitish Keskar, Richard Socher, “Improving Generalization 321 | Performance by Switching from Adam to SGD”. International Conference 322 | on Learning Representations (ICLR), 2018. 323 | . 324 | 325 | - YOGI: Manzil Zaheer, Sashank Reddi, Devendra Sachan, Satyen Kale, 326 | Sanjiv Kumar, “Adaptive Methods for Nonconvex Optimization”, 327 | Advances in Neural Information Processing Systems 31 (NeurIPS 2018).
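## Using a `torchopt` optimizer in a `torch` model

The examples above optimize the two scalar parameters of a test surface. In a full model, a `torchopt` optimizer is used exactly like the optimizers that ship with `torch`: create it from the model parameters and call `zero_grad()`, `backward()`, and `step()` inside the training loop. The snippet below is a minimal sketch; the toy data, layer size, learning rate, and number of epochs are arbitrary illustrative choices, not package recommendations.

``` r
library(torch)
library(torchopt)

# toy regression data (100 observations, 10 features)
x <- torch_randn(100, 10)
y <- torch_randn(100, 1)

# a single linear layer as the model
model <- nn_linear(10, 1)

# any torchopt optimizer can be dropped in here, e.g. optim_adamw()
opt <- optim_adamw(model$parameters, lr = 0.01)

for (epoch in seq_len(50)) {
  opt$zero_grad()                    # reset gradients
  loss <- nnf_mse_loss(model(x), y)  # forward pass and loss
  loss$backward()                    # backpropagate
  opt$step()                         # update parameters
}
```

To try a different optimizer, only the line that creates `opt` changes, for example `opt <- optim_madgrad(model$parameters, lr = 0.01)`.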
328 | 329 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | target: auto 8 | threshold: 1% 9 | informational: true 10 | patch: 11 | default: 12 | target: auto 13 | threshold: 1% 14 | informational: true 15 | -------------------------------------------------------------------------------- /inst/WORDLIST: -------------------------------------------------------------------------------- 1 | Acknowledgements 2 | adabelief 3 | Adabelief 4 | AdaBelief 5 | AdaBound 6 | Adabound 7 | adabound 8 | Adadelta 9 | Adagrad 10 | AdamW 11 | adamw 12 | Adaptivity 13 | authors’ 14 | Beale 15 | CMD 16 | Codecov 17 | Dazat 18 | Dozat 19 | Defazio 20 | Devendra 21 | Dvornek 22 | dfalbel 23 | Falbel 24 | Gao 25 | Golay 26 | Grosso 27 | Guestrin 28 | Guolin 29 | Haoming 30 | Hassan 31 | Herold 32 | Hijmans 33 | Hutter 34 | ICLR 35 | Idoumghar 36 | Ilya 37 | IJCNN 38 | IKI 39 | Ilya 40 | INPE 41 | ISPRS 42 | ISSN 43 | isprsjprs 44 | Jaccard 45 | Jianfeng 46 | Jiang 47 | Jiawei 48 | Jelassi 49 | jenks 50 | Jenks 51 | Jordao 52 | JSON 53 | JSTARS 54 | juntang 55 | Juntang 56 | Kaggle 57 | Kingma 58 | KDD 59 | Ke 60 | Kegelmeyer 61 | keras 62 | Keras 63 | kganz 64 | kmeans 65 | kohonen 66 | Kohonen 67 | Körner 68 | Korner 69 | Kruisselbrink 70 | Kumar 71 | Landrieu 72 | landsat 73 | latlong 74 | Lhassane 75 | LKP 76 | LLKP 77 | Keskar 78 | Liangchen 79 | LightGBM 80 | Liu 81 | Liyuan 82 | LiyuanLucasLiu 83 | Loic 84 | Loshchilov 85 | LSTM 86 | LTAE 87 | Lubia 88 | Luo 89 | Luolc 90 | LUCC 91 | luz 92 | LZW 93 | MADGRAD 94 | MLP 95 | madgrad 96 | maja 97 | mapview 98 | Maja 99 | Magrittr 100 | Manzil 101 | Mato 102 | Mattias 103 | Maus 104 | MODIS 105 | MSPC 106 | Maximage 107 | Meng 108 | Mohr 109 | mlr 110 | Momentumized 111 | Mrpatekful 112 | Nadam 113 | Nesrine 114 | nadam 115 | Nesterov 116 | Nesterov’s 117 | NeurIPS 118 | Nicha 119 | Nikolay 120 | Nitish 121 | Nonconvex 122 | Novik 123 | Oponski 124 | openreview 125 | Patrik 126 | Papademetris 127 | Pengcheng 128 | Purgai 129 | QH 130 | qhadam 131 | QHAdam 132 | qhoptim 133 | RAdam 134 | radam 135 | RMSProp 136 | RMSprop 137 | Reddi 138 | SGD 139 | Sachan 140 | Samy 141 | Sanjiv 142 | Sashank 143 | Satyen 144 | Sekhar 145 | Shekar 146 | Shirish 147 | Sochee 148 | Socher 149 | warmup 150 | Tatikonda 151 | Xiong 152 | Xiaodong 153 | Xu 154 | Weizhu 155 | Yan 156 | Yao 157 | Yarats 158 | Yifan 159 | Yuanhao 160 | Zaheer 161 | zhuang 162 | Zhuang 163 | ZJjtNEZ 164 | al 165 | arXiv 166 | arxiv 167 | bff 168 | colllin 169 | doi 170 | et 171 | facebookresearch 172 | gifski 173 | github 174 | gmail 175 | grDevices 176 | grey 177 | GTiff 178 | headtails 179 | hcl 180 | HCL 181 | hclust 182 | hotfix 183 | Hotfix 184 | href 185 | http 186 | https 187 | io 188 | ir 189 | inequivalence 190 | interpolator 191 | iteratively 192 | jettify 193 | labelled 194 | labelling 195 | licence 196 | lineshape 197 | lintr 198 | logref 199 | lon 200 | lr 201 | lubridate 202 | mem 203 | memsize 204 | metatype 205 | msg 206 | mth 207 | mlverse 208 | multiclass 209 | multilayer 210 | multinom 211 | MULTIPOLYGON 212 | Nacional 213 | NatNonForest 214 | neighbourhood 215 | neighbours 216 | NDVI 217 | ndvi 218 | ncols 219 | nonconvex 220 | optimizers 221 | Optimizers 222 | preprint 223 | py 224 | pytorch 225 | rescaled 226 | th 227 | verison 228 | viridis 229 | wikipedia 230 | 
-------------------------------------------------------------------------------- /man/figures/README-chunk-label-.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-chunk-label-.gif -------------------------------------------------------------------------------- /man/figures/README-chunk-label-1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-chunk-label-1.jpeg -------------------------------------------------------------------------------- /man/figures/README-chunk-label-10.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-chunk-label-10.jpeg -------------------------------------------------------------------------------- /man/figures/README-chunk-label-2.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-chunk-label-2.jpeg -------------------------------------------------------------------------------- /man/figures/README-chunk-label-3.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-chunk-label-3.jpeg -------------------------------------------------------------------------------- /man/figures/README-chunk-label-4.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-chunk-label-4.jpeg -------------------------------------------------------------------------------- /man/figures/README-chunk-label-5.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-chunk-label-5.jpeg -------------------------------------------------------------------------------- /man/figures/README-chunk-label-6.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-chunk-label-6.jpeg -------------------------------------------------------------------------------- /man/figures/README-chunk-label-7.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-chunk-label-7.jpeg -------------------------------------------------------------------------------- /man/figures/README-chunk-label-8.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-chunk-label-8.jpeg -------------------------------------------------------------------------------- /man/figures/README-chunk-label-9.jpeg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-chunk-label-9.jpeg -------------------------------------------------------------------------------- /man/figures/README-gif_opt-.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-gif_opt-.gif -------------------------------------------------------------------------------- /man/figures/README-opt_fun-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-opt_fun-1.png -------------------------------------------------------------------------------- /man/figures/README-pressure-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-pressure-1.png -------------------------------------------------------------------------------- /man/figures/README-test_adabelief-.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-test_adabelief-.gif -------------------------------------------------------------------------------- /man/figures/README-test_adabound-.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-test_adabound-.gif -------------------------------------------------------------------------------- /man/figures/README-test_adahessian-.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-test_adahessian-.gif -------------------------------------------------------------------------------- /man/figures/README-test_adamw-.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-test_adamw-.gif -------------------------------------------------------------------------------- /man/figures/README-test_madgrad-.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-test_madgrad-.gif -------------------------------------------------------------------------------- /man/figures/README-test_nadam-.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-test_nadam-.gif -------------------------------------------------------------------------------- /man/figures/README-test_qhadam-.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-test_qhadam-.gif -------------------------------------------------------------------------------- /man/figures/README-test_radam-.gif: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-test_radam-.gif -------------------------------------------------------------------------------- /man/figures/README-test_swats-.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-test_swats-.gif -------------------------------------------------------------------------------- /man/figures/README-test_yogi-.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/e-sensing/torchopt/399f27b52ac09105ed4b1b1729ac76db73987d0d/man/figures/README-test_yogi-.gif -------------------------------------------------------------------------------- /man/optim_adabelief.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/adabelief.R 3 | \name{optim_adabelief} 4 | \alias{optim_adabelief} 5 | \title{Adabelief optimizer} 6 | \usage{ 7 | optim_adabelief( 8 | params, 9 | lr = 0.001, 10 | betas = c(0.9, 0.999), 11 | eps = 1e-08, 12 | weight_decay = 1e-06, 13 | weight_decouple = TRUE, 14 | fixed_decay = FALSE, 15 | rectify = TRUE 16 | ) 17 | } 18 | \arguments{ 19 | \item{params}{List of parameters to optimize.} 20 | 21 | \item{lr}{Learning rate (default: 1e-3)} 22 | 23 | \item{betas}{Coefficients for computing running averages 24 | of gradient and its square (default: (0.9, 0.999))} 25 | 26 | \item{eps}{Term added to the denominator to improve numerical 27 | stability (default: 1e-16)} 28 | 29 | \item{weight_decay}{Weight decay (L2 penalty) (default: 0)} 30 | 31 | \item{weight_decouple}{Use decoupled weight decay as is done in AdamW?} 32 | 33 | \item{fixed_decay}{This is used when weight_decouple is set as True. 34 | When fixed_decay == True, weight decay is 35 | W_new = W_old - W_old * decay. 36 | When fixed_decay == False, the weight decay is 37 | W_new = W_old - W_old * decay * learning_rate. 38 | In this case, weight decay decreases with learning rate.} 39 | 40 | \item{rectify}{Perform the rectified update similar to RAdam?} 41 | } 42 | \value{ 43 | A torch optimizer object implementing the \code{step} method. 44 | } 45 | \description{ 46 | R implementation of the adabelief optimizer proposed 47 | by Zhuang et al (2020). We used the pytorch implementation 48 | developed by the authors which is available at 49 | https://github.com/jettify/pytorch-optimizer. 50 | Thanks to Nikolay Novik of his work on python optimizers. 51 | 52 | The original implementation is licensed using the Apache-2.0 software license. 53 | This implementation is also licensed using Apache-2.0 license. 54 | 55 | From the abstract by the paper by Zhuang et al (2021): 56 | We propose Adabelief to simultaneously achieve three goals: 57 | fast convergence as in adaptive methods, good generalization as in SGD, 58 | and training stability. The intuition for AdaBelief is to adapt 59 | the stepsize according to the "belief" in the current gradient direction. 
60 | Viewing the exponential moving average of the noisy gradient 61 | as the prediction of the gradient at the next time step, 62 | if the observed gradient greatly deviates from the prediction, 63 | we distrust the current observation and take a small step; 64 | if the observed gradient is close to the prediction, 65 | we trust it and take a large step. 66 | } 67 | \examples{ 68 | if (torch::torch_is_installed()) { 69 | # function to demonstrate optimization 70 | beale <- function(x, y) { 71 | log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 72 | } 73 | # define optimizer 74 | optim <- torchopt::optim_adabelief 75 | # define hyperparams 76 | opt_hparams <- list(lr = 0.01) 77 | 78 | # starting point 79 | x0 <- 3 80 | y0 <- 3 81 | # create tensor 82 | x <- torch::torch_tensor(x0, requires_grad = TRUE) 83 | y <- torch::torch_tensor(y0, requires_grad = TRUE) 84 | # instantiate optimizer 85 | optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 86 | # run optimizer 87 | steps <- 400 88 | x_steps <- numeric(steps) 89 | y_steps <- numeric(steps) 90 | for (i in seq_len(steps)) { 91 | x_steps[i] <- as.numeric(x) 92 | y_steps[i] <- as.numeric(y) 93 | optim$zero_grad() 94 | z <- beale(x, y) 95 | z$backward() 96 | optim$step() 97 | } 98 | print(paste0("starting value = ", beale(x0, y0))) 99 | print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 100 | } 101 | } 102 | \references{ 103 | Juntang Zhuang, Tommy Tang, Yifan Ding, Sekhar Tatikonda, 104 | Nicha Dvornek, Xenophon Papademetris, James S. Duncan. 105 | "Adabelief Optimizer: Adapting Stepsizes by the Belief in Observed Gradients", 106 | 34th Conference on Neural Information Processing Systems (NeurIPS 2020), 107 | Vancouver, Canada. 108 | https://arxiv.org/abs/2010.07468 109 | } 110 | \author{ 111 | Gilberto Camara, \email{gilberto.camara@inpe.br} 112 | 113 | Rolf Simoes, \email{rolf.simoes@inpe.br} 114 | 115 | Felipe Souza, \email{lipecaso@gmail.com} 116 | 117 | Alber Sanchez, \email{alber.ipia@inpe.br} 118 | } 119 | -------------------------------------------------------------------------------- /man/optim_adabound.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/adabound.R 3 | \name{optim_adabound} 4 | \alias{optim_adabound} 5 | \title{Adabound optimizer} 6 | \usage{ 7 | optim_adabound( 8 | params, 9 | lr = 0.001, 10 | betas = c(0.9, 0.999), 11 | final_lr = 0.1, 12 | gamma = 0.001, 13 | eps = 1e-08, 14 | weight_decay = 0 15 | ) 16 | } 17 | \arguments{ 18 | \item{params}{List of parameters to optimize.} 19 | 20 | \item{lr}{Learning rate (default: 1e-3)} 21 | 22 | \item{betas}{Coefficients computing running averages of gradient 23 | and its square (default: (0.9, 0.999))} 24 | 25 | \item{final_lr}{Final (SGD) learning rate (default: 0.1)} 26 | 27 | \item{gamma}{Convergence speed of the bound functions 28 | (default: 1e-3)} 29 | 30 | \item{eps}{Term added to the denominator to improve numerical 31 | stability (default: 1e-8)} 32 | 33 | \item{weight_decay}{Weight decay (L2 penalty) (default: 0)} 34 | } 35 | \value{ 36 | A torch optimizer object implementing the \code{step} method. 37 | } 38 | \description{ 39 | R implementation of the AdaBound optimizer proposed 40 | by Luo et al.(2019). We used the implementation available at 41 | https://github.com/jettify/pytorch-optimizer/blob/master/torch_optimizer/yogi.py. 
42 | Thanks to Nikolay Novik for providing the pytorch code. 43 | 44 | The original implementation is licensed using the Apache-2.0 software license. 45 | This implementation is also licensed using Apache-2.0 license. 46 | 47 | AdaBound is a variant of the Adam stochastic optimizer which is 48 | designed to be more robust to extreme learning rates. 49 | Dynamic bounds are employed on learning rates, 50 | where the lower and upper bound are initialized as zero and 51 | infinity respectively, and they both smoothly converge to a 52 | constant final step size. AdaBound can be regarded as an adaptive 53 | method at the beginning of training, and thereafter it gradually and 54 | smoothly transforms to SGD (or with momentum) as the time step increases. 55 | } 56 | \examples{ 57 | if (torch::torch_is_installed()) { 58 | # function to demonstrate optimization 59 | beale <- function(x, y) { 60 | log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 61 | } 62 | # define optimizer 63 | optim <- torchopt::optim_adabound 64 | # define hyperparams 65 | opt_hparams <- list(lr = 0.01) 66 | 67 | # starting point 68 | x0 <- 3 69 | y0 <- 3 70 | # create tensor 71 | x <- torch::torch_tensor(x0, requires_grad = TRUE) 72 | y <- torch::torch_tensor(y0, requires_grad = TRUE) 73 | # instantiate optimizer 74 | optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 75 | # run optimizer 76 | steps <- 400 77 | x_steps <- numeric(steps) 78 | y_steps <- numeric(steps) 79 | for (i in seq_len(steps)) { 80 | x_steps[i] <- as.numeric(x) 81 | y_steps[i] <- as.numeric(y) 82 | optim$zero_grad() 83 | z <- beale(x, y) 84 | z$backward() 85 | optim$step() 86 | } 87 | print(paste0("starting value = ", beale(x0, y0))) 88 | print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 89 | } 90 | } 91 | \references{ 92 | Liangchen Luo, Yuanhao Xiong, Yan Liu, Xu Sun, 93 | "Adaptive Gradient Methods with Dynamic Bound of Learning Rate", 94 | International Conference on Learning Representations (ICLR), 2019. 95 | https://arxiv.org/abs/1902.09843 96 | } 97 | \author{ 98 | Rolf Simoes, \email{rolf.simoes@inpe.br} 99 | 100 | Felipe Souza, \email{lipecaso@gmail.com} 101 | 102 | Alber Sanchez, \email{alber.ipia@inpe.br} 103 | 104 | Gilberto Camara, \email{gilberto.camara@inpe.br} 105 | } 106 | -------------------------------------------------------------------------------- /man/optim_adahessian.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/adahessian.R 3 | \name{optim_adahessian} 4 | \alias{optim_adahessian} 5 | \title{Adahessian optimizer} 6 | \usage{ 7 | optim_adahessian( 8 | params, 9 | lr = 0.15, 10 | betas = c(0.9, 0.999), 11 | eps = 1e-04, 12 | weight_decay = 0, 13 | hessian_power = 0.5 14 | ) 15 | } 16 | \arguments{ 17 | \item{params}{Iterable of parameters to optimize.} 18 | 19 | \item{lr}{Learning rate (default: 0.15).} 20 | 21 | \item{betas}{Coefficients for computing 22 | running averages of gradient 23 | and is square(default: (0.9, 0.999)).} 24 | 25 | \item{eps}{Term added to the denominator to improve 26 | numerical stability (default: 1e-4).} 27 | 28 | \item{weight_decay}{L2 penalty (default: 0).} 29 | 30 | \item{hessian_power}{Hessian power (default: 1.0).} 31 | } 32 | \value{ 33 | An optimizer object implementing the \code{step} and \code{zero_grad} methods. 
34 | } 35 | \description{ 36 | R implementation of the Adahessian optimizer proposed 37 | by Yao et al.(2020). The original implementation is available at 38 | https://github.com/amirgholami/adahessian. 39 | } 40 | \references{ 41 | Yao, Z., Gholami, A., Shen, S., Mustafa, M., Keutzer, K., 42 | & Mahoney, M. (2021). 43 | ADAHESSIAN: An Adaptive Second Order Optimizer for Machine Learning. 44 | Proceedings of the AAAI Conference on Artificial Intelligence, 35(12), 45 | 10665-10673. 46 | https://arxiv.org/abs/2006.00719 47 | } 48 | \author{ 49 | Rolf Simoes, \email{rolf.simoes@inpe.br} 50 | 51 | Felipe Souza, \email{lipecaso@gmail.com} 52 | 53 | Alber Sanchez, \email{alber.ipia@inpe.br} 54 | 55 | Gilberto Camara, \email{gilberto.camara@inpe.br} 56 | } 57 | -------------------------------------------------------------------------------- /man/optim_adamw.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/adamw.R 3 | \name{optim_adamw} 4 | \alias{optim_adamw} 5 | \title{AdamW optimizer} 6 | \usage{ 7 | optim_adamw( 8 | params, 9 | lr = 0.01, 10 | betas = c(0.9, 0.999), 11 | eps = 1e-08, 12 | weight_decay = 1e-06 13 | ) 14 | } 15 | \arguments{ 16 | \item{params}{List of parameters to optimize.} 17 | 18 | \item{lr}{Learning rate (default: 1e-3)} 19 | 20 | \item{betas}{Coefficients computing running averages of gradient 21 | and its square (default: (0.9, 0.999))} 22 | 23 | \item{eps}{Term added to the denominator to improve numerical 24 | stability (default: 1e-8)} 25 | 26 | \item{weight_decay}{Weight decay (L2 penalty) (default: 1e-6)} 27 | } 28 | \value{ 29 | A torch optimizer object implementing the \code{step} method. 30 | } 31 | \description{ 32 | R implementation of the AdamW optimizer proposed 33 | by Loshchilov & Hutter (2019). We used the pytorch implementation 34 | developed by Collin Donahue-Oponski available at: 35 | https://gist.github.com/colllin/0b146b154c4351f9a40f741a28bff1e3 36 | 37 | From the abstract by the paper by Loshchilov & Hutter (2019): 38 | L2 regularization and weight decay regularization are equivalent for standard 39 | stochastic gradient descent (when rescaled by the learning rate), 40 | but as we demonstrate this is not the case for adaptive gradient algorithms, 41 | such as Adam. While common implementations of these algorithms 42 | employ L2 regularization (often calling it “weight decay” 43 | in what may be misleading due to the inequivalence we expose), 44 | we propose a simple modification to recover the original formulation of 45 | weight decay regularization by decoupling the weight decay from the optimization 46 | steps taken w.r.t. 
the loss function 47 | } 48 | \examples{ 49 | if (torch::torch_is_installed()) { 50 | # function to demonstrate optimization 51 | beale <- function(x, y) { 52 | log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 53 | } 54 | # define optimizer 55 | optim <- torchopt::optim_adamw 56 | # define hyperparams 57 | opt_hparams <- list(lr = 0.01) 58 | 59 | # starting point 60 | x0 <- 3 61 | y0 <- 3 62 | # create tensor 63 | x <- torch::torch_tensor(x0, requires_grad = TRUE) 64 | y <- torch::torch_tensor(y0, requires_grad = TRUE) 65 | # instantiate optimizer 66 | optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 67 | # run optimizer 68 | steps <- 400 69 | x_steps <- numeric(steps) 70 | y_steps <- numeric(steps) 71 | for (i in seq_len(steps)) { 72 | x_steps[i] <- as.numeric(x) 73 | y_steps[i] <- as.numeric(y) 74 | optim$zero_grad() 75 | z <- beale(x, y) 76 | z$backward() 77 | optim$step() 78 | } 79 | print(paste0("starting value = ", beale(x0, y0))) 80 | print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 81 | } 82 | } 83 | \references{ 84 | Ilya Loshchilov, Frank Hutter, 85 | "Decoupled Weight Decay Regularization", 86 | International Conference on Learning Representations (ICLR) 2019. 87 | https://arxiv.org/abs/1711.05101 88 | } 89 | \author{ 90 | Gilberto Camara, \email{gilberto.camara@inpe.br} 91 | 92 | Rolf Simoes, \email{rolf.simoes@inpe.br} 93 | 94 | Felipe Souza, \email{lipecaso@gmail.com} 95 | 96 | Alber Sanchez, \email{alber.ipia@inpe.br} 97 | } 98 | -------------------------------------------------------------------------------- /man/optim_madgrad.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/madgrad.R 3 | \name{optim_madgrad} 4 | \alias{optim_madgrad} 5 | \title{MADGRAD optimizer} 6 | \usage{ 7 | optim_madgrad(params, lr = 0.01, momentum = 0.9, weight_decay = 0, eps = 1e-06) 8 | } 9 | \arguments{ 10 | \item{params}{List of parameters to optimize.} 11 | 12 | \item{lr}{Learning rate (default: 1e-2).} 13 | 14 | \item{momentum}{Momentum value in the range [0,1) (default: 0.9).} 15 | 16 | \item{weight_decay}{Weight decay, i.e. a L2 penalty (default: 0).} 17 | 18 | \item{eps}{Term added to the denominator outside of 19 | the root operation to improve numerical stability 20 | (default: 1e-6).} 21 | } 22 | \value{ 23 | A torch optimizer object implementing the \code{step} method. 24 | } 25 | \description{ 26 | A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic 27 | Optimization (MADGRAD) is a general purpose optimizer that 28 | can be used in place of SGD or Adam may converge faster and generalize 29 | better. Currently GPU-only. Typically, the same learning rate schedule 30 | that is used for SGD or Adam may be used. The overall learning rate is 31 | not comparable to either method and should be determined by a 32 | hyper-parameter sweep. 33 | 34 | MADGRAD requires less weight decay than other methods, often as little as 35 | zero. Momentum values used for SGD or Adam's beta1 should work here also. 36 | 37 | On sparse problems both weight_decay and momentum should be set to 0. 38 | (not yet supported in the R implementation). 
39 | } 40 | \examples{ 41 | if (torch::torch_is_installed()) { 42 | # function to demonstrate optimization 43 | beale <- function(x, y) { 44 | log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 45 | } 46 | # define optimizer 47 | optim <- torchopt::optim_madgrad 48 | # define hyperparams 49 | opt_hparams <- list(lr = 0.01) 50 | 51 | # starting point 52 | x0 <- 3 53 | y0 <- 3 54 | # create tensor 55 | x <- torch::torch_tensor(x0, requires_grad = TRUE) 56 | y <- torch::torch_tensor(y0, requires_grad = TRUE) 57 | # instantiate optimizer 58 | optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 59 | # run optimizer 60 | steps <- 400 61 | x_steps <- numeric(steps) 62 | y_steps <- numeric(steps) 63 | for (i in seq_len(steps)) { 64 | x_steps[i] <- as.numeric(x) 65 | y_steps[i] <- as.numeric(y) 66 | optim$zero_grad() 67 | z <- beale(x, y) 68 | z$backward() 69 | optim$step() 70 | } 71 | print(paste0("starting value = ", beale(x0, y0))) 72 | print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 73 | } 74 | } 75 | \references{ 76 | Aaron Defazio, Samy Jelassi, 77 | "Adaptivity without Compromise: A Momentumized, Adaptive, Dual 78 | Averaged Gradient Method for Stochastic Optimization". 79 | https://arxiv.org/abs/2101.11075 80 | } 81 | \author{ 82 | Daniel Falbel, \email{dfalbel@gmail.com} 83 | } 84 | -------------------------------------------------------------------------------- /man/optim_nadam.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/nadam.R 3 | \name{optim_nadam} 4 | \alias{optim_nadam} 5 | \title{Nadam optimizer} 6 | \usage{ 7 | optim_nadam( 8 | params, 9 | lr = 0.002, 10 | betas = c(0.9, 0.999), 11 | eps = 1e-08, 12 | weight_decay = 0, 13 | momentum_decay = 0.004 14 | ) 15 | } 16 | \arguments{ 17 | \item{params}{List of parameters to optimize.} 18 | 19 | \item{lr}{Learning rate (default: 1e-3)} 20 | 21 | \item{betas}{Coefficients computing running averages of gradient 22 | and its square (default: (0.9, 0.999)).} 23 | 24 | \item{eps}{Term added to the denominator to improve numerical 25 | stability (default: 1e-8).} 26 | 27 | \item{weight_decay}{Weight decay (L2 penalty) (default: 0).} 28 | 29 | \item{momentum_decay}{Momentum_decay (default: 4e-3).} 30 | } 31 | \value{ 32 | A torch optimizer object implementing the \code{step} method. 33 | } 34 | \description{ 35 | R implementation of the Nadam optimizer proposed 36 | by Dazat (2016). 37 | 38 | From the abstract by the paper by Dozat (2016): 39 | This work aims to improve upon the recently proposed and 40 | rapidly popularized optimization algorithm Adam (Kingma & Ba, 2014). 41 | Adam has two main components—a momentum component and an adaptive 42 | learning rate component. However, regular momentum can be shown conceptually 43 | and empirically to be inferior to a similar algorithm known as 44 | Nesterov’s accelerated gradient (NAG). 
45 | } 46 | \examples{ 47 | if (torch::torch_is_installed()) { 48 | # function to demonstrate optimization 49 | beale <- function(x, y) { 50 | log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 51 | } 52 | # define optimizer 53 | optim <- torchopt::optim_nadam 54 | # define hyperparams 55 | opt_hparams <- list(lr = 0.01) 56 | 57 | # starting point 58 | x0 <- 3 59 | y0 <- 3 60 | # create tensor 61 | x <- torch::torch_tensor(x0, requires_grad = TRUE) 62 | y <- torch::torch_tensor(y0, requires_grad = TRUE) 63 | # instantiate optimizer 64 | optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 65 | # run optimizer 66 | steps <- 400 67 | x_steps <- numeric(steps) 68 | y_steps <- numeric(steps) 69 | for (i in seq_len(steps)) { 70 | x_steps[i] <- as.numeric(x) 71 | y_steps[i] <- as.numeric(y) 72 | optim$zero_grad() 73 | z <- beale(x, y) 74 | z$backward() 75 | optim$step() 76 | } 77 | print(paste0("starting value = ", beale(x0, y0))) 78 | print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 79 | } 80 | } 81 | \references{ 82 | Timothy Dozat, 83 | "Incorporating Nesterov Momentum into Adam", 84 | International Conference on Learning Representations (ICLR) 2016. 85 | https://openreview.net/pdf/OM0jvwB8jIp57ZJjtNEZ.pdf 86 | } 87 | \author{ 88 | Gilberto Camara, \email{gilberto.camara@inpe.br} 89 | 90 | Rolf Simoes, \email{rolf.simoes@inpe.br} 91 | 92 | Felipe Souza, \email{lipecaso@gmail.com} 93 | 94 | Alber Sanchez, \email{alber.ipia@inpe.br} 95 | } 96 | -------------------------------------------------------------------------------- /man/optim_qhadam.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/qhadam.R 3 | \name{optim_qhadam} 4 | \alias{optim_qhadam} 5 | \title{QHAdam optimization algorithm} 6 | \usage{ 7 | optim_qhadam( 8 | params, 9 | lr = 0.01, 10 | betas = c(0.9, 0.999), 11 | eps = 0.001, 12 | nus = c(1, 1), 13 | weight_decay = 0, 14 | decouple_weight_decay = FALSE 15 | ) 16 | } 17 | \arguments{ 18 | \item{params}{List of parameters to optimize.} 19 | 20 | \item{lr}{Learning rate (default: 1e-3)} 21 | 22 | \item{betas}{Coefficients computing running averages of gradient 23 | and its square (default: (0.9, 0.999))} 24 | 25 | \item{eps}{Term added to the denominator to improve numerical 26 | stability (default: 1e-8)} 27 | 28 | \item{nus}{Immediate discount factors used to 29 | estimate the gradient and its square 30 | (default: (1.0, 1.0))} 31 | 32 | \item{weight_decay}{Weight decay (L2 penalty) (default: 0)} 33 | 34 | \item{decouple_weight_decay}{Whether to decouple the weight 35 | decay from the gradient-based optimization step.} 36 | } 37 | \value{ 38 | A torch optimizer object implementing the \code{step} method. 39 | } 40 | \description{ 41 | R implementation of the QHAdam optimizer proposed 42 | by Ma and Yarats(2019). We used the implementation available at 43 | https://github.com/jettify/pytorch-optimizer/blob/master/torch_optimizer/qhadam.py. 44 | Thanks to Nikolay Novik for providing the pytorch code. 45 | 46 | The original implementation has been developed by Facebook AI 47 | and is licensed using the MIT license. 48 | 49 | From the the paper by Ma and Yarats(2019): 50 | QHAdam is a QH augmented version of Adam, where we 51 | replace both of Adam's moment estimators with quasi-hyperbolic terms. 
52 | QHAdam decouples the momentum term from the current gradient when 53 | updating the weights, and decouples the mean squared gradients 54 | term from the current squared gradient when updating the weights. 55 | } 56 | \examples{ 57 | if (torch::torch_is_installed()) { 58 | # function to demonstrate optimization 59 | beale <- function(x, y) { 60 | log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 61 | } 62 | # define optimizer 63 | optim <- torchopt::optim_qhadam 64 | # define hyperparams 65 | opt_hparams <- list(lr = 0.01) 66 | 67 | # starting point 68 | x0 <- 3 69 | y0 <- 3 70 | # create tensor 71 | x <- torch::torch_tensor(x0, requires_grad = TRUE) 72 | y <- torch::torch_tensor(y0, requires_grad = TRUE) 73 | # instantiate optimizer 74 | optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 75 | # run optimizer 76 | steps <- 400 77 | x_steps <- numeric(steps) 78 | y_steps <- numeric(steps) 79 | for (i in seq_len(steps)) { 80 | x_steps[i] <- as.numeric(x) 81 | y_steps[i] <- as.numeric(y) 82 | optim$zero_grad() 83 | z <- beale(x, y) 84 | z$backward() 85 | optim$step() 86 | } 87 | print(paste0("starting value = ", beale(x0, y0))) 88 | print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 89 | } 90 | 91 | } 92 | \references{ 93 | Jerry Ma, Denis Yarats, 94 | "Quasi-hyperbolic momentum and Adam for deep learning". 95 | https://arxiv.org/abs/1810.06801 96 | } 97 | \author{ 98 | Gilberto Camara, \email{gilberto.camara@inpe.br} 99 | 100 | Daniel Falbel, \email{daniel.falbel@gmail.com} 101 | 102 | Rolf Simoes, \email{rolf.simoes@inpe.br} 103 | 104 | Felipe Souza, \email{lipecaso@gmail.com} 105 | 106 | Alber Sanchez, \email{alber.ipia@inpe.br} 107 | } 108 | -------------------------------------------------------------------------------- /man/optim_radam.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/radam.R 3 | \name{optim_radam} 4 | \alias{optim_radam} 5 | \title{RAdam optimizer} 6 | \usage{ 7 | optim_radam( 8 | params, 9 | lr = 0.01, 10 | betas = c(0.9, 0.999), 11 | eps = 1e-08, 12 | weight_decay = 0 13 | ) 14 | } 15 | \arguments{ 16 | \item{params}{List of parameters to optimize.} 17 | 18 | \item{lr}{Learning rate (default: 1e-3)} 19 | 20 | \item{betas}{Coefficients computing running averages of gradient 21 | and its square (default: (0.9, 0.999))} 22 | 23 | \item{eps}{Term added to the denominator to improve numerical 24 | stability (default: 1e-8)} 25 | 26 | \item{weight_decay}{Weight decay (L2 penalty) (default: 0)} 27 | } 28 | \value{ 29 | A torch optimizer object implementing the \code{step} method. 30 | } 31 | \description{ 32 | R implementation of the RAdam optimizer proposed 33 | by Liu et al. (2019). 34 | We used the implementation in PyTorch as a basis for our 35 | implementation. 36 | 37 | From the abstract by the paper by Liu et al. (2019): 38 | The learning rate warmup heuristic achieves remarkable success 39 | in stabilizing training, accelerating convergence and improving 40 | generalization for adaptive stochastic optimization algorithms 41 | like RMSprop and Adam. Here, we study its mechanism in details.
42 | Pursuing the theory behind warmup, we identify a problem of the 43 | adaptive learning rate (i.e., it has problematically large variance 44 | in the early stage), suggest warmup works as a variance reduction 45 | technique, and provide both empirical and theoretical evidence to verify 46 | our hypothesis. We further propose RAdam, a new variant of Adam, 47 | by introducing a term to rectify the variance of the adaptive learning rate. 48 | Extensive experimental results on image classification, language modeling, 49 | and neural machine translation verify our intuition and demonstrate 50 | the effectiveness and robustness of our proposed method. 51 | } 52 | \examples{ 53 | if (torch::torch_is_installed()) { 54 | # function to demonstrate optimization 55 | beale <- function(x, y) { 56 | log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 57 | } 58 | # define optimizer 59 | optim <- torchopt::optim_radam 60 | # define hyperparams 61 | opt_hparams <- list(lr = 0.01) 62 | 63 | # starting point 64 | x0 <- 3 65 | y0 <- 3 66 | # create tensor 67 | x <- torch::torch_tensor(x0, requires_grad = TRUE) 68 | y <- torch::torch_tensor(y0, requires_grad = TRUE) 69 | # instantiate optimizer 70 | optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 71 | # run optimizer 72 | steps <- 400 73 | x_steps <- numeric(steps) 74 | y_steps <- numeric(steps) 75 | for (i in seq_len(steps)) { 76 | x_steps[i] <- as.numeric(x) 77 | y_steps[i] <- as.numeric(y) 78 | optim$zero_grad() 79 | z <- beale(x, y) 80 | z$backward() 81 | optim$step() 82 | } 83 | print(paste0("starting value = ", beale(x0, y0))) 84 | print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 85 | } 86 | } 87 | \references{ 88 | Liyuan Liu, Haoming Jiang, Pengcheng He, Weizhu Chen, 89 | Xiaodong Liu, Jianfeng Gao, Jiawei Han, 90 | "On the Variance of the Adaptive Learning Rate and Beyond", 91 | International Conference on Learning Representations (ICLR) 2020. 92 | https://arxiv.org/abs/1908.03265 93 | } 94 | \author{ 95 | Gilberto Camara, \email{gilberto.camara@inpe.br} 96 | 97 | Daniel Falbel, \email{daniel.falble@gmail.com} 98 | 99 | Rolf Simoes, \email{rolf.simoes@inpe.br} 100 | 101 | Felipe Souza, \email{lipecaso@gmail.com} 102 | 103 | Alber Sanchez, \email{alber.ipia@inpe.br} 104 | } 105 | -------------------------------------------------------------------------------- /man/optim_swats.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/swats.R 3 | \name{optim_swats} 4 | \alias{optim_swats} 5 | \title{SWATS optimizer} 6 | \usage{ 7 | optim_swats( 8 | params, 9 | lr = 0.01, 10 | betas = c(0.9, 0.999), 11 | eps = 1e-08, 12 | weight_decay = 0, 13 | nesterov = FALSE 14 | ) 15 | } 16 | \arguments{ 17 | \item{params}{List of parameters to optimize.} 18 | 19 | \item{lr}{Learning rate (default: 1e-3)} 20 | 21 | \item{betas}{Coefficients computing running averages of gradient 22 | and its square (default: (0.9, 0.999)).} 23 | 24 | \item{eps}{Term added to the denominator to improve numerical 25 | stability (default: 1e-8).} 26 | 27 | \item{weight_decay}{Weight decay (L2 penalty) (default: 0).} 28 | 29 | \item{nesterov}{Enables Nesterov momentum (default: False).} 30 | } 31 | \value{ 32 | A torch optimizer object implementing the \code{step} method. 33 | } 34 | \description{ 35 | R implementation of the SWATS optimizer proposed 36 | by Shekar and Sochee (2018). 
37 | We used the implementation available at 38 | https://github.com/jettify/pytorch-optimizer/ 39 | Thanks to Nikolay Novik for providing the pytorch code. 40 | 41 | From the abstract by the paper by Shekar and Sochee (2018): 42 | Adaptive optimization methods such as Adam, Adagrad or RMSprop 43 | have been found to generalize poorly compared to 44 | Stochastic gradient descent (SGD). These methods tend to perform well i 45 | in the initial portion of training but are outperformed by SGD at 46 | later stages of training. We investigate a hybrid strategy that begins 47 | training with an adaptive method and switches to SGD 48 | when a triggering condition is satisfied. 49 | The condition we propose relates to the projection of Adam 50 | steps on the gradient subspace. By design, the monitoring process 51 | for this condition adds very little overhead and does not increase 52 | the number of hyperparameters in the optimizer. 53 | } 54 | \examples{ 55 | if (torch::torch_is_installed()) { 56 | # function to demonstrate optimization 57 | beale <- function(x, y) { 58 | log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 59 | } 60 | # define optimizer 61 | optim <- torchopt::optim_swats 62 | # define hyperparams 63 | opt_hparams <- list(lr = 0.01) 64 | 65 | # starting point 66 | x0 <- 3 67 | y0 <- 3 68 | # create tensor 69 | x <- torch::torch_tensor(x0, requires_grad = TRUE) 70 | y <- torch::torch_tensor(y0, requires_grad = TRUE) 71 | # instantiate optimizer 72 | optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 73 | # run optimizer 74 | steps <- 400 75 | x_steps <- numeric(steps) 76 | y_steps <- numeric(steps) 77 | for (i in seq_len(steps)) { 78 | x_steps[i] <- as.numeric(x) 79 | y_steps[i] <- as.numeric(y) 80 | optim$zero_grad() 81 | z <- beale(x, y) 82 | z$backward() 83 | optim$step() 84 | } 85 | print(paste0("starting value = ", beale(x0, y0))) 86 | print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 87 | } 88 | } 89 | \references{ 90 | Nitish Shirish Keskar, Richard Socher 91 | "Improving Generalization Performance by Switching from Adam to SGD". 92 | International Conference on Learning Representations (ICLR) 2018. 
93 | https://arxiv.org/abs/1712.07628 94 | } 95 | \author{ 96 | Gilberto Camara, \email{gilberto.camara@inpe.br} 97 | 98 | Daniel Falbel, \email{daniel.falble@gmail.com} 99 | 100 | Rolf Simoes, \email{rolf.simoes@inpe.br} 101 | 102 | Felipe Souza, \email{lipecaso@gmail.com} 103 | 104 | Alber Sanchez, \email{alber.ipia@inpe.br} 105 | } 106 | -------------------------------------------------------------------------------- /man/optim_yogi.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/yogi.R 3 | \name{optim_yogi} 4 | \alias{optim_yogi} 5 | \title{Yogi optimizer} 6 | \usage{ 7 | optim_yogi( 8 | params, 9 | lr = 0.01, 10 | betas = c(0.9, 0.999), 11 | eps = 0.001, 12 | initial_accumulator = 1e-06, 13 | weight_decay = 0 14 | ) 15 | } 16 | \arguments{ 17 | \item{params}{List of parameters to optimize.} 18 | 19 | \item{lr}{Learning rate (default: 1e-3)} 20 | 21 | \item{betas}{Coefficients computing running averages of gradient 22 | and its square (default: (0.9, 0.999))} 23 | 24 | \item{eps}{Term added to the denominator to improve numerical 25 | stability (default: 1e-8)} 26 | 27 | \item{initial_accumulator}{Initial values for first and 28 | second moments.} 29 | 30 | \item{weight_decay}{Weight decay (L2 penalty) (default: 0)} 31 | } 32 | \value{ 33 | A torch optimizer object implementing the \code{step} method. 34 | } 35 | \description{ 36 | R implementation of the Yogi optimizer proposed 37 | by Zaheer et al.(2019). We used the implementation available at 38 | https://github.com/jettify/pytorch-optimizer/blob/master/torch_optimizer/yogi.py. 39 | Thanks to Nikolay Novik for providing the pytorch code. 40 | 41 | The original implementation is licensed using the Apache-2.0 software license. 42 | This implementation is also licensed using Apache-2.0 license. 43 | 44 | From the abstract by the paper by Zaheer et al.(2019): 45 | Adaptive gradient methods that rely on scaling gradients 46 | down by the square root of exponential moving averages 47 | of past squared gradients, such RMSProp, Adam, Adadelta have 48 | found wide application in optimizing the nonconvex problems 49 | that arise in deep learning. However, it has been recently 50 | demonstrated that such methods can fail to converge even 51 | in simple convex optimization settings. 52 | Yogi is a new adaptive optimization algorithm, 53 | which controls the increase in effective learning rate, 54 | leading to even better performance with similar theoretical 55 | guarantees on convergence. Extensive experiments show that 56 | Yogi with very little hyperparameter tuning outperforms 57 | methods such as Adam in several challenging machine learning tasks. 
58 | } 59 | \examples{ 60 | if (torch::torch_is_installed()) { 61 | # function to demonstrate optimization 62 | beale <- function(x, y) { 63 | log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 64 | } 65 | # define optimizer 66 | optim <- torchopt::optim_yogi 67 | # define hyperparams 68 | opt_hparams <- list(lr = 0.01) 69 | 70 | # starting point 71 | x0 <- 3 72 | y0 <- 3 73 | # create tensor 74 | x <- torch::torch_tensor(x0, requires_grad = TRUE) 75 | y <- torch::torch_tensor(y0, requires_grad = TRUE) 76 | # instantiate optimizer 77 | optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 78 | # run optimizer 79 | steps <- 400 80 | x_steps <- numeric(steps) 81 | y_steps <- numeric(steps) 82 | for (i in seq_len(steps)) { 83 | x_steps[i] <- as.numeric(x) 84 | y_steps[i] <- as.numeric(y) 85 | optim$zero_grad() 86 | z <- beale(x, y) 87 | z$backward() 88 | optim$step() 89 | } 90 | print(paste0("starting value = ", beale(x0, y0))) 91 | print(paste0("final value = ", beale(x_steps[steps], y_steps[steps]))) 92 | } 93 | } 94 | \references{ 95 | Manzil Zaheer, Sashank Reddi, Devendra Sachan, Satyen Kale, Sanjiv Kumar, 96 | "Adaptive Methods for Nonconvex Optimization", 97 | Advances in Neural Information Processing Systems 31 (NeurIPS 2018). 98 | https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization 99 | } 100 | \author{ 101 | Gilberto Camara, \email{gilberto.camara@inpe.br} 102 | 103 | Rolf Simoes, \email{rolf.simoes@inpe.br} 104 | 105 | Felipe Souza, \email{lipecaso@gmail.com} 106 | 107 | Alber Sanchez, \email{alber.ipia@inpe.br} 108 | } 109 | -------------------------------------------------------------------------------- /man/state-set.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils-state.R 3 | \name{state<-} 4 | \alias{state<-} 5 | \title{Imported function} 6 | \usage{ 7 | state(self) <- value 8 | } 9 | \description{ 10 | Code lifted from a internal function of madgrad package. 11 | Set 'state' attribute of an object. 12 | } 13 | \author{ 14 | Daniel Falbel, \email{dfalbel@gmail.com} 15 | } 16 | \keyword{internal} 17 | -------------------------------------------------------------------------------- /man/state.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils-state.R 3 | \name{state} 4 | \alias{state} 5 | \title{Imported function} 6 | \usage{ 7 | state(self) 8 | } 9 | \description{ 10 | Code lifted from a internal function of madgrad package. 11 | Get 'state' attribute of an object. 
12 | } 13 | \author{ 14 | Daniel Falbel, \email{dfalbel@gmail.com} 15 | } 16 | \keyword{internal} 17 | -------------------------------------------------------------------------------- /man/test_optim.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/utils-testopt.R 3 | \name{test_optim} 4 | \alias{test_optim} 5 | \title{Test optimization function} 6 | \usage{ 7 | test_optim( 8 | optim, 9 | ..., 10 | opt_hparams = list(), 11 | test_fn = "beale", 12 | steps = 200, 13 | pt_start_color = "#5050FF7F", 14 | pt_end_color = "#FF5050FF", 15 | ln_color = "#FF0000FF", 16 | ln_weight = 2, 17 | bg_xy_breaks = 100, 18 | bg_z_breaks = 32, 19 | bg_palette = "viridis", 20 | ct_levels = 10, 21 | ct_labels = FALSE, 22 | ct_color = "#FFFFFF7F", 23 | plot_each_step = FALSE 24 | ) 25 | } 26 | \arguments{ 27 | \item{optim}{Torch optimizer function.} 28 | 29 | \item{...}{Additional parameters (passed to \code{image} function).} 30 | 31 | \item{opt_hparams}{A list with optimizer initialization parameters (default: \code{list()}). 32 | If missing, for each optimizer its individual defaults will be used.} 33 | 34 | \item{test_fn}{A test function (default \code{"beale"}). You can also pass 35 | a list with 2 elements. The first should be a function that will be optimized 36 | and the second is a function that returns a named vector with \code{x0}, \code{y0} 37 | (the starting points) and \code{xmax}, \code{xmin}, \code{ymax} and \code{ymin} (the domain). 38 | An example: \code{c(x0 = x0, y0 = y0, xmax = 5, xmin = -5, ymax = 5, ymin = -5)}} 39 | 40 | \item{steps}{Number of steps to run (default \code{200}).} 41 | 42 | \item{pt_start_color}{Starting point color (default \code{"#5050FF7F"})} 43 | 44 | \item{pt_end_color}{Ending point color (default \code{"#FF5050FF"})} 45 | 46 | \item{ln_color}{Line path color (default \code{"#FF0000FF"})} 47 | 48 | \item{ln_weight}{Line path weight (default \code{2})} 49 | 50 | \item{bg_xy_breaks}{Background X and Y resolution (default \code{100})} 51 | 52 | \item{bg_z_breaks}{Background Z resolution (default \code{32})} 53 | 54 | \item{bg_palette}{Background palette (default \code{"viridis"})} 55 | 56 | \item{ct_levels}{Contour levels (default \code{10})} 57 | 58 | \item{ct_labels}{Should show contour labels? (default \code{FALSE})} 59 | 60 | \item{ct_color}{Contour color (default \code{"#FFFFFF7F"})} 61 | 62 | \item{plot_each_step}{Should output each step? (default \code{FALSE})} 63 | } 64 | \value{ 65 | No return value, called for producing animated gifs 66 | } 67 | \description{ 68 | \code{test_optim()} function is useful to visualize how optimizers solve the 69 | minimization problem by showing the convergence path using a test function. 70 | User can choose any test optimization 71 | \href{https://en.wikipedia.org/wiki/Test_functions_for_optimization}{functions} 72 | provided by \code{torchopt}: 73 | 74 | \code{"beale"}, \code{"booth"}, \code{"bukin_n6"}, \code{"easom"}, \code{"goldstein_price"}, 75 | \code{"himmelblau"}, \code{"levi_n13"}, \code{"matyas"}, \code{"rastrigin"}, 76 | \code{"rosenbrock"}, and \code{"sphere"}. 77 | 78 | Besides these functions, users can pass any function that receives two 79 | numerical values and returns a scalar. 80 | 81 | Optimization functions are useful to evaluate characteristics of optimization 82 | algorithms, such as convergence rate, precision, robustness, and performance. 
83 | These functions give an idea about the different situations that optimization 84 | algorithms can face. 85 | 86 | Function \code{test_function()} plots the 2D space of a test optimization function. 87 | } 88 | \author{ 89 | Rolf Simoes, \email{rolf.simoes@inpe.br} 90 | } 91 | -------------------------------------------------------------------------------- /man/torchopt-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/torchopt-package.R 3 | \docType{package} 4 | \name{torchopt-package} 5 | \alias{torchopt} 6 | \alias{torchopt-package} 7 | \title{torchopt: Advanced Optimizers for Torch} 8 | \description{ 9 | Optimizers for 'torch' deep learning library. These functions include recent results published in the literature and are not part of the optimizers offered in 'torch'. Prospective users should test these optimizers with their data, since performance depends on the specific problem being solved. The package includes the following optimizers: (a) 'adabelief' by Zhuang et al. (2020), \href{https://arxiv.org/abs/2010.07468}{arXiv:2010.07468}; (b) 'adabound' by Luo et al. (2019), \href{https://arxiv.org/abs/1902.09843}{arXiv:1902.09843}; (c) 'adahessian' by Yao et al. (2021), \href{https://arxiv.org/abs/2006.00719}{arXiv:2006.00719}; (d) 'adamw' by Loshchilov & Hutter (2019), \href{https://arxiv.org/abs/1711.05101}{arXiv:1711.05101}; (e) 'madgrad' by Defazio and Jelassi (2021), \href{https://arxiv.org/abs/2101.11075}{arXiv:2101.11075}; (f) 'nadam' by Dozat (2016), \url{https://openreview.net/pdf/OM0jvwB8jIp57ZJjtNEZ.pdf}; (g) 'qhadam' by Ma and Yarats (2019), \href{https://arxiv.org/abs/1810.06801}{arXiv:1810.06801}; (h) 'radam' by Liu et al. (2019), \href{https://arxiv.org/abs/1908.03265}{arXiv:1908.03265}; (i) 'swats' by Keskar and Socher (2018), \href{https://arxiv.org/abs/1712.07628}{arXiv:1712.07628}; (j) 'yogi' by Zaheer et al. (2019), .
10 | } 11 | \seealso{ 12 | Useful links: 13 | \itemize{ 14 | \item \url{https://github.com/e-sensing/torchopt/} 15 | } 16 | 17 | } 18 | \author{ 19 | \strong{Maintainer}: Gilberto Camara \email{gilberto.camara@inpe.br} 20 | 21 | Authors: 22 | \itemize{ 23 | \item Rolf Simoes \email{rolf.simoes@inpe.br} 24 | \item Daniel Falbel \email{daniel.falbel@gmail.com} 25 | \item Felipe Souza \email{felipe.carvalho@inpe.br} 26 | \item Alber Sanchez \email{alber.ipia@inpe.br} 27 | } 28 | 29 | } 30 | \keyword{internal} 31 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(torchopt) 3 | test_check("torchopt") 4 | 5 | -------------------------------------------------------------------------------- /tests/testthat/test-optimizers.R: -------------------------------------------------------------------------------- 1 | library(torchopt) 2 | beale <- function(x, y) { 3 | log((1.5 - x + x * y)^2 + (2.25 - x - x * y^2)^2 + (2.625 - x + x * y^3)^2) 4 | } 5 | test_optim_valid <- function(optim, 6 | opt_hparams = list(lr = 0.01), 7 | test_fn = "beale", 8 | steps = 100) { 9 | 10 | # get starting points 11 | domain_fn <- get(paste0("domain_",test_fn), 12 | envir = asNamespace("torchopt"), 13 | inherits = FALSE) 14 | # get gradient function 15 | test_fn <- get(test_fn, 16 | envir = asNamespace("torchopt"), 17 | inherits = FALSE) 18 | 19 | # starting point 20 | dom <- domain_fn() 21 | x0 <- dom[["x0"]] 22 | y0 <- dom[["y0"]] 23 | 24 | # create tensor 25 | x <- torch::torch_tensor(x0, requires_grad = TRUE) 26 | y <- torch::torch_tensor(y0, requires_grad = TRUE) 27 | 28 | # instantiate optimizer 29 | optim <- do.call(optim, c(list(params = list(x, y)), opt_hparams)) 30 | 31 | # run optimizer 32 | x_steps <- numeric(steps) 33 | y_steps <- numeric(steps) 34 | for (i in seq_len(steps)) { 35 | x_steps[i] <- as.numeric(x) 36 | y_steps[i] <- as.numeric(y) 37 | optim$zero_grad() 38 | z <- test_fn(x, y) 39 | z$backward() 40 | optim$step() 41 | } 42 | return(list(x_steps = x_steps, 43 | y_steps = y_steps)) 44 | } 45 | test_that("adamw optimizer", { 46 | testthat::skip_on_cran() 47 | set.seed(12345) 48 | xy <- test_optim_valid( 49 | optim = torchopt::optim_adamw, 50 | opt_hparams = list(lr = 0.05), 51 | steps = 400, 52 | test_fn = "beale" 53 | ) 54 | 55 | x0 <- xy[[1]][1] 56 | y0 <- xy[[2]][1] 57 | x400 <- xy[[1]][400] 58 | y400 <- xy[[2]][400] 59 | test_fn0 <- beale(x0, y0) 60 | test_fn400 <- beale(x400, y400) 61 | 62 | expect_true(test_fn0 > test_fn400) 63 | }) 64 | 65 | test_that("adabelief optimizer", { 66 | testthat::skip_on_cran() 67 | set.seed(12345) 68 | xy <- test_optim_valid( 69 | optim = optim_adabelief, 70 | opt_hparams = list(lr = 0.5), 71 | steps = 400, 72 | test_fn = "beale" 73 | ) 74 | test_fn0 <- beale(xy[[1]][1], xy[[2]][1]) 75 | test_fn400 <- beale(xy[[1]][400], xy[[2]][400]) 76 | 77 | expect_true(test_fn0 > test_fn400) 78 | }) 79 | 80 | test_that("adabound optimizer", { 81 | testthat::skip_on_cran() 82 | set.seed(12345) 83 | xy <- test_optim_valid( 84 | optim = optim_adabound, 85 | opt_hparams = list(lr = 0.5), 86 | steps = 400, 87 | test_fn = "beale" 88 | ) 89 | test_fn0 <- beale(xy[[1]][1], xy[[2]][1]) 90 | test_fn400 <- beale(xy[[1]][400], xy[[2]][400]) 91 | 92 | expect_true(test_fn0 > test_fn400) 93 | 94 | }) 95 | test_that("madgrad optimizer", { 96 | testthat::skip_on_cran() 97 | set.seed(12345) 98 | xy <- test_optim_valid( 99 | optim = 
optim_madgrad, 100 | opt_hparams = list(lr = 0.1), 101 | steps = 400, 102 | test_fn = "beale" 103 | ) 104 | test_fn0 <- beale(xy[[1]][1], xy[[2]][1]) 105 | test_fn400 <- beale(xy[[1]][400], xy[[2]][400]) 106 | 107 | expect_true(test_fn0 > test_fn400) 108 | 109 | }) 110 | 111 | test_that("nadam optimizer", { 112 | testthat::skip_on_cran() 113 | set.seed(12345) 114 | xy <- test_optim_valid( 115 | optim = optim_nadam, 116 | opt_hparams = list(lr = 0.1), 117 | steps = 400, 118 | test_fn = "beale" 119 | ) 120 | test_fn0 <- beale(xy[[1]][1], xy[[2]][1]) 121 | test_fn400 <- beale(xy[[1]][400], xy[[2]][400]) 122 | 123 | expect_true(test_fn0 > test_fn400) 124 | 125 | }) 126 | test_that("qhadam optimizer", { 127 | testthat::skip_on_cran() 128 | set.seed(12345) 129 | xy <- test_optim_valid( 130 | optim = optim_qhadam, 131 | opt_hparams = list(lr = 0.1), 132 | steps = 400, 133 | test_fn = "beale" 134 | ) 135 | test_fn0 <- beale(xy[[1]][1], xy[[2]][1]) 136 | test_fn400 <- beale(xy[[1]][400], xy[[2]][400]) 137 | 138 | expect_true(test_fn0 > test_fn400) 139 | 140 | }) 141 | test_that("radam optimizer", { 142 | testthat::skip_on_cran() 143 | set.seed(12345) 144 | xy <- test_optim_valid( 145 | optim = optim_radam, 146 | opt_hparams = list(lr = 0.1), 147 | steps = 400, 148 | test_fn = "beale" 149 | ) 150 | test_fn0 <- beale(xy[[1]][1], xy[[2]][1]) 151 | test_fn400 <- beale(xy[[1]][400], xy[[2]][400]) 152 | 153 | expect_true(test_fn0 > test_fn400) 154 | 155 | }) 156 | test_that("swats optimizer", { 157 | testthat::skip_on_cran() 158 | set.seed(234) 159 | xy <- test_optim_valid( 160 | optim = optim_swats, 161 | opt_hparams = list(lr = 0.1), 162 | steps = 400, 163 | test_fn = "beale" 164 | ) 165 | test_fn0 <- beale(xy[[1]][1], xy[[2]][1]) 166 | test_fn400 <- beale(xy[[1]][400], xy[[2]][400]) 167 | 168 | expect_true(test_fn0 > test_fn400) 169 | 170 | }) 171 | test_that("yogi optimizer", { 172 | testthat::skip_on_cran() 173 | set.seed(66) 174 | xy <- test_optim_valid( 175 | optim = optim_yogi, 176 | opt_hparams = list(lr = 0.1), 177 | steps = 400, 178 | test_fn = "beale" 179 | ) 180 | test_fn0 <- beale(xy[[1]][1], xy[[2]][1]) 181 | test_fn400 <- beale(xy[[1]][400], xy[[2]][400]) 182 | 183 | expect_true(test_fn0 > test_fn400) 184 | 185 | }) 186 | -------------------------------------------------------------------------------- /tests/testthat/test-utils-testopt.R: -------------------------------------------------------------------------------- 1 | test_that("can use custom functions with test_opt", { 2 | testthat::skip_on_cran() 3 | set.seed(1) 4 | expect_error(regexp = NA,{ 5 | test_optim( 6 | optim = optim_adamw, 7 | test_fn = list(beale, domain_beale), 8 | opt_hparams = list(lr = 0.05), 9 | steps = 100, 10 | plot_each_step = TRUE 11 | ) 12 | }) 13 | }) 14 | -------------------------------------------------------------------------------- /torchopt.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 4 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageCheckArgs: --as-cran 22 | --------------------------------------------------------------------------------