├── .clang-format
├── .gitignore
├── AUTHORS.md
├── CHANGELOG.md
├── CMakeLists.txt
├── LICENSE.md
├── README.md
├── config.cmake.in
├── doxygen
│   └── Doxyfile
├── examples
│   ├── CMakeLists.txt
│   ├── example-quadratic.cpp
│   ├── example-rosenbrock-box.cpp
│   ├── example-rosenbrock-bracketing.cpp
│   ├── example-rosenbrock-comparison.cpp
│   └── example-rosenbrock.cpp
└── include
    ├── LBFGS.h
    ├── LBFGSB.h
    └── LBFGSpp
        ├── BFGSMat.h
        ├── BKLDLT.h
        ├── Cauchy.h
        ├── LineSearchBacktracking.h
        ├── LineSearchBracketing.h
        ├── LineSearchMoreThuente.h
        ├── LineSearchNocedalWright.h
        ├── Param.h
        └── SubspaceMin.h

--------------------------------------------------------------------------------
/.clang-format:
--------------------------------------------------------------------------------
---
Language: Cpp
# BasedOnStyle: WebKit
AccessModifierOffset: -4
AlignAfterOpenBracket: Align
AlignArrayOfStructures: None
AlignConsecutiveMacros: false
AlignConsecutiveAssignments: false
AlignConsecutiveBitFields: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlines: Left
AlignOperands: false
AlignTrailingComments: true
AllowAllArgumentsOnNextLine: true
AllowAllConstructorInitializersOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortEnumsOnASingleLine: true
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: All
AllowShortLambdasOnASingleLine: All
AllowShortIfStatementsOnASingleLine: Never
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: false
AlwaysBreakTemplateDeclarations: MultiLine
AttributeMacros:
  - __capability
BinPackArguments: true
BinPackParameters: true
BraceWrapping:
  AfterCaseLabel: true
  AfterClass: true
  AfterControlStatement: true
  AfterEnum: true
  AfterFunction: true
  AfterNamespace: false
  AfterObjCDeclaration: false
  AfterStruct: true
  AfterUnion: true
  AfterExternBlock: false
  BeforeCatch: true
  BeforeElse: true
  BeforeLambdaBody: false
  BeforeWhile: false
  IndentBraces: false
  SplitEmptyFunction: false
  SplitEmptyRecord: false
  SplitEmptyNamespace: false
BreakBeforeBinaryOperators: None
BreakBeforeConceptDeclarations: true
BreakBeforeBraces: Custom
BreakBeforeInheritanceComma: false
BreakInheritanceList: AfterColon
BreakBeforeTernaryOperators: false
BreakConstructorInitializersBeforeComma: false
BreakConstructorInitializers: AfterColon
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: true
ColumnLimit: 0
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DeriveLineEnding: true
DerivePointerAlignment: true
DisableFormat: false
EmptyLineAfterAccessModifier: Never
EmptyLineBeforeAccessModifier: LogicalBlock
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
ForEachMacros:
  - foreach
  - Q_FOREACH
  - BOOST_FOREACH
IfMacros:
  - KJ_IF_MAYBE
IncludeBlocks: Preserve
IncludeCategories:
  - Regex: '^"(llvm|llvm-c|clang|clang-c)/'
    Priority: 2
    SortPriority: 0
    CaseSensitive: false
  - Regex: '^(<|"(gtest|gmock|isl|json)/)'
    Priority: 3
    SortPriority: 0
    CaseSensitive: false
  - Regex: '.*'
    Priority: 1
    SortPriority: 0
    CaseSensitive: false
IncludeIsMainRegex: '(Test)?$'
IncludeIsMainSourceRegex: ''
IndentAccessModifiers: false
IndentCaseLabels: false
IndentCaseBlocks: false
IndentGotoLabels: true
IndentPPDirectives: None
IndentExternBlock: AfterExternBlock
IndentRequires: false
IndentWidth: 4
IndentWrappedFunctionNames: false
InsertTrailingCommas: None
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
LambdaBodyIndentation: Signature
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: Inner
ObjCBinPackProtocolList: Auto
ObjCBlockIndentWidth: 4
ObjCBreakBeforeNestedBlockParam: true
ObjCSpaceAfterProperty: true
ObjCSpaceBeforeProtocolList: true
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 19
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 60
PenaltyIndentedWhitespace: 0
PointerAlignment: Left
PPIndentWidth: -1
ReferenceAlignment: Pointer
ReflowComments: true
ShortNamespaceLines: 1
SortIncludes: false
SortJavaStaticImport: Before
SortUsingDeclarations: false
SpaceAfterCStyleCast: true
SpaceAfterLogicalNot: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCaseColon: false
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceAroundPointerQualifiers: Default
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyBlock: false
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: Never
SpacesInConditionalStatement: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInLineCommentPrefix:
  Minimum: 1
  Maximum: -1
SpacesInParentheses: false
SpacesInSquareBrackets: false
SpaceBeforeSquareBrackets: false
BitFieldColonSpacing: Both
Standard: c++03
StatementAttributeLikeMacros:
  - Q_EMIT
StatementMacros:
  - Q_UNUSED
  - QT_REQUIRE_VERSION
TabWidth: 8
UseCRLF: false
UseTab: Never
WhitespaceSensitiveMacros:
  - STRINGIZE
  - PP_STRINGIZE
  - BOOST_PP_STRINGIZE
  - NS_SWIFT_NAME
  - CF_SWIFT_NAME
...

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.o
*.out
include/Eigen/*
archive/*
issues/*
.settings/*
.project
.cproject
/Debug/
/Release/

--------------------------------------------------------------------------------
/AUTHORS.md:
--------------------------------------------------------------------------------
The LBFGS++ library was adapted from the libLBFGS library
(https://github.com/chokkan/liblbfgs), written by
Naoaki Okazaki <>.

The files

- `include/LBFGSpp/LineSearchBracketing.h`
- `include/LBFGSpp/LineSearchNocedalWright.h`

were contributed by Dirk Toewe <>.

Other parts of LBFGS++ were written by Yixuan Qiu <>.

--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
## [0.4.0] - 2025-04-20

### Added

- Added functions `final_approx_hessian()` and `final_approx_inverse_hessian()` to `LBFGSSolver`
  to retrieve the final approximate Hessian information
  ([#42](https://github.com/yixuan/LBFGSpp/issues/42), [#43](https://github.com/yixuan/LBFGSpp/issues/43))
- Added CMake scripts to build examples ([#38](https://github.com/yixuan/LBFGSpp/pull/38)),
  contributed by [@pjknowles](https://github.com/pjknowles)



## [0.3.0] - 2023-09-06

### Added

- Added functions `final_grad()` and `final_grad_norm()` to `LBFGSSolver`
  and `LBFGSBSolver` to retrieve the final gradient information
  ([#12](https://github.com/yixuan/LBFGSpp/issues/12))

### Changed

- `LBFGS++` now requires C++11
- The line search classes now have a unified API for both `LBFGSSolver` and `LBFGSBSolver`
- The Moré-Thuente line search algorithm `LineSearchMoreThuente` can now also be used
  in the L-BFGS solver `LBFGSSolver`
- Improved the numerical stability of `LineSearchNocedalWright`
  ([#27](https://github.com/yixuan/LBFGSpp/issues/27))
- Removed the unused variable `dg_hi` in `LineSearchNocedalWright`
  ([#35](https://github.com/yixuan/LBFGSpp/issues/35))
- Fixed some compiler warnings regarding shadowed variables
  ([#36](https://github.com/yixuan/LBFGSpp/issues/36))



## [0.2.0] - 2022-05-20

### Added

- Added a CMake script for installation ([#24](https://github.com/yixuan/LBFGSpp/pull/24)),
  contributed by [@steinmig](https://github.com/steinmig)

### Changed

- The default line search method for `LBFGSSolver` has been changed from `LineSearchBacktracking`
  to `LineSearchNocedalWright`, per the suggestion of [@mpayrits](https://github.com/mpayrits)
  ([#25](https://github.com/yixuan/LBFGSpp/pull/25))
- Fixed a few critical issues ([#9](https://github.com/yixuan/LBFGSpp/issues/9),
  [#15](https://github.com/yixuan/LBFGSpp/issues/15),
  [#21](https://github.com/yixuan/LBFGSpp/issues/21)), with big thanks to
  [@mpayrits](https://github.com/mpayrits) ([#25](https://github.com/yixuan/LBFGSpp/pull/25))
- Fixed one inconsistency with Moré and Thuente (1994) in the `LineSearchMoreThuente`
  line search algorithm, pointed out by [@mpayrits](https://github.com/mpayrits)
  ([#23](https://github.com/yixuan/LBFGSpp/issues/23))
- The source code is now formatted using [Clang-Format](https://clang.llvm.org/docs/ClangFormat.html)



## [0.1.0] - 2021-08-19

### Added

- Initial Github release

--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
cmake_minimum_required(VERSION 3.10)
project(lbfgspp VERSION 0.4.0 LANGUAGES CXX)

# + ----------------- +
# | BUILDING SETTINGS |
# + ----------------- +

if(NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE Release)
endif(NOT CMAKE_BUILD_TYPE)

# + ----------------- +
# | COMPILATION FLAGS |
# + ----------------- +

include(CheckCXXCompilerFlag)
check_cxx_compiler_flag(-Wall WALL_SUPPORTED)
if(${WALL_SUPPORTED})
    add_compile_options(-Wall)
endif()

# + --------------- +
# | LBFGSpp LIBRARY |
# + --------------- +

add_library(lbfgspp INTERFACE)

# + -------- +
# | INCLUDES |
# + -------- +

target_include_directories(lbfgspp INTERFACE
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
    $<INSTALL_INTERFACE:include>
)

# + ----------------------- +
# | FIND EXTERNAL LIBRARIES |
# + ----------------------- +

find_package(Eigen3 3.0 REQUIRED)
target_link_libraries(lbfgspp INTERFACE Eigen3::Eigen)
message("-- Eigen3 version: " ${EIGEN3_VERSION_STRING})

# + ------------ +
# | INSTALLATION |
# + ------------ +

# Copy headers folder
install(
    DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/
    DESTINATION include
)
# Create an export set
install(TARGETS lbfgspp EXPORT lbfgsppTargets)

include(CMakePackageConfigHelpers)
# Version file
write_basic_package_version_file(
    ${CMAKE_CURRENT_BINARY_DIR}/lbfgspp-config-version.cmake
    VERSION ${PROJECT_VERSION}
    COMPATIBILITY AnyNewerVersion
)
# Config file
configure_package_config_file(
    ${CMAKE_CURRENT_SOURCE_DIR}/config.cmake.in
    ${CMAKE_CURRENT_BINARY_DIR}/lbfgspp-config.cmake
    INSTALL_DESTINATION lib/cmake/lbfgspp
)
# Targets files
export(
    EXPORT lbfgsppTargets
    FILE ${CMAKE_CURRENT_BINARY_DIR}/lbfgspp-targets.cmake
)
install(
    EXPORT lbfgsppTargets
    FILE lbfgspp-targets.cmake
    DESTINATION lib/cmake/lbfgspp
)
install(
    FILES
        ${CMAKE_CURRENT_BINARY_DIR}/lbfgspp-config.cmake
        ${CMAKE_CURRENT_BINARY_DIR}/lbfgspp-config-version.cmake
    DESTINATION lib/cmake/lbfgspp
)

add_subdirectory(examples)

--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
## The MIT License

Copyright (c) 1990 Jorge Nocedal

Copyright (c) 2007-2010 Naoaki Okazaki

Copyright (c) 2016-2023 Yixuan Qiu

Copyright (c) 2018-2023 Dirk Toewe

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# LBFGS++

> **UPDATE on 2020-03-06**: **LBFGS++** now includes a new L-BFGS-B solver for
> box-constrained optimization problems. Check the example below for its usage.

**LBFGS++** is a header-only C++ library that implements the Limited-memory
BFGS algorithm (L-BFGS) for unconstrained minimization problems, and a modified
version of the L-BFGS-B algorithm for box-constrained ones.

The code for the L-BFGS solver is derived and modified from the
[libLBFGS](https://github.com/chokkan/liblbfgs)
library developed by [Naoaki Okazaki](http://www.chokkan.org/).

**LBFGS++** is implemented as a header-only C++ library, whose only dependency,
[Eigen](http://eigen.tuxfamily.org/), is also header-only.

## A Quick Example

To use **LBFGS++**, one needs to first define a functor to represent the
multivariate function to be minimized. It should return the objective function
value at a vector `x` and overwrite the vector `grad` with the gradient
evaluated at `x`. For example, we could define the
[Rosenbrock function](https://en.wikipedia.org/wiki/Rosenbrock_function) in the
following way:

```cpp
#include <Eigen/Core>
#include <iostream>
#include <LBFGS.h>

using Eigen::VectorXd;
using namespace LBFGSpp;

class Rosenbrock
{
private:
    int n;
public:
    Rosenbrock(int n_) : n(n_) {}
    double operator()(const VectorXd& x, VectorXd& grad)
    {
        double fx = 0.0;
        for(int i = 0; i < n; i += 2)
        {
            double t1 = 1.0 - x[i];
            double t2 = 10 * (x[i + 1] - x[i] * x[i]);
            grad[i + 1] = 20 * t2;
            grad[i] = -2.0 * (x[i] * grad[i + 1] + t1);
            fx += t1 * t1 + t2 * t2;
        }
        return fx;
    }
};
```

Then we just need to set up the parameters, create a solver object,
provide an initial guess, and run the minimization function.

```cpp
int main()
{
    const int n = 10;
    // Set up parameters
    LBFGSParam<double> param;
    param.epsilon = 1e-6;
    param.max_iterations = 100;

    // Create solver and function object
    LBFGSSolver<double> solver(param);
    Rosenbrock fun(n);

    // Initial guess
    VectorXd x = VectorXd::Zero(n);
    // x will be overwritten to be the best point found
    double fx;
    int niter = solver.minimize(fun, x, fx);

    std::cout << niter << " iterations" << std::endl;
    std::cout << "x = \n" << x.transpose() << std::endl;
    std::cout << "f(x) = " << fx << std::endl;

    return 0;
}
```

The example can then be compiled and run.

```bash
$ g++ -I/path/to/eigen -I/path/to/lbfgspp/include -O2 example.cpp
$ ./a.out
23 iterations
x =
1 1 1 1 1 1 1 1 1 1
f(x) = 1.87948e-19
```

You can also use a different line search algorithm by providing a second template parameter
to `LBFGSSolver`. For example, the code below illustrates the bracketing line search algorithm
(contributed by [@DirkToewe](https://github.com/DirkToewe)).

```cpp
int main()
{
    const int n = 10;
    // Set up parameters
    LBFGSParam<double> param;
    param.epsilon = 1e-6;
    param.max_iterations = 100;

    // Create solver and function object
    // The second template parameter selects the line search algorithm
    LBFGSSolver<double, LineSearchBracketing> solver(param);
    Rosenbrock fun(n);

    // Initial guess
    VectorXd x = VectorXd::Zero(n);
    // x will be overwritten to be the best point found
    double fx;
    int niter = solver.minimize(fun, x, fx);

    std::cout << niter << " iterations" << std::endl;
    std::cout << "x = \n" << x.transpose() << std::endl;
    std::cout << "f(x) = " << fx << std::endl;

    return 0;
}
```

## Box-constrained Problem

If the parameters to be optimized have simple bounds, then the
L-BFGS-**B** solver class `LBFGSBSolver` can be used.
The code is very similar to that of `LBFGSSolver`. Below is the same Rosenbrock
example, but we require all variables to be between 2 and 4.

```cpp
#include <Eigen/Core>
#include <iostream>
#include <LBFGSB.h>  // Note the different header file

using Eigen::VectorXd;
using namespace LBFGSpp;

class Rosenbrock
{
private:
    int n;
public:
    Rosenbrock(int n_) : n(n_) {}
    double operator()(const VectorXd& x, VectorXd& grad)
    {
        double fx = 0.0;
        for(int i = 0; i < n; i += 2)
        {
            double t1 = 1.0 - x[i];
            double t2 = 10 * (x[i + 1] - x[i] * x[i]);
            grad[i + 1] = 20 * t2;
            grad[i] = -2.0 * (x[i] * grad[i + 1] + t1);
            fx += t1 * t1 + t2 * t2;
        }
        return fx;
    }
};

int main()
{
    const int n = 10;
    // Set up parameters
    LBFGSBParam<double> param;  // New parameter class
    param.epsilon = 1e-6;
    param.max_iterations = 100;

    // Create solver and function object
    LBFGSBSolver<double> solver(param);  // New solver class
    Rosenbrock fun(n);

    // Bounds
    VectorXd lb = VectorXd::Constant(n, 2.0);
    VectorXd ub = VectorXd::Constant(n, 4.0);

    // Initial guess
    VectorXd x = VectorXd::Constant(n, 3.0);

    // x will be overwritten to be the best point found
    double fx;
    int niter = solver.minimize(fun, x, fx, lb, ub);

    std::cout << niter << " iterations" << std::endl;
    std::cout << "x = \n" << x.transpose() << std::endl;
    std::cout << "f(x) = " << fx << std::endl;

    return 0;
}
```

Note that we also allow infinite values for the lower and upper bounds.
In such cases one can define `ub[i] = std::numeric_limits<double>::infinity()`,
for example. (A short sketch of one-sided bounds is given at the end of this
README.)

## Documentation

The [API reference](https://lbfgspp.statr.me/doc/) page contains the documentation
of **LBFGS++** generated by [Doxygen](https://www.doxygen.nl/).

## License

**LBFGS++** is an open source project under the MIT license.
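
## Appendix: One-sided Bounds

As noted in the box-constrained example above, a bound may be infinite on one
side. The snippet below is a minimal sketch, not an additional API: it reuses
the `Rosenbrock` functor, the `solver` object, and the dimension `n` from that
example, and keeps every variable above 2 while leaving it unbounded from above.

```cpp
// Lower bounds at 2, upper bounds at +infinity
// (std::numeric_limits comes from <limits>, if not already pulled in)
VectorXd lb = VectorXd::Constant(n, 2.0);
VectorXd ub = VectorXd::Constant(n, std::numeric_limits<double>::infinity());

VectorXd x = VectorXd::Constant(n, 3.0);  // feasible initial guess
double fx;
int niter = solver.minimize(fun, x, fx, lb, ub);
```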

--------------------------------------------------------------------------------
/config.cmake.in:
--------------------------------------------------------------------------------
# Dependencies
include(CMakeFindDependencyMacro)

find_dependency(Eigen3 3.0 REQUIRED)

include(${CMAKE_CURRENT_LIST_DIR}/lbfgspp-targets.cmake)

--------------------------------------------------------------------------------
/examples/CMakeLists.txt:
--------------------------------------------------------------------------------
foreach (source example-quadratic.cpp example-rosenbrock-box.cpp example-rosenbrock-bracketing.cpp example-rosenbrock-comparison.cpp example-rosenbrock.cpp)
    get_filename_component(example ${source} NAME_WLE)
    add_executable(${example} ${source})
    set_property(TARGET ${example} PROPERTY CXX_STANDARD 17)
    target_link_libraries(${example} PRIVATE lbfgspp Eigen3::Eigen)
endforeach ()

--------------------------------------------------------------------------------
/examples/example-quadratic.cpp:
--------------------------------------------------------------------------------
#include <Eigen/Core>
#include <iostream>
#include <LBFGS.h>

using Eigen::VectorXd;
using Eigen::MatrixXd;
using namespace LBFGSpp;

double foo(const VectorXd& x, VectorXd& grad)
{
    const int n = x.size();
    VectorXd d(n);
    for(int i = 0; i < n; i++)
        d[i] = i;

    double f = (x - d).squaredNorm();
    grad.noalias() = 2.0 * (x - d);
    return f;
}

int main()
{
    const int n = 10;
    LBFGSParam<double> param;
    LBFGSSolver<double> solver(param);

    VectorXd x = VectorXd::Zero(n);
    double fx;
    int niter = solver.minimize(foo, x, fx);

    std::cout << niter << " iterations" << std::endl;
    std::cout << "x = \n" << x.transpose() << std::endl;
    std::cout << "f(x) = " << fx << std::endl;

    return 0;
}

--------------------------------------------------------------------------------
/examples/example-rosenbrock-box.cpp:
--------------------------------------------------------------------------------
#include <Eigen/Core>
#include <iostream>
#include <LBFGSB.h>

using namespace LBFGSpp;

typedef double Scalar;
typedef Eigen::Matrix<Scalar, Eigen::Dynamic, 1> Vector;

// Example from the roptim R package
// f(x) = (x[0] - 1)^2 + 4 * (x[1] - x[0]^2)^2 + ...
//        + 4 * (x[end] - x[end - 1]^2)^2
class Rosenbrock
{
private:
    int n;
public:
    Rosenbrock(int n_) : n(n_) {}
    Scalar operator()(const Vector& x, Vector& grad)
    {
        Scalar fx = (x[0] - 1.0) * (x[0] - 1.0);
        grad[0] = 2 * (x[0] - 1) + 16 * (x[0] * x[0] - x[1]) * x[0];
        for(int i = 1; i < n; i++)
        {
            fx += 4 * std::pow(x[i] - x[i - 1] * x[i - 1], 2);
            if(i == n - 1)
            {
                grad[i] = 8 * (x[i] - x[i - 1] * x[i - 1]);
            } else {
                grad[i] = 8 * (x[i] - x[i - 1] * x[i - 1]) + 16 * (x[i] * x[i] - x[i + 1]) * x[i];
            }
        }
        return fx;
    }
};

int main()
{
    const int n = 25;
    LBFGSBParam<Scalar> param;
    LBFGSBSolver<Scalar> solver(param);
    Rosenbrock fun(n);

    // Variable bounds
    Vector lb = Vector::Constant(n, 2.0);
    Vector ub = Vector::Constant(n, 4.0);
    // The third variable is unbounded
    lb[2] = -std::numeric_limits<Scalar>::infinity();
    ub[2] = std::numeric_limits<Scalar>::infinity();
    // Initial values
    Vector x = Vector::Constant(n, 3.0);
    // Make some initial values at the bounds
    x[0] = x[1] = 2.0;
    x[5] = x[7] = 4.0;

    Scalar fx;
    int niter = solver.minimize(fun, x, fx, lb, ub);

    std::cout << niter << " iterations" << std::endl;
    std::cout << "x = \n" << x.transpose() << std::endl;
    std::cout << "f(x) = " << fx << std::endl;
    std::cout << "grad = " << solver.final_grad().transpose() << std::endl;
    std::cout << "projected grad norm = " << solver.final_grad_norm() << std::endl;

    return 0;
}

--------------------------------------------------------------------------------
/examples/example-rosenbrock-bracketing.cpp:
--------------------------------------------------------------------------------
#include <Eigen/Core>
#include <iostream>
#include <LBFGS.h>

using Eigen::VectorXd;
using Eigen::MatrixXd;
using namespace LBFGSpp;

class Rosenbrock
{
private:
    int n;
public:
    Rosenbrock(int n_) : n(n_) {}
    double operator()(const VectorXd& x, VectorXd& grad)
    {
        double fx = 0.0;
        for(int i = 0; i < n; i += 2)
        {
            double t1 = 1.0 - x[i];
            double t2 = 10 * (x[i + 1] - x[i] * x[i]);
            grad[i + 1] = 20 * t2;
            grad[i] = -2.0 * (x[i] * grad[i + 1] + t1);
            fx += t1 * t1 + t2 * t2;
        }
        assert( ! std::isnan(fx) );
        return fx;
    }
};

int main()
{
    LBFGSParam<double> param;
    LBFGSSolver<double, LineSearchBracketing> solver(param);

    for( int n=2; n <= 16; n += 2 )
    {
        std::cout << "n = " << n << std::endl;
        Rosenbrock fun(n);
        for( int test=0; test < 1024; test++ )
        {
            VectorXd x = VectorXd::Random(n);
            double fx;
            int niter = solver.minimize(fun, x, fx);

            assert( ( (x.array() - 1.0).abs() < 1e-4 ).all() );
        }
        std::cout << "Test passed!"
                  << std::endl << std::endl;
    }

    return 0;
}

--------------------------------------------------------------------------------
/examples/example-rosenbrock-comparison.cpp:
--------------------------------------------------------------------------------
#include <Eigen/Core>
#include <iostream>
#include <LBFGS.h>

using Eigen::VectorXd;
using Eigen::MatrixXd;
using namespace LBFGSpp;

class Rosenbrock
{
private:
    int n;
    ptrdiff_t ncalls;

public:
    Rosenbrock(int n_) : n(n_), ncalls(0) {}
    double operator()(const VectorXd& x, VectorXd& grad)
    {
        // std::cout << x << std::endl;
        ncalls += 1;

        double fx = 0.0;
        for(int i = 0; i < n; i += 2)
        {
            double t1 = 1.0 - x[i];
            double t2 = 10 * (x[i + 1] - x[i] * x[i]);
            grad[i + 1] = 20 * t2;
            grad[i] = -2.0 * (x[i] * grad[i + 1] + t1);
            fx += t1 * t1 + t2 * t2;
        }
        assert( ! std::isnan(fx) );
        return fx;
    }

    ptrdiff_t get_ncalls() const {
        return ncalls;
    }
};

int main()
{
    LBFGSParam<double> param;
    param.linesearch = LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE;
    param.max_linesearch = 256;

    LBFGSSolver<double, LineSearchBacktracking>  solver_backtrack(param);
    LBFGSSolver<double, LineSearchBracketing>    solver_bracket  (param);
    LBFGSSolver<double, LineSearchNocedalWright> solver_nocedal  (param);
    LBFGSSolver<double, LineSearchMoreThuente>   solver_more     (param);

    const int tests_per_n = 1024;

    for( int n=2; n <= 24; n += 2 )
    {
        std::cout << "n = " << n << std::endl;
        Rosenbrock fun_backtrack(n),
                   fun_bracket  (n),
                   fun_nocedal  (n),
                   fun_more     (n);
        int niter_backtrack = 0,
            niter_bracket   = 0,
            niter_nocedal   = 0,
            niter_more      = 0;
        for( int test=0; test < tests_per_n; test++ )
        {
            VectorXd x, x0 = VectorXd::Random(n);

            double fx;

            x = x0; niter_backtrack += solver_backtrack.minimize(fun_backtrack, x, fx); assert( ( (x.array() - 1.0).abs() < 1e-4 ).all() );
            x = x0; niter_bracket   += solver_bracket  .minimize(fun_bracket  , x, fx); assert( ( (x.array() - 1.0).abs() < 1e-4 ).all() );
            x = x0; niter_nocedal   += solver_nocedal  .minimize(fun_nocedal  , x, fx); assert( ( (x.array() - 1.0).abs() < 1e-4 ).all() );
            x = x0; niter_more      += solver_more     .minimize(fun_more     , x, fx); assert( ( (x.array() - 1.0).abs() < 1e-4 ).all() );
        }
        std::cout << "  Average #calls:" << std::endl;
        std::cout << "    LineSearchBacktracking : " << (fun_backtrack.get_ncalls() / tests_per_n) << " calls, " << (niter_backtrack / tests_per_n) << " iterations" << std::endl;
        std::cout << "    LineSearchBracketing   : " << (fun_bracket  .get_ncalls() / tests_per_n) << " calls, " << (niter_bracket   / tests_per_n) << " iterations" << std::endl;
        std::cout << "    LineSearchNocedalWright: " << (fun_nocedal  .get_ncalls() / tests_per_n) << " calls, " << (niter_nocedal   / tests_per_n) << " iterations" << std::endl;
        std::cout << "    LineSearchMoreThuente  : " << (fun_more     .get_ncalls() / tests_per_n) << " calls, " << (niter_more      / tests_per_n) << " iterations" << std::endl;
    }

    return 0;
}

--------------------------------------------------------------------------------
/examples/example-rosenbrock.cpp:
--------------------------------------------------------------------------------
#include <Eigen/Core>
#include <iostream>
#include <LBFGS.h>

using Eigen::VectorXf;
using Eigen::MatrixXf;
using namespace LBFGSpp;

class Rosenbrock
{
private:
    int n;
public:
    Rosenbrock(int n_) : n(n_) {}
    float operator()(const VectorXf& x, VectorXf& grad)
    {
        float fx = 0.0;
        for(int i = 0; i < n; i += 2)
        {
            float t1 = 1.0 - x[i];
            float t2 = 10 * (x[i + 1] - x[i] * x[i]);
            grad[i + 1] = 20 * t2;
            grad[i] = -2.0 * (x[i] * grad[i + 1] + t1);
            fx += t1 * t1 + t2 * t2;
        }
        return fx;
    }
};

int main()
{
    const int n = 10;
    LBFGSParam<float> param;
    LBFGSSolver<float> solver(param);
    Rosenbrock fun(n);

    VectorXf x = VectorXf::Zero(n);
    float fx;
    int niter = solver.minimize(fun, x, fx);

    std::cout << niter << " iterations" << std::endl;
    std::cout << "x = \n" << x.transpose() << std::endl;
    std::cout << "f(x) = " << fx << std::endl;
    std::cout << "grad = " << solver.final_grad().transpose() << std::endl;
    std::cout << "||grad|| = " << solver.final_grad_norm() << std::endl;
    std::cout << "approx_hess = \n" << solver.final_approx_hessian() << std::endl;
    std::cout << "approx_inv_hess = \n" << solver.final_approx_inverse_hessian() << std::endl;

    return 0;
}

--------------------------------------------------------------------------------
/include/LBFGS.h:
--------------------------------------------------------------------------------
// Copyright (C) 2016-2025 Yixuan Qiu
// Under MIT license

#ifndef LBFGSPP_LBFGS_H
#define LBFGSPP_LBFGS_H

#include <Eigen/Core>
#include "LBFGSpp/Param.h"
#include "LBFGSpp/BFGSMat.h"
#include "LBFGSpp/LineSearchBacktracking.h"
#include "LBFGSpp/LineSearchBracketing.h"
#include "LBFGSpp/LineSearchNocedalWright.h"
#include "LBFGSpp/LineSearchMoreThuente.h"

namespace LBFGSpp {

///
/// L-BFGS solver for unconstrained numerical optimization
///
template <typename Scalar,
          template <class> class LineSearch = LineSearchNocedalWright>
class LBFGSSolver
{
private:
    using Vector = Eigen::Matrix<Scalar, Eigen::Dynamic, 1>;
    using Matrix = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>;
    using MapVec = Eigen::Map<Vector>;

    const LBFGSParam<Scalar>& m_param;  // Parameters to control the LBFGS algorithm
    BFGSMat<Scalar> m_bfgs;             // Approximation to the Hessian matrix
    Vector m_fx;                        // History of the objective function values
    Vector m_xp;                        // Old x
    Vector m_grad;                      // New gradient
    Scalar m_gnorm;                     // Norm of the gradient
    Vector m_gradp;                     // Old gradient
    Vector m_drt;                       // Moving direction

    // Reset internal variables
    // n: dimension of the vector to be optimized
    inline void reset(int n)
    {
        const int m = m_param.m;
        m_bfgs.reset(n, m);
        m_xp.resize(n);
        m_grad.resize(n);
        m_gradp.resize(n);
        m_drt.resize(n);
        if (m_param.past > 0)
            m_fx.resize(m_param.past);
    }

public:
    ///
    /// Constructor for the L-BFGS solver.
    ///
    /// \param param An object of \ref LBFGSParam to store parameters for the
    ///        algorithm
    ///
    LBFGSSolver(const LBFGSParam<Scalar>& param) :
        m_param(param)
    {
        m_param.check_param();
    }

    ///
    /// Minimizing a multivariate function using the L-BFGS algorithm.
    /// Exceptions will be thrown if an error occurs.
    ///
    /// \param f  A function object such that `f(x, grad)` returns the
    ///           objective function value at `x`, and overwrites `grad` with
    ///           the gradient.
    /// \param x  In: An initial guess of the optimal point. Out: The best point
    ///           found.
    /// \param fx Out: The objective function value at `x`.
    ///
    /// \return Number of iterations used.
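    ///
    /// A typical call, shown as a sketch (it assumes a functor `fun` and a
    /// solver configured as in the README; the names are illustrative and
    /// not part of the API):
    /// \code
    /// LBFGSParam<double> param;
    /// LBFGSSolver<double> solver(param);
    /// Eigen::VectorXd x = Eigen::VectorXd::Zero(n);  // n = problem dimension
    /// double fx;                                     // receives f(x)
    /// int niter = solver.minimize(fun, x, fx);
    /// \endcode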
    ///
    template <typename Foo>
    inline int minimize(Foo& f, Vector& x, Scalar& fx)
    {
        using std::abs;

        // Dimension of the vector
        const int n = x.size();
        reset(n);

        // The length of lag for objective function value to test convergence
        const int fpast = m_param.past;

        // Evaluate function and compute gradient
        fx = f(x, m_grad);
        m_gnorm = m_grad.norm();
        if (fpast > 0)
            m_fx[0] = fx;

        // std::cout << "x0 = " << x.transpose() << std::endl;
        // std::cout << "f(x0) = " << fx << ", ||grad|| = " << m_gnorm << std::endl << std::endl;

        // Early exit if the initial x is already a minimizer
        if (m_gnorm <= m_param.epsilon || m_gnorm <= m_param.epsilon_rel * x.norm())
        {
            return 1;
        }

        // Initial direction
        m_drt.noalias() = -m_grad;
        // Initial step size
        Scalar step = Scalar(1) / m_drt.norm();
        // Tolerance for s'y >= eps * (y'y)
        constexpr Scalar eps = std::numeric_limits<Scalar>::epsilon();
        // s and y vectors
        Vector vecs(n), vecy(n);

        // Number of iterations used
        int k = 1;
        for (;;)
        {
            // std::cout << "Iter " << k << " begins" << std::endl << std::endl;

            // Save the current x and gradient
            m_xp.noalias() = x;
            m_gradp.noalias() = m_grad;
            Scalar dg = m_grad.dot(m_drt);
            const Scalar step_max = m_param.max_step;

            // Line search to update x, fx and gradient
            LineSearch<Scalar>::LineSearch(f, m_param, m_xp, m_drt, step_max, step, fx, m_grad, dg, x);

            // New gradient norm
            m_gnorm = m_grad.norm();

            // std::cout << "Iter " << k << " finished line search" << std::endl;
            // std::cout << "   x = " << x.transpose() << std::endl;
            // std::cout << "   f(x) = " << fx << ", ||grad|| = " << m_gnorm << std::endl << std::endl;

            // Convergence test -- gradient
            if (m_gnorm <= m_param.epsilon || m_gnorm <= m_param.epsilon_rel * x.norm())
            {
                return k;
            }
            // Convergence test -- objective function value
            if (fpast > 0)
            {
                const Scalar fxd = m_fx[k % fpast];
                if (k >= fpast && abs(fxd - fx) <= m_param.delta * std::max(std::max(abs(fx), abs(fxd)), Scalar(1)))
                    return k;

                m_fx[k % fpast] = fx;
            }
            // Maximum number of iterations
            if (m_param.max_iterations != 0 && k >= m_param.max_iterations)
            {
                return k;
            }

            // Update s and y
            // s_{k+1} = x_{k+1} - x_k
            // y_{k+1} = g_{k+1} - g_k
            vecs.noalias() = x - m_xp;
            vecy.noalias() = m_grad - m_gradp;
            if (vecs.dot(vecy) > eps * vecy.squaredNorm())
                m_bfgs.add_correction(vecs, vecy);

            // Recursive formula to compute d = -H * g
            m_bfgs.apply_Hv(m_grad, -Scalar(1), m_drt);

            // Reset step = 1.0 as initial guess for the next line search
            step = Scalar(1);
            k++;
        }

        return k;
    }

    ///
    /// Returning the gradient vector on the last iterate.
    /// Typically used to debug and test convergence.
    /// Should only be called after the `minimize()` function.
    ///
    /// \return A const reference to the gradient vector.
    ///
    const Vector& final_grad() const { return m_grad; }

    ///
    /// Returning the Euclidean norm of the final gradient.
    ///
    Scalar final_grad_norm() const { return m_gnorm; }

    ///
    /// Returning the approximate Hessian matrix on the last iterate.
    ///
    Matrix final_approx_hessian() const { return m_bfgs.get_Bmat(); }

    ///
    /// Returning the approximate inverse Hessian matrix on the last iterate.
    ///
    Matrix final_approx_inverse_hessian() const { return m_bfgs.get_Hmat(); }
};

}  // namespace LBFGSpp

#endif  // LBFGSPP_LBFGS_H

--------------------------------------------------------------------------------
/include/LBFGSB.h:
--------------------------------------------------------------------------------
// Copyright (C) 2020-2025 Yixuan Qiu
// Under MIT license

#ifndef LBFGSPP_LBFGSB_H
#define LBFGSPP_LBFGSB_H

#include <stdexcept>  // std::invalid_argument
#include <vector>
#include <Eigen/Core>
#include "LBFGSpp/Param.h"
#include "LBFGSpp/BFGSMat.h"
#include "LBFGSpp/Cauchy.h"
#include "LBFGSpp/SubspaceMin.h"
#include "LBFGSpp/LineSearchMoreThuente.h"

namespace LBFGSpp {

///
/// L-BFGS-B solver for box-constrained numerical optimization
///
template <typename Scalar,
          template <class> class LineSearch = LineSearchMoreThuente>
class LBFGSBSolver
{
private:
    using Vector = Eigen::Matrix<Scalar, Eigen::Dynamic, 1>;
    using Matrix = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>;
    using MapVec = Eigen::Map<Vector>;
    using IndexSet = std::vector<int>;

    const LBFGSBParam<Scalar>& m_param;  // Parameters to control the LBFGS algorithm
    BFGSMat<Scalar, true> m_bfgs;        // Approximation to the Hessian matrix
    Vector m_fx;                         // History of the objective function values
    Vector m_xp;                         // Old x
    Vector m_grad;                       // New gradient
    Scalar m_projgnorm;                  // Projected gradient norm
    Vector m_gradp;                      // Old gradient
    Vector m_drt;                        // Moving direction

    // Reset internal variables
    // n: dimension of the vector to be optimized
    inline void reset(int n)
    {
        const int m = m_param.m;
        m_bfgs.reset(n, m);
        m_xp.resize(n);
        m_grad.resize(n);
        m_gradp.resize(n);
        m_drt.resize(n);
        if (m_param.past > 0)
            m_fx.resize(m_param.past);
    }

    // Project the vector x to the bound constraint set
    static void force_bounds(Vector& x, const Vector& lb, const Vector& ub)
    {
        x.noalias() = x.cwiseMax(lb).cwiseMin(ub);
    }

    // Norm of the projected gradient
    // ||P(x-g, l, u) - x||_inf
    static Scalar proj_grad_norm(const Vector& x, const Vector& g, const Vector& lb, const Vector& ub)
    {
        return ((x - g).cwiseMax(lb).cwiseMin(ub) - x).cwiseAbs().maxCoeff();
    }

    // The maximum step size alpha such that x0 + alpha * d stays within the bounds
    static Scalar max_step_size(const Vector& x0, const Vector& drt, const Vector& lb, const Vector& ub)
    {
        const int n = x0.size();
        Scalar step = std::numeric_limits<Scalar>::infinity();

        for (int i = 0; i < n; i++)
        {
            if (drt[i] > Scalar(0))
            {
                step = std::min(step, (ub[i] - x0[i]) / drt[i]);
            }
            else if (drt[i] < Scalar(0))
            {
                step = std::min(step, (lb[i] - x0[i]) / drt[i]);
            }
        }

        return step;
    }

public:
    ///
    /// Constructor for the L-BFGS-B solver.
    ///
    /// \param param An object of \ref LBFGSBParam to store parameters for the
    ///        algorithm
    ///
    LBFGSBSolver(const LBFGSBParam<Scalar>& param) :
        m_param(param)
    {
        m_param.check_param();
    }

    ///
    /// Minimizing a multivariate function subject to box constraints, using the L-BFGS-B algorithm.
    /// Exceptions will be thrown if an error occurs.
    ///
    /// \param f  A function object such that `f(x, grad)` returns the
    ///           objective function value at `x`, and overwrites `grad` with
    ///           the gradient.
    /// \param x  In: An initial guess of the optimal point. Out: The best point
    ///           found.
    /// \param fx Out: The objective function value at `x`.
    /// \param lb Lower bounds for `x`.
    /// \param ub Upper bounds for `x`.
    ///
    /// \return Number of iterations used.
    ///
    template <typename Foo>
    inline int minimize(Foo& f, Vector& x, Scalar& fx, const Vector& lb, const Vector& ub)
    {
        using std::abs;

        // Dimension of the vector
        const int n = x.size();
        if (lb.size() != n || ub.size() != n)
            throw std::invalid_argument("'lb' and 'ub' must have the same size as 'x'");

        // Check whether the initial vector is within the bounds
        // If not, project to the feasible set
        force_bounds(x, lb, ub);

        // Initialization
        reset(n);

        // The length of lag for objective function value to test convergence
        const int fpast = m_param.past;

        // Evaluate function and compute gradient
        fx = f(x, m_grad);
        m_projgnorm = proj_grad_norm(x, m_grad, lb, ub);
        if (fpast > 0)
            m_fx[0] = fx;

        // std::cout << "x0 = " << x.transpose() << std::endl;
        // std::cout << "f(x0) = " << fx << ", ||proj_grad|| = " << m_projgnorm << std::endl << std::endl;

        // Early exit if the initial x is already a minimizer
        if (m_projgnorm <= m_param.epsilon || m_projgnorm <= m_param.epsilon_rel * x.norm())
        {
            return 1;
        }

        // Compute generalized Cauchy point
        Vector xcp(n), vecc;
        IndexSet newact_set, fv_set;
        Cauchy<Scalar>::get_cauchy_point(m_bfgs, x, m_grad, lb, ub, xcp, vecc, newact_set, fv_set);

        /* Vector gcp(n);
        Scalar fcp = f(xcp, gcp);
        Scalar projgcpnorm = proj_grad_norm(xcp, gcp, lb, ub);
        std::cout << "xcp = " << xcp.transpose() << std::endl;
        std::cout << "f(xcp) = " << fcp << ", ||proj_grad|| = " << projgcpnorm << std::endl << std::endl; */

        // Initial direction
        m_drt.noalias() = xcp - x;
        m_drt.normalize();
        // Tolerance for s'y >= eps * (y'y)
        constexpr Scalar eps = std::numeric_limits<Scalar>::epsilon();
        // s and y vectors
        Vector vecs(n), vecy(n);
        // Number of iterations used
        int k = 1;
        for (;;)
        {
            // Save the current x and gradient
            m_xp.noalias() = x;
            m_gradp.noalias() = m_grad;
            Scalar dg = m_grad.dot(m_drt);

            // Maximum step size to make x feasible
            Scalar step_max = max_step_size(x, m_drt, lb, ub);

            // In some cases, the direction returned by the subspace minimization procedure
            // in the previous iteration is pathological, leading to issues such as
            // step_max~=0 and dg>=0. If this happens, we use xcp-x as the search direction,
            // and reset the BFGS matrix. This is because xsm (the subspace minimizer)
            // heavily depends on the BFGS matrix.
            // If xsm is corrupted, then we may suspect there is something wrong
            // in the BFGS matrix, and it is safer to reset the matrix.
            // In contrast, xcp is obtained from a line search, which tends to be more robust
            if (dg >= Scalar(0) || step_max <= m_param.min_step)
            {
                // Reset search direction
                m_drt.noalias() = xcp - x;
                // Reset BFGS matrix
                m_bfgs.reset(n, m_param.m);
                // Recompute dg and step_max
                dg = m_grad.dot(m_drt);
                step_max = max_step_size(x, m_drt, lb, ub);
            }

            // Line search to update x, fx and gradient
            step_max = std::min(m_param.max_step, step_max);
            Scalar step = Scalar(1);
            step = std::min(step, step_max);
            LineSearch<Scalar>::LineSearch(f, m_param, m_xp, m_drt, step_max, step, fx, m_grad, dg, x);

            // New projected gradient norm
            m_projgnorm = proj_grad_norm(x, m_grad, lb, ub);

            /* std::cout << "** Iteration " << k << std::endl;
            std::cout << "   x = " << x.transpose() << std::endl;
            std::cout << "   f(x) = " << fx << ", ||proj_grad|| = " << m_projgnorm << std::endl << std::endl; */

            // Convergence test -- gradient
            if (m_projgnorm <= m_param.epsilon || m_projgnorm <= m_param.epsilon_rel * x.norm())
            {
                return k;
            }
            // Convergence test -- objective function value
            if (fpast > 0)
            {
                const Scalar fxd = m_fx[k % fpast];
                if (k >= fpast && abs(fxd - fx) <= m_param.delta * std::max(std::max(abs(fx), abs(fxd)), Scalar(1)))
                    return k;

                m_fx[k % fpast] = fx;
            }
            // Maximum number of iterations
            if (m_param.max_iterations != 0 && k >= m_param.max_iterations)
            {
                return k;
            }

            // Update s and y
            // s_{k+1} = x_{k+1} - x_k
            // y_{k+1} = g_{k+1} - g_k
            vecs.noalias() = x - m_xp;
            vecy.noalias() = m_grad - m_gradp;
            if (vecs.dot(vecy) > eps * vecy.squaredNorm())
                m_bfgs.add_correction(vecs, vecy);

            force_bounds(x, lb, ub);
            Cauchy<Scalar>::get_cauchy_point(m_bfgs, x, m_grad, lb, ub, xcp, vecc, newact_set, fv_set);

            /*Vector gcp(n);
            Scalar fcp = f(xcp, gcp);
            Scalar projgcpnorm = proj_grad_norm(xcp, gcp, lb, ub);
            std::cout << "xcp = " << xcp.transpose() << std::endl;
            std::cout << "f(xcp) = " << fcp << ", ||proj_grad|| = " << projgcpnorm << std::endl << std::endl;*/

            SubspaceMin<Scalar>::subspace_minimize(m_bfgs, x, xcp, m_grad, lb, ub,
                                                   vecc, newact_set, fv_set, m_param.max_submin, m_drt);

            /*Vector gsm(n);
            Scalar fsm = f(x + m_drt, gsm);
            Scalar projgsmnorm = proj_grad_norm(x + m_drt, gsm, lb, ub);
            std::cout << "xsm = " << (x + m_drt).transpose() << std::endl;
            std::cout << "f(xsm) = " << fsm << ", ||proj_grad|| = " << projgsmnorm << std::endl << std::endl;*/

            k++;
        }

        return k;
    }

    ///
    /// Returning the gradient vector on the last iterate.
    /// Typically used to debug and test convergence.
    /// Should only be called after the `minimize()` function.
    ///
    /// \return A const reference to the gradient vector.
    ///
    const Vector& final_grad() const { return m_grad; }

    ///
    /// Returning the infinity norm of the final projected gradient.
    /// The projected gradient is defined as \f$P(x-g,l,u)-x\f$, where \f$P(v,l,u)\f$ stands for
    /// the projection of a vector \f$v\f$ onto the box specified by the lower bound vector \f$l\f$ and
    /// upper bound vector \f$u\f$.
    ///
    Scalar final_grad_norm() const { return m_projgnorm; }
};

}  // namespace LBFGSpp

#endif  // LBFGSPP_LBFGSB_H

--------------------------------------------------------------------------------
/include/LBFGSpp/BFGSMat.h:
--------------------------------------------------------------------------------
// Copyright (C) 2020-2025 Yixuan Qiu
// Under MIT license

#ifndef LBFGSPP_BFGS_MAT_H
#define LBFGSPP_BFGS_MAT_H

#include <vector>
#include <Eigen/Core>
#include <Eigen/LU>
#include "BKLDLT.h"

/// \cond

namespace LBFGSpp {

//
// An *implicit* representation of the BFGS approximation to the Hessian matrix
//
// B = theta * I - W * M * W' -- approximation to Hessian matrix, see [2]
// H = inv(B)                 -- approximation to inverse Hessian matrix, see [2]
//
// Reference:
// [1] D. C. Liu and J. Nocedal (1989). On the limited memory BFGS method for large scale optimization.
// [2] R. H. Byrd, P. Lu, and J. Nocedal (1995). A limited memory algorithm for bound constrained optimization.
//
template <typename Scalar, bool LBFGSB = false>
class BFGSMat
{
private:
    using Vector = Eigen::Matrix<Scalar, Eigen::Dynamic, 1>;
    using Matrix = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>;
    using RefConstVec = Eigen::Ref<const Vector>;
    using IndexSet = std::vector<int>;

    int m_m;         // Maximum number of correction vectors
    Scalar m_theta;  // theta * I is the initial approximation to the Hessian matrix
    Matrix m_s;      // History of the s vectors
    Matrix m_y;      // History of the y vectors
    Vector m_ys;     // History of the s'y values
    Vector m_alpha;  // Temporary values used in computing H * v
    int m_ncorr;     // Number of correction vectors in the history, m_ncorr <= m
    int m_ptr;       // A pointer to locate the most recent history, 1 <= m_ptr <= m
                     // Details: s and y vectors are stored in cyclic order.
                     // For example, if the current s-vector is stored in m_s[, m-1],
                     // then in the next iteration m_s[, 0] will be overwritten.
                     // m_s[, m_ptr-1] points to the most recent history (if ncorr > 0),
                     // and m_s[, m_ptr % m] points to the location that will be
                     // overwritten next time.
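                     //
                     // A concrete trace, added for illustration only (a
                     // hypothetical run, not data kept by the class): with
                     // m = 3, after the corrections s0, ..., s4 have been
                     // added, the columns of m_s hold [s3, s4, s2], with
                     // m_ncorr = 3 and m_ptr = 2. Hence m_s.col(m_ptr - 1) = s4
                     // is the most recent vector, and m_s.col(m_ptr % m) = s2
                     // is the one overwritten by the next add_correction().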

    //========== The following members are only used in L-BFGS-B algorithm ==========//
    Matrix m_permMinv;             // Permutated M inverse
    BKLDLT<Scalar> m_permMsolver;  // Represents the permutated M matrix

public:
    // Constructor
    BFGSMat() {}

    // Reset internal variables
    // n: dimension of the vector to be optimized
    // m: maximum number of corrections to approximate the Hessian matrix
    inline void reset(int n, int m)
    {
        m_m = m;
        m_theta = Scalar(1);
        m_s.resize(n, m);
        m_y.resize(n, m);
        m_ys.resize(m);
        m_alpha.resize(m);
        m_ncorr = 0;
        m_ptr = m;  // This makes sure that m_ptr % m == 0 in the first step

        if (LBFGSB)
        {
            m_permMinv.resize(2 * m, 2 * m);
            m_permMinv.setZero();
            m_permMinv.diagonal().setOnes();
        }
    }

    // Add correction vectors to the BFGS matrix
    inline void add_correction(const RefConstVec& s, const RefConstVec& y)
    {
        const int loc = m_ptr % m_m;

        m_s.col(loc).noalias() = s;
        m_y.col(loc).noalias() = y;

        // ys = y's = 1/rho
        const Scalar ys = m_s.col(loc).dot(m_y.col(loc));
        m_ys[loc] = ys;

        m_theta = m_y.col(loc).squaredNorm() / ys;

        if (m_ncorr < m_m)
            m_ncorr++;

        m_ptr = loc + 1;

        if (LBFGSB)
        {
            // Minv = [-D  L'        ]
            //        [ L  theta*S'S ]

            // Copy -D
            // Let S=[s[0], ..., s[m-1]], Y=[y[0], ..., y[m-1]]
            // D = [s[0]'y[0], ..., s[m-1]'y[m-1]]
            m_permMinv(loc, loc) = -ys;

            // Update S'S
            // We only store S'S in Minv, and multiply theta when the LDLT decomposition is performed
            Vector Ss = m_s.leftCols(m_ncorr).transpose() * m_s.col(loc);
            m_permMinv.block(m_m + loc, m_m, 1, m_ncorr).noalias() = Ss.transpose();
            m_permMinv.block(m_m, m_m + loc, m_ncorr, 1).noalias() = Ss;

            // Compute L
            // L = [          0                                     ]
            //     [  s[1]'y[0]            0                        ]
            //     [  s[2]'y[0]    s[2]'y[1]                        ]
            //     ...
            //     [s[m-1]'y[0] ... ... ... ... ... s[m-1]'y[m-2]  0]
            //
            // L_next = [        0                                 ]
            //          [s[2]'y[1]          0                      ]
            //          [s[3]'y[1]  s[3]'y[2]                      ]
            //          ...
            //          [s[m]'y[1] ... ... ... ... s[m]'y[m-1]  0  ]
            const int len = m_ncorr - 1;
            // First zero out the column of oldest y
            if (m_ncorr >= m_m)
                m_permMinv.block(m_m, loc, m_m, 1).setZero();
            // Compute the row associated with new s
            // The current row is loc
            // End with column (loc + m - 1) % m
            // Length is len
            int yloc = (loc + m_m - 1) % m_m;
            for (int i = 0; i < len; i++)
            {
                m_permMinv(m_m + loc, yloc) = m_s.col(loc).dot(m_y.col(yloc));
                yloc = (yloc + m_m - 1) % m_m;
            }

            // Matrix LDLT factorization
            m_permMinv.block(m_m, m_m, m_m, m_m) *= m_theta;
            m_permMsolver.compute(m_permMinv);
            m_permMinv.block(m_m, m_m, m_m, m_m) /= m_theta;
        }
    }

    // Explicitly form the B matrix
    inline Matrix get_Bmat() const
    {
        // Initial approximation theta * I
        const int n = m_s.rows();
        Matrix B = m_theta * Matrix::Identity(n, n);
        if (m_ncorr < 1)
            return B;

        // Construct W matrix, W = [Y, theta * S]
        // Y = [y0, y1, ..., yc]
        // S = [s0, s1, ..., sc]
        // We first set W = [Y, S], since later we still need Y and S matrices
        // After computing Minv, we rescale the S part in W
        Matrix W(n, 2 * m_ncorr);
        // r = m_ptr - 1 points to the most recent element,
        // (r + 1) % m_ncorr points to the oldest element
        int j = m_ptr % m_ncorr;
        for (int i = 0; i < m_ncorr; i++)
        {
            W.col(i).noalias() = m_y.col(j);
            W.col(m_ncorr + i).noalias() = m_s.col(j);
            j = (j + 1) % m_m;
        }
        // Now Y = W[:, :c], S = W[:, c:]

        // Construct Minv matrix, Minv = [-D  L'          ]
        //                               [ L  theta * S'S ]

        // D = diag(y0's0, ..., yc'sc)
        Matrix Minv(2 * m_ncorr, 2 * m_ncorr);
        Minv.topLeftCorner(m_ncorr, m_ncorr).setZero();
        Vector ys = W.leftCols(m_ncorr).cwiseProduct(W.rightCols(m_ncorr)).colwise().sum().transpose();
        Minv.diagonal().head(m_ncorr).noalias() = -ys;
        // L = [          0                                     ]
        //     [  s[1]'y[0]            0                        ]
        //     [  s[2]'y[0]    s[2]'y[1]                        ]
        //     ...
        //     [s[c-1]'y[0] ... ... ... ... ... s[c-1]'y[c-2]  0]
        Minv.bottomLeftCorner(m_ncorr, m_ncorr).setZero();
        for (int i = 0; i < m_ncorr - 1; i++)
        {
            // Number of terms for this column
            const int nterm = m_ncorr - i - 1;
            // S[:, -nterm:]'Y[:, j]
            Minv.col(i).tail(nterm).noalias() = W.rightCols(nterm).transpose() * W.col(i);
        }
        // The symmetric block
        Minv.topRightCorner(m_ncorr, m_ncorr).noalias() = Minv.bottomLeftCorner(m_ncorr, m_ncorr).transpose();
        // theta * S'S
        Minv.bottomRightCorner(m_ncorr, m_ncorr).noalias() = m_theta * W.rightCols(m_ncorr).transpose() * W.rightCols(m_ncorr);

        // Set the true W matrix
        W.rightCols(m_ncorr).array() *= m_theta;

        // Compute B = theta * I - W * M * W'
        Eigen::PartialPivLU<Matrix> M_solver(Minv);
        B.noalias() -= W * M_solver.solve(W.transpose());
        return B;
    }

    // Explicitly form the H matrix
    inline Matrix get_Hmat() const
    {
        // Initial approximation 1/theta * I
        const int n = m_s.rows();
        Matrix H = (Scalar(1) / m_theta) * Matrix::Identity(n, n);
        if (m_ncorr < 1)
            return H;

        // Construct W matrix, W = [1/theta * Y, S]
        // Y = [y0, y1, ..., yc]
        // S = [s0, s1, ..., sc]
        // We first set W = [Y, S], since later we still need Y and S matrices
        // After computing M, we rescale the Y part in W
        Matrix W(n, 2 * m_ncorr);
        // p = m_ptr - 1 points to the most recent element,
        // (p + 1) % m_ncorr points to the oldest element
        int j = m_ptr % m_ncorr;
        for (int i = 0; i < m_ncorr; i++)
        {
            W.col(i).noalias() = m_y.col(j);
            W.col(m_ncorr + i).noalias() = m_s.col(j);
            j = (j + 1) % m_m;
        }
        // Now Y = W[:, :c], S = W[:, c:]

        // Construct M matrix, M = [ 0        -inv(R)                           ]
        //                         [-inv(R)'   inv(R)'(D + 1/theta * Y'Y)inv(R) ]
        // D = diag(y0's0, ..., yc'sc)
        Matrix M(2 * m_ncorr, 2 * m_ncorr);
        // First use M[:c, :c] to store R
        // R = [s[0]'y[0]  s[0]'y[1] ... s[0]'y[c-1]   ]
        //     [        0  s[1]'y[1] ... s[1]'y[c-1]   ]
        //     ...
        //     [        0          0 ... s[c-1]'y[c-1] ]
        for (int i = 0; i < m_ncorr; i++)
        {
            M.col(i).head(i + 1).noalias() = W.middleCols(m_ncorr, i + 1).transpose() * W.col(i);
        }
        // Compute inv(R)
        Matrix Rinv = M.topLeftCorner(m_ncorr, m_ncorr).template triangularView<Eigen::Upper>().solve(Matrix::Identity(m_ncorr, m_ncorr));
        // Zero out the top left block
        M.topLeftCorner(m_ncorr, m_ncorr).setZero();
        // Set the top right block
        M.topRightCorner(m_ncorr, m_ncorr).noalias() = -Rinv;
        // The symmetric block
        M.bottomLeftCorner(m_ncorr, m_ncorr).noalias() = -Rinv.transpose();
        // 1/theta * Y'Y
        Matrix block = (Scalar(1) / m_theta) * W.leftCols(m_ncorr).transpose() * W.leftCols(m_ncorr);
        // D + 1/theta * Y'Y
        Vector ys = W.leftCols(m_ncorr).cwiseProduct(W.rightCols(m_ncorr)).colwise().sum().transpose();
        block.diagonal().array() += ys.array();
        // The bottom right block
        M.bottomRightCorner(m_ncorr, m_ncorr).noalias() = Rinv.transpose() * block * Rinv;

        // Set the true W matrix
        W.leftCols(m_ncorr).array() *= (Scalar(1) / m_theta);

        // Compute H = 1/theta * I + W * M * W'
        H.noalias() += W * M * W.transpose();
        return H;
    }

    // Recursive formula to compute a * H * v, where a is a scalar, and v is [n x 1]
    // H0 = (1/theta) * I is the initial approximation to H
    // Algorithm 7.4 of Nocedal, J., & Wright, S. (2006). Numerical optimization.
    inline void apply_Hv(const Vector& v, const Scalar& a, Vector& res)
    {
        res.resize(v.size());

        // L-BFGS two-loop recursion

        // Loop 1
        res.noalias() = a * v;
        int j = m_ptr % m_m;
        for (int i = 0; i < m_ncorr; i++)
        {
            j = (j + m_m - 1) % m_m;
            m_alpha[j] = m_s.col(j).dot(res) / m_ys[j];
            res.noalias() -= m_alpha[j] * m_y.col(j);
        }

        // Apply initial H0
        res /= m_theta;

        // Loop 2
        for (int i = 0; i < m_ncorr; i++)
        {
            const Scalar beta = m_y.col(j).dot(res) / m_ys[j];
            res.noalias() += (m_alpha[j] - beta) * m_s.col(j);
            j = (j + 1) % m_m;
        }
    }

    //========== The following functions are only used in L-BFGS-B algorithm ==========//

    // Return the value of theta
    inline Scalar theta() const { return m_theta; }

    // Return current number of correction vectors
    inline int num_corrections() const { return m_ncorr; }

    // W = [Y, theta * S]
    // W [n x (2*ncorr)], v [n x 1], res [(2*ncorr) x 1]
    // res preserves the ordering of Y and S columns
    inline void apply_Wtv(const Vector& v, Vector& res) const
    {
        res.resize(2 * m_ncorr);
        res.head(m_ncorr).noalias() = m_y.leftCols(m_ncorr).transpose() * v;
        res.tail(m_ncorr).noalias() = m_theta * m_s.leftCols(m_ncorr).transpose() * v;
    }

    // The b-th row of the W matrix
    // Preserves the ordering of Y and S columns
    // Return as a column vector
    inline Vector Wb(int b) const
    {
        Vector res(2 * m_ncorr);
        for (int j = 0; j < m_ncorr; j++)
        {
            res[j] = m_y(b, j);
            res[m_ncorr + j] = m_s(b, j);
        }
        res.tail(m_ncorr) *= m_theta;
        return res;
    }

    // Extract rows of W
    inline Matrix Wb(const IndexSet& b) const
    {
        const int nb = b.size();
        const int* bptr = b.data();
        Matrix res(nb, 2 * m_ncorr);

        for (int j = 0; j < m_ncorr; j++)
        {
            const Scalar* Yptr = &m_y(0, j);
&m_y(0, j); 347 | const Scalar* Sptr = &m_s(0, j); 348 | Scalar* resYptr = res.data() + j * nb; 349 | Scalar* resSptr = resYptr + m_ncorr * nb; 350 | for (int i = 0; i < nb; i++) 351 | { 352 | const int row = bptr[i]; 353 | resYptr[i] = Yptr[row]; 354 | resSptr[i] = Sptr[row]; 355 | } 356 | } 357 | return res; 358 | } 359 | 360 | // M is [(2*ncorr) x (2*ncorr)], v is [(2*ncorr) x 1] 361 | inline void apply_Mv(const Vector& v, Vector& res) const 362 | { 363 | res.resize(2 * m_ncorr); 364 | if (m_ncorr < 1) 365 | return; 366 | 367 | Vector vpadding = Vector::Zero(2 * m_m); 368 | vpadding.head(m_ncorr).noalias() = v.head(m_ncorr); 369 | vpadding.segment(m_m, m_ncorr).noalias() = v.tail(m_ncorr); 370 | 371 | // Solve linear equation 372 | m_permMsolver.solve_inplace(vpadding); 373 | 374 | res.head(m_ncorr).noalias() = vpadding.head(m_ncorr); 375 | res.tail(m_ncorr).noalias() = vpadding.segment(m_m, m_ncorr); 376 | } 377 | 378 | // Compute W'Pv 379 | // W [n x (2*ncorr)], v [nP x 1], res [(2*ncorr) x 1] 380 | // res preserves the ordering of Y and S columns 381 | // Returns false if the result is known to be zero 382 | inline bool apply_WtPv(const IndexSet& P_set, const Vector& v, Vector& res, bool test_zero = false) const 383 | { 384 | const int* Pptr = P_set.data(); 385 | const Scalar* vptr = v.data(); 386 | int nP = P_set.size(); 387 | 388 | // Remove zeros in v to save computation 389 | IndexSet P_reduced; 390 | std::vector v_reduced; 391 | if (test_zero) 392 | { 393 | P_reduced.reserve(nP); 394 | for (int i = 0; i < nP; i++) 395 | { 396 | if (vptr[i] != Scalar(0)) 397 | { 398 | P_reduced.push_back(Pptr[i]); 399 | v_reduced.push_back(vptr[i]); 400 | } 401 | } 402 | Pptr = P_reduced.data(); 403 | vptr = v_reduced.data(); 404 | nP = P_reduced.size(); 405 | } 406 | 407 | res.resize(2 * m_ncorr); 408 | if (m_ncorr < 1 || nP < 1) 409 | { 410 | res.setZero(); 411 | return false; 412 | } 413 | 414 | for (int j = 0; j < m_ncorr; j++) 415 | { 416 | Scalar resy = Scalar(0), ress = Scalar(0); 417 | const Scalar* yptr = &m_y(0, j); 418 | const Scalar* sptr = &m_s(0, j); 419 | for (int i = 0; i < nP; i++) 420 | { 421 | const int row = Pptr[i]; 422 | resy += yptr[row] * vptr[i]; 423 | ress += sptr[row] * vptr[i]; 424 | } 425 | res[j] = resy; 426 | res[m_ncorr + j] = ress; 427 | } 428 | res.tail(m_ncorr) *= m_theta; 429 | return true; 430 | } 431 | 432 | // Compute s * P'WMv 433 | // Assume that v[2*ncorr x 1] has the same ordering (permutation) as W and M 434 | // Returns false if the result is known to be zero 435 | inline bool apply_PtWMv(const IndexSet& P_set, const Vector& v, Vector& res, const Scalar& scale) const 436 | { 437 | const int nP = P_set.size(); 438 | res.resize(nP); 439 | res.setZero(); 440 | if (m_ncorr < 1 || nP < 1) 441 | return false; 442 | 443 | Vector Mv; 444 | apply_Mv(v, Mv); 445 | // WP * Mv 446 | Mv.tail(m_ncorr) *= m_theta; 447 | for (int j = 0; j < m_ncorr; j++) 448 | { 449 | const Scalar* yptr = &m_y(0, j); 450 | const Scalar* sptr = &m_s(0, j); 451 | const Scalar Mvy = Mv[j], Mvs = Mv[m_ncorr + j]; 452 | for (int i = 0; i < nP; i++) 453 | { 454 | const int row = P_set[i]; 455 | res[i] += Mvy * yptr[row] + Mvs * sptr[row]; 456 | } 457 | } 458 | res *= scale; 459 | return true; 460 | } 461 | // If the P'W matrix has been explicitly formed, do a direct matrix multiplication 462 | inline bool apply_PtWMv(const Matrix& WP, const Vector& v, Vector& res, const Scalar& scale) const 463 | { 464 | const int nP = WP.rows(); 465 | res.resize(nP); 466 | if (m_ncorr < 1 || nP < 1) 467 | 
{ 468 | res.setZero(); 469 | return false; 470 | } 471 | 472 | Vector Mv; 473 | apply_Mv(v, Mv); 474 | // WP * Mv 475 | Mv.tail(m_ncorr) *= m_theta; 476 | res.noalias() = scale * (WP * Mv); 477 | return true; 478 | } 479 | 480 | // Compute F'BAb = -(F'W)M(W'AA'd) 481 | // W'd is known, and AA'+FF'=I, so W'AA'd = W'd - W'FF'd 482 | // Usually d contains many zeros, so we fist compute number of nonzero elements in A set and F set, 483 | // denoted as nnz_act and nnz_fv, respectively 484 | // If nnz_act is smaller, compute W'AA'd = WA' (A'd) directly 485 | // If nnz_fv is smaller, compute W'AA'd = W'd - WF' * (F'd) 486 | inline void compute_FtBAb( 487 | const Matrix& WF, const IndexSet& fv_set, const IndexSet& newact_set, const Vector& Wd, const Vector& drt, 488 | Vector& res) const 489 | { 490 | const int nact = newact_set.size(); 491 | const int nfree = WF.rows(); 492 | res.resize(nfree); 493 | if (m_ncorr < 1 || nact < 1 || nfree < 1) 494 | { 495 | res.setZero(); 496 | return; 497 | } 498 | 499 | // W'AA'd 500 | Vector rhs(2 * m_ncorr); 501 | if (nact <= nfree) 502 | { 503 | // Construct A'd 504 | Vector Ad(nfree); 505 | for (int i = 0; i < nact; i++) 506 | Ad[i] = drt[newact_set[i]]; 507 | apply_WtPv(newact_set, Ad, rhs); 508 | } 509 | else 510 | { 511 | // Construct F'd 512 | Vector Fd(nfree); 513 | for (int i = 0; i < nfree; i++) 514 | Fd[i] = drt[fv_set[i]]; 515 | // Compute W'AA'd = W'd - WF' * (F'd) 516 | rhs.noalias() = WF.transpose() * Fd; 517 | rhs.tail(m_ncorr) *= m_theta; 518 | rhs.noalias() = Wd - rhs; 519 | } 520 | 521 | apply_PtWMv(WF, rhs, res, Scalar(-1)); 522 | } 523 | 524 | // Compute inv(P'BP) * v 525 | // P represents an index set 526 | // inv(P'BP) * v = v / theta + WP * inv(inv(M) - WP' * WP / theta) * WP' * v / theta^2 527 | // 528 | // v is [nP x 1] 529 | inline void solve_PtBP(const Matrix& WP, const Vector& v, Vector& res) const 530 | { 531 | const int nP = WP.rows(); 532 | res.resize(nP); 533 | if (m_ncorr < 1 || nP < 1) 534 | { 535 | res.noalias() = v / m_theta; 536 | return; 537 | } 538 | 539 | // Compute the matrix in the middle (only the lower triangular part is needed) 540 | // Remember that W = [Y, theta * S], but we do not store theta in WP 541 | Matrix mid(2 * m_ncorr, 2 * m_ncorr); 542 | // [0:(ncorr - 1), 0:(ncorr - 1)] 543 | for (int j = 0; j < m_ncorr; j++) 544 | { 545 | mid.col(j).segment(j, m_ncorr - j).noalias() = m_permMinv.col(j).segment(j, m_ncorr - j) - 546 | WP.block(0, j, nP, m_ncorr - j).transpose() * WP.col(j) / m_theta; 547 | } 548 | // [ncorr:(2 * ncorr - 1), 0:(ncorr - 1)] 549 | mid.block(m_ncorr, 0, m_ncorr, m_ncorr).noalias() = m_permMinv.block(m_m, 0, m_ncorr, m_ncorr) - 550 | WP.rightCols(m_ncorr).transpose() * WP.leftCols(m_ncorr); 551 | // [ncorr:(2 * ncorr - 1), ncorr:(2 * ncorr - 1)] 552 | for (int j = 0; j < m_ncorr; j++) 553 | { 554 | mid.col(m_ncorr + j).segment(m_ncorr + j, m_ncorr - j).noalias() = m_theta * 555 | (m_permMinv.col(m_m + j).segment(m_m + j, m_ncorr - j) - WP.rightCols(m_ncorr - j).transpose() * WP.col(m_ncorr + j)); 556 | } 557 | // Factorization 558 | BKLDLT midsolver(mid); 559 | // Compute the final result 560 | Vector WPv = WP.transpose() * v; 561 | WPv.tail(m_ncorr) *= m_theta; 562 | midsolver.solve_inplace(WPv); 563 | WPv.tail(m_ncorr) *= m_theta; 564 | res.noalias() = v / m_theta + (WP * WPv) / (m_theta * m_theta); 565 | } 566 | 567 | // Compute P'BQv, where P and Q are two mutually exclusive index selection operators 568 | // P'BQv = -WP * M * WQ' * v 569 | // Returns false if the result is known to be 
zero 570 | inline bool apply_PtBQv(const Matrix& WP, const IndexSet& Q_set, const Vector& v, Vector& res, bool test_zero = false) const 571 | { 572 | const int nP = WP.rows(); 573 | const int nQ = Q_set.size(); 574 | res.resize(nP); 575 | if (m_ncorr < 1 || nP < 1 || nQ < 1) 576 | { 577 | res.setZero(); 578 | return false; 579 | } 580 | 581 | Vector WQtv; 582 | bool nonzero = apply_WtPv(Q_set, v, WQtv, test_zero); 583 | if (!nonzero) 584 | { 585 | res.setZero(); 586 | return false; 587 | } 588 | 589 | Vector MWQtv; 590 | apply_Mv(WQtv, MWQtv); 591 | MWQtv.tail(m_ncorr) *= m_theta; 592 | res.noalias() = -WP * MWQtv; 593 | return true; 594 | } 595 | // If the Q'W matrix has been explicitly formed, do a direct matrix multiplication 596 | inline bool apply_PtBQv(const Matrix& WP, const Matrix& WQ, const Vector& v, Vector& res) const 597 | { 598 | const int nP = WP.rows(); 599 | const int nQ = WQ.rows(); 600 | res.resize(nP); 601 | if (m_ncorr < 1 || nP < 1 || nQ < 1) 602 | { 603 | res.setZero(); 604 | return false; 605 | } 606 | 607 | // Remember that W = [Y, theta * S], so we need to multiply theta to the second half 608 | Vector WQtv = WQ.transpose() * v; 609 | WQtv.tail(m_ncorr) *= m_theta; 610 | Vector MWQtv; 611 | apply_Mv(WQtv, MWQtv); 612 | MWQtv.tail(m_ncorr) *= m_theta; 613 | res.noalias() = -WP * MWQtv; 614 | return true; 615 | } 616 | }; 617 | 618 | } // namespace LBFGSpp 619 | 620 | /// \endcond 621 | 622 | #endif // LBFGSPP_BFGS_MAT_H 623 | -------------------------------------------------------------------------------- /include/LBFGSpp/BKLDLT.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2020-2025 Yixuan Qiu 2 | // Under MIT license 3 | 4 | #ifndef LBFGSPP_BK_LDLT_H 5 | #define LBFGSPP_BK_LDLT_H 6 | 7 | #include <vector> 8 | #include <stdexcept> 9 | #include <Eigen/Core> 10 | 11 | /// \cond 12 | 13 | namespace LBFGSpp { 14 | 15 | enum COMPUTATION_INFO 16 | { 17 | SUCCESSFUL = 0, 18 | NOT_COMPUTED, 19 | NUMERICAL_ISSUE 20 | }; 21 | 22 | // Bunch-Kaufman LDLT decomposition 23 | // References: 24 | // 1. Bunch, J. R., & Kaufman, L. (1977). Some stable methods for calculating inertia and solving symmetric linear systems. 25 | // Mathematics of computation, 31(137), 163-179. 26 | // 2. Golub, G. H., & Van Loan, C. F. (2012). Matrix computations (Vol. 3). JHU press. Section 4.4. 27 | // 3. Bunch-Parlett diagonal pivoting 28 | // 4. Ashcraft, C., Grimes, R. G., & Lewis, J. G. (1998). Accurate symmetric indefinite linear equation solvers. 29 | // SIAM Journal on Matrix Analysis and Applications, 20(2), 513-561.
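// Illustrative usage sketch for this class (hypothetical data, not part of the original source):
// factorize a small symmetric matrix and solve a linear system with it.
//
//     Eigen::MatrixXd A(3, 3);
//     A << 4, 1, 2,
//          1, 3, 0,
//          2, 0, 5;
//     Eigen::VectorXd b = Eigen::VectorXd::Ones(3);
//     LBFGSpp::BKLDLT<double> solver(A, Eigen::Lower);
//     if (solver.info() == LBFGSpp::SUCCESSFUL)
//     {
//         Eigen::VectorXd x = solver.solve(b);  // x satisfies A * x = b
//     }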
30 | template <typename Scalar = double> 31 | class BKLDLT 32 | { 33 | private: 34 | using Index = Eigen::Index; 35 | using Matrix = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>; 36 | using Vector = Eigen::Matrix<Scalar, Eigen::Dynamic, 1>; 37 | using MapVec = Eigen::Map<Vector>; 38 | using MapConstVec = Eigen::Map<const Vector>; 39 | 40 | using IntVector = Eigen::Matrix<Index, Eigen::Dynamic, 1>; 41 | using GenericVector = Eigen::Ref<Vector>; 42 | using GenericMatrix = Eigen::Ref<Matrix>; 43 | using ConstGenericMatrix = const Eigen::Ref<const Matrix>; 44 | using ConstGenericVector = const Eigen::Ref<const Vector>; 45 | 46 | Index m_n; 47 | Vector m_data; // storage for a lower-triangular matrix 48 | std::vector<Scalar*> m_colptr; // pointers to columns 49 | IntVector m_perm; // [-2, -1, 3, 1, 4, 5]: 0 <-> 2, 1 <-> 1, 2 <-> 3, 3 <-> 1, 4 <-> 4, 5 <-> 5 50 | std::vector<std::pair<Index, Index> > m_permc; // compressed version of m_perm: [(0, 2), (2, 3), (3, 1)] 51 | 52 | bool m_computed; 53 | int m_info; 54 | 55 | // Access to elements 56 | // Pointer to the k-th column 57 | Scalar* col_pointer(Index k) { return m_colptr[k]; } 58 | // A[i, j] -> m_colptr[j][i - j], i >= j 59 | Scalar& coeff(Index i, Index j) { return m_colptr[j][i - j]; } 60 | const Scalar& coeff(Index i, Index j) const { return m_colptr[j][i - j]; } 61 | // A[i, i] -> m_colptr[i][0] 62 | Scalar& diag_coeff(Index i) { return m_colptr[i][0]; } 63 | const Scalar& diag_coeff(Index i) const { return m_colptr[i][0]; } 64 | 65 | // Compute column pointers 66 | void compute_pointer() 67 | { 68 | m_colptr.clear(); 69 | m_colptr.reserve(m_n); 70 | Scalar* head = m_data.data(); 71 | 72 | for (Index i = 0; i < m_n; i++) 73 | { 74 | m_colptr.push_back(head); 75 | head += (m_n - i); 76 | } 77 | } 78 | 79 | // Copy mat - shift * I to m_data 80 | void copy_data(ConstGenericMatrix& mat, int uplo, const Scalar& shift) 81 | { 82 | if (uplo == Eigen::Lower) 83 | { 84 | for (Index j = 0; j < m_n; j++) 85 | { 86 | const Scalar* begin = &mat.coeffRef(j, j); 87 | const Index len = m_n - j; 88 | std::copy(begin, begin + len, col_pointer(j)); 89 | diag_coeff(j) -= shift; 90 | } 91 | } 92 | else 93 | { 94 | Scalar* dest = m_data.data(); 95 | for (Index i = 0; i < m_n; i++) 96 | { 97 | for (Index j = i; j < m_n; j++, dest++) 98 | { 99 | *dest = mat.coeff(i, j); 100 | } 101 | diag_coeff(i) -= shift; 102 | } 103 | } 104 | } 105 | 106 | // Compute compressed permutations 107 | void compress_permutation() 108 | { 109 | for (Index i = 0; i < m_n; i++) 110 | { 111 | // Recover the permutation action 112 | const Index perm = (m_perm[i] >= 0) ?
(m_perm[i]) : (-m_perm[i] - 1); 113 | if (perm != i) 114 | m_permc.push_back(std::make_pair(i, perm)); 115 | } 116 | } 117 | 118 | // Working on the A[k:end, k:end] submatrix 119 | // Exchange k <-> r 120 | // Assume r >= k 121 | void pivoting_1x1(Index k, Index r) 122 | { 123 | // No permutation 124 | if (k == r) 125 | { 126 | m_perm[k] = r; 127 | return; 128 | } 129 | 130 | // A[k, k] <-> A[r, r] 131 | std::swap(diag_coeff(k), diag_coeff(r)); 132 | 133 | // A[(r+1):end, k] <-> A[(r+1):end, r] 134 | std::swap_ranges(&coeff(r + 1, k), col_pointer(k + 1), &coeff(r + 1, r)); 135 | 136 | // A[(k+1):(r-1), k] <-> A[r, (k+1):(r-1)] 137 | Scalar* src = &coeff(k + 1, k); 138 | for (Index j = k + 1; j < r; j++, src++) 139 | { 140 | std::swap(*src, coeff(r, j)); 141 | } 142 | 143 | m_perm[k] = r; 144 | } 145 | 146 | // Working on the A[k:end, k:end] submatrix 147 | // Exchange [k+1, k] <-> [r, p] 148 | // Assume p >= k, r >= k+1 149 | void pivoting_2x2(Index k, Index r, Index p) 150 | { 151 | pivoting_1x1(k, p); 152 | pivoting_1x1(k + 1, r); 153 | 154 | // A[k+1, k] <-> A[r, k] 155 | std::swap(coeff(k + 1, k), coeff(r, k)); 156 | 157 | // Use negative signs to indicate a 2x2 block 158 | // Also minus one to distinguish a negative zero from a positive zero 159 | m_perm[k] = -m_perm[k] - 1; 160 | m_perm[k + 1] = -m_perm[k + 1] - 1; 161 | } 162 | 163 | // A[r1, c1:c2] <-> A[r2, c1:c2] 164 | // Assume r2 >= r1 > c2 >= c1 165 | void interchange_rows(Index r1, Index r2, Index c1, Index c2) 166 | { 167 | if (r1 == r2) 168 | return; 169 | 170 | for (Index j = c1; j <= c2; j++) 171 | { 172 | std::swap(coeff(r1, j), coeff(r2, j)); 173 | } 174 | } 175 | 176 | // lambda = |A[r, k]| = max{|A[k+1, k]|, ..., |A[end, k]|} 177 | // Largest (in magnitude) off-diagonal element in the first column of the current reduced matrix 178 | // r is the row index 179 | // Assume k < end 180 | Scalar find_lambda(Index k, Index& r) 181 | { 182 | using std::abs; 183 | 184 | const Scalar* head = col_pointer(k); // => A[k, k] 185 | const Scalar* end = col_pointer(k + 1); 186 | // Start with r=k+1, lambda=A[k+1, k] 187 | r = k + 1; 188 | Scalar lambda = abs(head[1]); 189 | // Scan remaining elements 190 | for (const Scalar* ptr = head + 2; ptr < end; ptr++) 191 | { 192 | const Scalar abs_elem = abs(*ptr); 193 | if (lambda < abs_elem) 194 | { 195 | lambda = abs_elem; 196 | r = k + (ptr - head); 197 | } 198 | } 199 | 200 | return lambda; 201 | } 202 | 203 | // sigma = |A[p, r]| = max {|A[k, r]|, ..., |A[end, r]|} \ {A[r, r]} 204 | // Largest (in magnitude) off-diagonal element in the r-th column of the current reduced matrix 205 | // p is the row index 206 | // Assume k < r < end 207 | Scalar find_sigma(Index k, Index r, Index& p) 208 | { 209 | using std::abs; 210 | 211 | // First search A[r+1, r], ..., A[end, r], which has the same task as find_lambda() 212 | // If r == end, we skip this search 213 | Scalar sigma = Scalar(-1); 214 | if (r < m_n - 1) 215 | sigma = find_lambda(r, p); 216 | 217 | // Then search A[k, r], ..., A[r-1, r], which maps to A[r, k], ..., A[r, r-1] 218 | for (Index j = k; j < r; j++) 219 | { 220 | const Scalar abs_elem = abs(coeff(r, j)); 221 | if (sigma < abs_elem) 222 | { 223 | sigma = abs_elem; 224 | p = j; 225 | } 226 | } 227 | 228 | return sigma; 229 | } 230 | 231 | // Generate permutations and apply to A 232 | // Return true if the resulting pivoting is 1x1, and false if 2x2 233 | bool permutate_mat(Index k, const Scalar& alpha) 234 | { 235 | using std::abs; 236 | 237 | Index r = k, p = k; 238 | const 
Scalar lambda = find_lambda(k, r); 239 | 240 | // If lambda=0, no need to interchange 241 | if (lambda > Scalar(0)) 242 | { 243 | const Scalar abs_akk = abs(diag_coeff(k)); 244 | // If |A[k, k]| >= alpha * lambda, no need to interchange 245 | if (abs_akk < alpha * lambda) 246 | { 247 | const Scalar sigma = find_sigma(k, r, p); 248 | 249 | // If sigma * |A[k, k]| >= alpha * lambda^2, no need to interchange 250 | if (sigma * abs_akk < alpha * lambda * lambda) 251 | { 252 | if (abs_akk >= alpha * sigma) 253 | { 254 | // Permutation on A 255 | pivoting_1x1(k, r); 256 | 257 | // Permutation on L 258 | interchange_rows(k, r, 0, k - 1); 259 | return true; 260 | } 261 | else 262 | { 263 | // There are two versions of permutation here 264 | // 1. A[k+1, k] <-> A[r, k] 265 | // 2. A[k+1, k] <-> A[r, p], where p >= k and r >= k+1 266 | // 267 | // Version 1 and 2 are used by Ref[1] and Ref[2], respectively 268 | 269 | // Version 1 implementation 270 | p = k; 271 | 272 | // Version 2 implementation 273 | // [r, p] and [p, r] are symmetric, but we need to make sure 274 | // p >= k and r >= k+1, so it is safe to always make r > p 275 | // One exception is when min{r,p} == k+1, in which case we make 276 | // r = k+1, so that only one permutation needs to be performed 277 | /* const Index rp_min = std::min(r, p); 278 | const Index rp_max = std::max(r, p); 279 | if(rp_min == k + 1) 280 | { 281 | r = rp_min; p = rp_max; 282 | } else { 283 | r = rp_max; p = rp_min; 284 | } */ 285 | 286 | // Right now we use Version 1 since it reduces the overhead of interchange 287 | 288 | // Permutation on A 289 | pivoting_2x2(k, r, p); 290 | // Permutation on L 291 | interchange_rows(k, p, 0, k - 1); 292 | interchange_rows(k + 1, r, 0, k - 1); 293 | return false; 294 | } 295 | } 296 | } 297 | } 298 | 299 | return true; 300 | } 301 | 302 | // E = [e11, e12] 303 | // [e21, e22] 304 | // Overwrite E with inv(E) 305 | void inverse_inplace_2x2(Scalar& e11, Scalar& e21, Scalar& e22) const 306 | { 307 | // inv(E) = [d11, d12], d11 = e22/delta, d21 = -e21/delta, d22 = e11/delta 308 | // [d21, d22] 309 | const Scalar delta = e11 * e22 - e21 * e21; 310 | std::swap(e11, e22); 311 | e11 /= delta; 312 | e22 /= delta; 313 | e21 = -e21 / delta; 314 | } 315 | 316 | // Return value is the status, SUCCESSFUL/NUMERICAL_ISSUE 317 | int gaussian_elimination_1x1(Index k) 318 | { 319 | // D = 1 / A[k, k] 320 | const Scalar akk = diag_coeff(k); 321 | // Return NUMERICAL_ISSUE if not invertible 322 | if (akk == Scalar(0)) 323 | return NUMERICAL_ISSUE; 324 | 325 | diag_coeff(k) = Scalar(1) / akk; 326 | 327 | // B -= l * l' / A[k, k], B := A[(k+1):end, (k+1):end], l := L[(k+1):end, k] 328 | Scalar* lptr = col_pointer(k) + 1; 329 | const Index ldim = m_n - k - 1; 330 | MapVec l(lptr, ldim); 331 | for (Index j = 0; j < ldim; j++) 332 | { 333 | MapVec(col_pointer(j + k + 1), ldim - j).noalias() -= (lptr[j] / akk) * l.tail(ldim - j); 334 | } 335 | 336 | // l /= A[k, k] 337 | l /= akk; 338 | 339 | return SUCCESSFUL; 340 | } 341 | 342 | // Return value is the status, SUCCESSFUL/NUMERICAL_ISSUE 343 | int gaussian_elimination_2x2(Index k) 344 | { 345 | // D = inv(E) 346 | Scalar& e11 = diag_coeff(k); 347 | Scalar& e21 = coeff(k + 1, k); 348 | Scalar& e22 = diag_coeff(k + 1); 349 | // Return NUMERICAL_ISSUE if not invertible 350 | if (e11 * e22 - e21 * e21 == Scalar(0)) 351 | return NUMERICAL_ISSUE; 352 | 353 | inverse_inplace_2x2(e11, e21, e22); 354 | 355 | // X = l * inv(E), l := L[(k+2):end, k:(k+1)] 356 | Scalar* l1ptr = &coeff(k + 2, k); 357 | Scalar* 
l2ptr = &coeff(k + 2, k + 1); 358 | const Index ldim = m_n - k - 2; 359 | MapVec l1(l1ptr, ldim), l2(l2ptr, ldim); 360 | 361 | Eigen::Matrix X(ldim, 2); 362 | X.col(0).noalias() = l1 * e11 + l2 * e21; 363 | X.col(1).noalias() = l1 * e21 + l2 * e22; 364 | 365 | // B -= l * inv(E) * l' = X * l', B = A[(k+2):end, (k+2):end] 366 | for (Index j = 0; j < ldim; j++) 367 | { 368 | MapVec(col_pointer(j + k + 2), ldim - j).noalias() -= (X.col(0).tail(ldim - j) * l1ptr[j] + X.col(1).tail(ldim - j) * l2ptr[j]); 369 | } 370 | 371 | // l = X 372 | l1.noalias() = X.col(0); 373 | l2.noalias() = X.col(1); 374 | 375 | return SUCCESSFUL; 376 | } 377 | 378 | public: 379 | BKLDLT() : 380 | m_n(0), m_computed(false), m_info(NOT_COMPUTED) 381 | {} 382 | 383 | // Factorize mat - shift * I 384 | BKLDLT(ConstGenericMatrix& mat, int uplo = Eigen::Lower, const Scalar& shift = Scalar(0)) : 385 | m_n(mat.rows()), m_computed(false), m_info(NOT_COMPUTED) 386 | { 387 | compute(mat, uplo, shift); 388 | } 389 | 390 | void compute(ConstGenericMatrix& mat, int uplo = Eigen::Lower, const Scalar& shift = Scalar(0)) 391 | { 392 | using std::abs; 393 | 394 | m_n = mat.rows(); 395 | if (m_n != mat.cols()) 396 | throw std::invalid_argument("BKLDLT: matrix must be square"); 397 | 398 | m_perm.setLinSpaced(m_n, 0, m_n - 1); 399 | m_permc.clear(); 400 | 401 | // Copy data 402 | m_data.resize((m_n * (m_n + 1)) / 2); 403 | compute_pointer(); 404 | copy_data(mat, uplo, shift); 405 | 406 | const Scalar alpha = (1.0 + std::sqrt(17.0)) / 8.0; 407 | Index k = 0; 408 | for (k = 0; k < m_n - 1; k++) 409 | { 410 | // 1. Interchange rows and columns of A, and save the result to m_perm 411 | bool is_1x1 = permutate_mat(k, alpha); 412 | 413 | // 2. Gaussian elimination 414 | if (is_1x1) 415 | { 416 | m_info = gaussian_elimination_1x1(k); 417 | } 418 | else 419 | { 420 | m_info = gaussian_elimination_2x2(k); 421 | k++; 422 | } 423 | 424 | // 3. Check status 425 | if (m_info != SUCCESSFUL) 426 | break; 427 | } 428 | // Invert the last 1x1 block if it exists 429 | if (k == m_n - 1) 430 | { 431 | const Scalar akk = diag_coeff(k); 432 | if (akk == Scalar(0)) 433 | m_info = NUMERICAL_ISSUE; 434 | 435 | diag_coeff(k) = Scalar(1) / diag_coeff(k); 436 | } 437 | 438 | compress_permutation(); 439 | 440 | m_computed = true; 441 | } 442 | 443 | // Solve Ax=b 444 | void solve_inplace(GenericVector b) const 445 | { 446 | if (!m_computed) 447 | throw std::logic_error("BKLDLT: need to call compute() first"); 448 | 449 | // PAP' = LDL' 450 | // 1. b -> Pb 451 | Scalar* x = b.data(); 452 | MapVec res(x, m_n); 453 | Index npermc = m_permc.size(); 454 | for (Index i = 0; i < npermc; i++) 455 | { 456 | std::swap(x[m_permc[i].first], x[m_permc[i].second]); 457 | } 458 | 459 | // 2. Lz = Pb 460 | // If m_perm[end] < 0, then end with m_n - 3, otherwise end with m_n - 2 461 | const Index end = (m_perm[m_n - 1] < 0) ? (m_n - 3) : (m_n - 2); 462 | for (Index i = 0; i <= end; i++) 463 | { 464 | const Index b1size = m_n - i - 1; 465 | const Index b2size = b1size - 1; 466 | if (m_perm[i] >= 0) 467 | { 468 | MapConstVec l(&coeff(i + 1, i), b1size); 469 | res.segment(i + 1, b1size).noalias() -= l * x[i]; 470 | } 471 | else 472 | { 473 | MapConstVec l1(&coeff(i + 2, i), b2size); 474 | MapConstVec l2(&coeff(i + 2, i + 1), b2size); 475 | res.segment(i + 2, b2size).noalias() -= (l1 * x[i] + l2 * x[i + 1]); 476 | i++; 477 | } 478 | } 479 | 480 | // 3. 
Dw = z 481 | for (Index i = 0; i < m_n; i++) 482 | { 483 | const Scalar e11 = diag_coeff(i); 484 | if (m_perm[i] >= 0) 485 | { 486 | x[i] *= e11; 487 | } 488 | else 489 | { 490 | const Scalar e21 = coeff(i + 1, i), e22 = diag_coeff(i + 1); 491 | const Scalar wi = x[i] * e11 + x[i + 1] * e21; 492 | x[i + 1] = x[i] * e21 + x[i + 1] * e22; 493 | x[i] = wi; 494 | i++; 495 | } 496 | } 497 | 498 | // 4. L'y = w 499 | // If m_perm[end] < 0, then start with m_n - 3, otherwise start with m_n - 2 500 | Index i = (m_perm[m_n - 1] < 0) ? (m_n - 3) : (m_n - 2); 501 | for (; i >= 0; i--) 502 | { 503 | const Index ldim = m_n - i - 1; 504 | MapConstVec l(&coeff(i + 1, i), ldim); 505 | x[i] -= res.segment(i + 1, ldim).dot(l); 506 | 507 | if (m_perm[i] < 0) 508 | { 509 | MapConstVec l2(&coeff(i + 1, i - 1), ldim); 510 | x[i - 1] -= res.segment(i + 1, ldim).dot(l2); 511 | i--; 512 | } 513 | } 514 | 515 | // 5. x = P'y 516 | for (i = npermc - 1; i >= 0; i--) 517 | { 518 | std::swap(x[m_permc[i].first], x[m_permc[i].second]); 519 | } 520 | } 521 | 522 | Vector solve(ConstGenericVector& b) const 523 | { 524 | Vector res = b; 525 | solve_inplace(res); 526 | return res; 527 | } 528 | 529 | int info() const { return m_info; } 530 | }; 531 | 532 | } // namespace LBFGSpp 533 | 534 | /// \endcond 535 | 536 | #endif // LBFGSPP_BK_LDLT_H 537 | -------------------------------------------------------------------------------- /include/LBFGSpp/Cauchy.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2020-2025 Yixuan Qiu 2 | // Under MIT license 3 | 4 | #ifndef LBFGSPP_CAUCHY_H 5 | #define LBFGSPP_CAUCHY_H 6 | 7 | #include <vector> 8 | #include <Eigen/Core> 9 | #include "BFGSMat.h" 10 | 11 | /// \cond 12 | 13 | namespace LBFGSpp { 14 | 15 | // 16 | // Class to compute the generalized Cauchy point (GCP) for the L-BFGS-B algorithm, 17 | // mainly for internal use. 18 | // 19 | // The target of the GCP procedure is to find a step size t such that 20 | // x(t) = x0 - t * g is a local minimum of the quadratic function m(x), 21 | // where m(x) is a local approximation to the objective function. 22 | // 23 | // First determine a sequence of break points t0=0, t1, t2, ..., tn. 24 | // On each interval [t[i-1], t[i]], x is changing linearly. 25 | // After passing a break point, one or more coordinates of x will be fixed at the bounds. 26 | // We search the first local minimum of m(x) by examining the intervals [t[i-1], t[i]] sequentially. 27 | // 28 | // Reference: 29 | // [1] R. H. Byrd, P. Lu, and J. Nocedal (1995). A limited memory algorithm for bound constrained optimization. 30 | // 31 | template <typename Scalar> 32 | class ArgSort 33 | { 34 | private: 35 | using Vector = Eigen::Matrix<Scalar, Eigen::Dynamic, 1>; 36 | using IndexSet = std::vector<int>; 37 | 38 | const Scalar* values; 39 | 40 | public: 41 | ArgSort(const Vector& value_vec) : 42 | values(value_vec.data()) 43 | {} 44 | 45 | inline bool operator()(int key1, int key2) { return values[key1] < values[key2]; } 46 | inline void sort_key(IndexSet& key_vec) const 47 | { 48 | std::sort(key_vec.begin(), key_vec.end(), *this); 49 | } 50 | }; 51 | 52 | template <typename Scalar> 53 | class Cauchy 54 | { 55 | private: 56 | typedef Eigen::Matrix<Scalar, Eigen::Dynamic, 1> Vector; 57 | typedef Eigen::Matrix<int, Eigen::Dynamic, 1> IntVector; 58 | typedef Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic> Matrix; 59 | typedef std::vector<int> IndexSet; 60 | 61 | // Find the smallest index i such that brk[ord[i]] > t, assuming brk[ord] is already sorted. 62 | // If the return value equals n, then all values are <= t.
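// Illustrative sketch of the ArgSort helper above (hypothetical numbers, not part of the original source):
//
//     Eigen::VectorXd brk(4);
//     brk << 0.7, 0.1, 2.5, 1.3;
//     std::vector<int> ord = {0, 1, 2, 3};
//     ArgSort<double> sorting(brk);
//     sorting.sort_key(ord);
//     // ord is now {1, 0, 3, 2}, so brk[ord[0]] <= brk[ord[1]] <= brk[ord[2]] <= brk[ord[3]]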
63 | static int search_greater(const Vector& brk, const IndexSet& ord, const Scalar& t, int start = 0) 64 | { 65 | const int nord = ord.size(); 66 | int i; 67 | for (i = start; i < nord; i++) 68 | { 69 | if (brk[ord[i]] > t) 70 | break; 71 | } 72 | 73 | return i; 74 | } 75 | 76 | public: 77 | // bfgs: An object that represents the BFGS approximation matrix. 78 | // x0: Current parameter vector. 79 | // g: Gradient at x0. 80 | // lb: Lower bounds for x. 81 | // ub: Upper bounds for x. 82 | // xcp: The output generalized Cauchy point. 83 | // vecc: c = W'(xcp - x0), used in the subspace minimization routine. 84 | // newact_set: Coordinates that newly become active during the GCP procedure. 85 | // fv_set: Free variable set. 86 | static void get_cauchy_point( 87 | const BFGSMat& bfgs, const Vector& x0, const Vector& g, const Vector& lb, const Vector& ub, 88 | Vector& xcp, Vector& vecc, IndexSet& newact_set, IndexSet& fv_set) 89 | { 90 | // std::cout << "========================= Entering GCP search =========================\n\n"; 91 | 92 | // Initialization 93 | const int n = x0.size(); 94 | xcp.resize(n); 95 | xcp.noalias() = x0; 96 | vecc.resize(2 * bfgs.num_corrections()); 97 | vecc.setZero(); 98 | newact_set.clear(); 99 | newact_set.reserve(n); 100 | fv_set.clear(); 101 | fv_set.reserve(n); 102 | 103 | // Construct break points 104 | Vector brk(n), vecd(n); 105 | // If brk[i] == 0, i belongs to active set 106 | // If brk[i] == Inf, i belongs to free variable set 107 | // Others are currently undecided 108 | IndexSet ord; 109 | ord.reserve(n); 110 | const Scalar inf = std::numeric_limits::infinity(); 111 | for (int i = 0; i < n; i++) 112 | { 113 | if (lb[i] == ub[i]) 114 | brk[i] = Scalar(0); 115 | else if (g[i] < Scalar(0)) 116 | brk[i] = (x0[i] - ub[i]) / g[i]; 117 | else if (g[i] > Scalar(0)) 118 | brk[i] = (x0[i] - lb[i]) / g[i]; 119 | else 120 | brk[i] = inf; 121 | 122 | const bool iszero = (brk[i] == Scalar(0)); 123 | vecd[i] = iszero ? Scalar(0) : -g[i]; 124 | 125 | if (brk[i] == inf) 126 | fv_set.push_back(i); 127 | else if (!iszero) 128 | ord.push_back(i); 129 | } 130 | 131 | // Sort indices of break points 132 | ArgSort sorting(brk); 133 | sorting.sort_key(ord); 134 | 135 | // Break points `brko := brk[ord]` are in increasing order 136 | // `ord` contains the coordinates that define the corresponding break points 137 | // brk[i] == 0 <=> The i-th coordinate is on the boundary 138 | const int nord = ord.size(); 139 | const int nfree = fv_set.size(); 140 | if ((nfree < 1) && (nord < 1)) 141 | { 142 | /* std::cout << "** All coordinates at boundary **\n"; 143 | std::cout << "\n========================= Leaving GCP search =========================\n\n"; */ 144 | return; 145 | } 146 | 147 | // First interval: [il=0, iu=brk[ord[0]]] 148 | // In case ord is empty, we take iu=Inf 149 | 150 | // p = W'd, c = 0 151 | Vector vecp; 152 | bfgs.apply_Wtv(vecd, vecp); 153 | // f' = -d'd 154 | Scalar fp = -vecd.squaredNorm(); 155 | // f'' = -theta * f' - p'Mp 156 | Vector cache; 157 | bfgs.apply_Mv(vecp, cache); // cache = Mp 158 | Scalar fpp = -bfgs.theta() * fp - vecp.dot(cache); 159 | 160 | // Theoretical step size to move 161 | Scalar deltatmin = -fp / fpp; 162 | 163 | // Limit on the current interval 164 | Scalar il = Scalar(0); 165 | // We have excluded the case that max(brk) <= 0 166 | int b = 0; 167 | Scalar iu = (nord < 1) ? 
inf : brk[ord[b]]; 168 | Scalar deltat = iu - il; 169 | 170 | /* int iter = 0; 171 | std::cout << "** Iter " << iter << " **\n"; 172 | std::cout << " fp = " << fp << ", fpp = " << fpp << ", deltatmin = " << deltatmin << std::endl; 173 | std::cout << " il = " << il << ", iu = " << iu << ", deltat = " << deltat << std::endl; */ 174 | 175 | // If deltatmin >= deltat, we need to do the following things: 176 | // 1. Update vecc 177 | // 2. Since we are going to cross iu, the coordinates that define iu become active 178 | // 3. Update some quantities on these new active coordinates (xcp, vecd, vecp) 179 | // 4. Move to the next interval and compute the new deltatmin 180 | bool crossed_all = false; 181 | const int ncorr = bfgs.num_corrections(); 182 | Vector wact(2 * ncorr); 183 | while (deltatmin >= deltat) 184 | { 185 | // Step 1 186 | vecc.noalias() += deltat * vecp; 187 | 188 | // Step 2 189 | // First check how many coordinates will be active when we cross the previous iu 190 | // b is the smallest number such that brko[b] == iu 191 | // Let bp be the largest number such that brko[bp] == iu 192 | // Then coordinates ord[b] to ord[bp] will be active 193 | const int act_begin = b; 194 | const int act_end = search_greater(brk, ord, iu, b) - 1; 195 | 196 | // If nfree == 0 and act_end == nord-1, then we have crossed all coordinates 197 | // We only need to update xcp from ord[b] to ord[bp], and then exit 198 | if ((nfree == 0) && (act_end == nord - 1)) 199 | { 200 | // std::cout << "** [ "; 201 | for (int i = act_begin; i <= act_end; i++) 202 | { 203 | const int act = ord[i]; 204 | xcp[act] = (vecd[act] > Scalar(0)) ? ub[act] : lb[act]; 205 | newact_set.push_back(act); 206 | // std::cout << act + 1 << " "; 207 | } 208 | // std::cout << "] become active **\n\n"; 209 | // std::cout << "** All break points visited **\n\n"; 210 | 211 | crossed_all = true; 212 | break; 213 | } 214 | 215 | // Step 3 216 | // Update xcp and d on active coordinates 217 | // std::cout << "** [ "; 218 | fp += deltat * fpp; 219 | for (int i = act_begin; i <= act_end; i++) 220 | { 221 | const int act = ord[i]; 222 | xcp[act] = (vecd[act] > Scalar(0)) ? ub[act] : lb[act]; 223 | // z = xcp - x0 224 | const Scalar zact = xcp[act] - x0[act]; 225 | const Scalar gact = g[act]; 226 | const Scalar ggact = gact * gact; 227 | wact.noalias() = bfgs.Wb(act); 228 | bfgs.apply_Mv(wact, cache); // cache = Mw 229 | fp += ggact + bfgs.theta() * gact * zact - gact * cache.dot(vecc); 230 | fpp -= (bfgs.theta() * ggact + 2 * gact * cache.dot(vecp) + ggact * cache.dot(wact)); 231 | vecp.noalias() += gact * wact; 232 | vecd[act] = Scalar(0); 233 | newact_set.push_back(act); 234 | // std::cout << act + 1 << " "; 235 | } 236 | // std::cout << "] become active **\n\n"; 237 | 238 | // Step 4 239 | // Theoretical step size to move 240 | deltatmin = -fp / fpp; 241 | // Update interval bound 242 | il = iu; 243 | b = act_end + 1; 244 | // If we have visited all finite-valued break points, and have not exited earlier, 245 | // then the next iu will be infinity. 
Simply exit the loop now 246 | if (b >= nord) 247 | break; 248 | iu = brk[ord[b]]; 249 | // Width of the current interval 250 | deltat = iu - il; 251 | 252 | /* iter++; 253 | std::cout << "** Iter " << iter << " **\n"; 254 | std::cout << " fp = " << fp << ", fpp = " << fpp << ", deltatmin = " << deltatmin << std::endl; 255 | std::cout << " il = " << il << ", iu = " << iu << ", deltat = " << deltat << std::endl; */ 256 | } 257 | 258 | // In some rare cases fpp is numerically zero, making deltatmin equal to Inf 259 | // If this happens, force fpp to be the machine precision 260 | const Scalar eps = std::numeric_limits<Scalar>::epsilon(); 261 | if (fpp < eps) 262 | deltatmin = -fp / eps; 263 | 264 | // Last step 265 | if (!crossed_all) 266 | { 267 | deltatmin = std::max(deltatmin, Scalar(0)); 268 | vecc.noalias() += deltatmin * vecp; 269 | const Scalar tfinal = il + deltatmin; 270 | // Update xcp on free variable coordinates 271 | for (int i = 0; i < nfree; i++) 272 | { 273 | const int coord = fv_set[i]; 274 | xcp[coord] = x0[coord] + tfinal * vecd[coord]; 275 | } 276 | for (int i = b; i < nord; i++) 277 | { 278 | const int coord = ord[i]; 279 | xcp[coord] = x0[coord] + tfinal * vecd[coord]; 280 | fv_set.push_back(coord); 281 | } 282 | } 283 | // std::cout << "\n========================= Leaving GCP search =========================\n\n"; 284 | } 285 | }; 286 | 287 | } // namespace LBFGSpp 288 | 289 | /// \endcond 290 | 291 | #endif // LBFGSPP_CAUCHY_H 292 | -------------------------------------------------------------------------------- /include/LBFGSpp/LineSearchBacktracking.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2016-2025 Yixuan Qiu 2 | // Under MIT license 3 | 4 | #ifndef LBFGSPP_LINE_SEARCH_BACKTRACKING_H 5 | #define LBFGSPP_LINE_SEARCH_BACKTRACKING_H 6 | 7 | #include <Eigen/Core> 8 | #include <stdexcept> // std::runtime_error 9 | #include "Param.h" 10 | 11 | namespace LBFGSpp { 12 | 13 | /// 14 | /// The backtracking line search algorithm for L-BFGS. Mainly for internal use. 15 | /// 16 | template <typename Scalar> 17 | class LineSearchBacktracking 18 | { 19 | private: 20 | using Vector = Eigen::Matrix<Scalar, Eigen::Dynamic, 1>; 21 | 22 | public: 23 | /// 24 | /// Line search by backtracking. 25 | /// 26 | /// \param f A function object such that `f(x, grad)` returns the 27 | /// objective function value at `x`, and overwrites `grad` with 28 | /// the gradient. 29 | /// \param param Parameters for the L-BFGS algorithm. 30 | /// \param xp The current point. 31 | /// \param drt The current moving direction. 32 | /// \param step_max The upper bound for the step size that makes x feasible. 33 | /// Can be ignored for the L-BFGS solver. 34 | /// \param step In: The initial step length. 35 | /// Out: The calculated step length. 36 | /// \param fx In: The objective function value at the current point. 37 | /// Out: The function value at the new point. 38 | /// \param grad In: The current gradient vector. 39 | /// Out: The gradient at the new point. 40 | /// \param dg In: The inner product between drt and grad. 41 | /// Out: The inner product between drt and the new gradient. 42 | /// \param x Out: The new point moved to.
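/// A minimal sketch of a function object compatible with the `f(x, grad)` interface assumed
/// above (hypothetical quadratic objective, not part of the original source):
///
///     struct Quadratic
///     {
///         double operator()(const Eigen::VectorXd& x, Eigen::VectorXd& grad)
///         {
///             grad.noalias() = 2.0 * x;  // gradient of f(x) = ||x||^2
///             return x.squaredNorm();
///         }
///     };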
43 | /// 44 | template 45 | static void LineSearch(Foo& f, const LBFGSParam& param, 46 | const Vector& xp, const Vector& drt, const Scalar& step_max, 47 | Scalar& step, Scalar& fx, Vector& grad, Scalar& dg, Vector& x) 48 | { 49 | // Decreasing and increasing factors 50 | const Scalar dec = 0.5; 51 | const Scalar inc = 2.1; 52 | 53 | // Check the value of step 54 | if (step <= Scalar(0)) 55 | throw std::invalid_argument("'step' must be positive"); 56 | 57 | // Save the function value at the current x 58 | const Scalar fx_init = fx; 59 | // Projection of gradient on the search direction 60 | const Scalar dg_init = grad.dot(drt); 61 | // Make sure d points to a descent direction 62 | if (dg_init > 0) 63 | throw std::logic_error("the moving direction increases the objective function value"); 64 | 65 | const Scalar test_decr = param.ftol * dg_init; 66 | Scalar width; 67 | 68 | int iter; 69 | for (iter = 0; iter < param.max_linesearch; iter++) 70 | { 71 | // x_{k+1} = x_k + step * d_k 72 | x.noalias() = xp + step * drt; 73 | // Evaluate this candidate 74 | fx = f(x, grad); 75 | 76 | if (fx > fx_init + step * test_decr || (fx != fx)) 77 | { 78 | width = dec; 79 | } 80 | else 81 | { 82 | dg = grad.dot(drt); 83 | 84 | // Armijo condition is met 85 | if (param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_ARMIJO) 86 | break; 87 | 88 | if (dg < param.wolfe * dg_init) 89 | { 90 | width = inc; 91 | } 92 | else 93 | { 94 | // Regular Wolfe condition is met 95 | if (param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE) 96 | break; 97 | 98 | if (dg > -param.wolfe * dg_init) 99 | { 100 | width = dec; 101 | } 102 | else 103 | { 104 | // Strong Wolfe condition is met 105 | break; 106 | } 107 | } 108 | } 109 | 110 | if (step < param.min_step) 111 | throw std::runtime_error("the line search step became smaller than the minimum value allowed"); 112 | 113 | if (step > param.max_step) 114 | throw std::runtime_error("the line search step became larger than the maximum value allowed"); 115 | 116 | step *= width; 117 | } 118 | 119 | if (iter >= param.max_linesearch) 120 | throw std::runtime_error("the line search routine reached the maximum number of iterations"); 121 | } 122 | }; 123 | 124 | } // namespace LBFGSpp 125 | 126 | #endif // LBFGSPP_LINE_SEARCH_BACKTRACKING_H 127 | -------------------------------------------------------------------------------- /include/LBFGSpp/LineSearchBracketing.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2016-2025 Yixuan Qiu 2 | // Copyright (C) 2016-2025 Dirk Toewe 3 | // Under MIT license 4 | 5 | #ifndef LBFGSPP_LINE_SEARCH_BRACKETING_H 6 | #define LBFGSPP_LINE_SEARCH_BRACKETING_H 7 | 8 | #include 9 | #include // std::runtime_error 10 | #include "Param.h" 11 | 12 | namespace LBFGSpp { 13 | 14 | /// 15 | /// The bracketing line search algorithm for L-BFGS. Mainly for internal use. 16 | /// 17 | template 18 | class LineSearchBracketing 19 | { 20 | private: 21 | using Vector = Eigen::Matrix; 22 | 23 | public: 24 | /// 25 | /// Line search by bracketing. Similar to the backtracking line search 26 | /// except that it actively maintains an upper and lower bound of the 27 | /// current search range. 28 | /// 29 | /// \param f A function object such that `f(x, grad)` returns the 30 | /// objective function value at `x`, and overwrites `grad` with 31 | /// the gradient. 32 | /// \param param Parameters for the L-BFGS algorithm. 33 | /// \param xp The current point. 34 | /// \param drt The current moving direction. 
35 | /// \param step_max The upper bound for the step size that makes x feasible. 36 | /// Can be ignored for the L-BFGS solver. 37 | /// \param step In: The initial step length. 38 | /// Out: The calculated step length. 39 | /// \param fx In: The objective function value at the current point. 40 | /// Out: The function value at the new point. 41 | /// \param grad In: The current gradient vector. 42 | /// Out: The gradient at the new point. 43 | /// \param dg In: The inner product between drt and grad. 44 | /// Out: The inner product between drt and the new gradient. 45 | /// \param x Out: The new point moved to. 46 | /// 47 | template 48 | static void LineSearch(Foo& f, const LBFGSParam& param, 49 | const Vector& xp, const Vector& drt, const Scalar& step_max, 50 | Scalar& step, Scalar& fx, Vector& grad, Scalar& dg, Vector& x) 51 | { 52 | // Check the value of step 53 | if (step <= Scalar(0)) 54 | throw std::invalid_argument("'step' must be positive"); 55 | 56 | // Save the function value at the current x 57 | const Scalar fx_init = fx; 58 | // Projection of gradient on the search direction 59 | const Scalar dg_init = grad.dot(drt); 60 | // Make sure d points to a descent direction 61 | if (dg_init > 0) 62 | throw std::logic_error("the moving direction increases the objective function value"); 63 | 64 | const Scalar test_decr = param.ftol * dg_init; 65 | 66 | // Upper and lower end of the current line search range 67 | Scalar step_lo = 0, 68 | step_hi = std::numeric_limits::infinity(); 69 | 70 | int iter; 71 | for (iter = 0; iter < param.max_linesearch; iter++) 72 | { 73 | // x_{k+1} = x_k + step * d_k 74 | x.noalias() = xp + step * drt; 75 | // Evaluate this candidate 76 | fx = f(x, grad); 77 | 78 | if (fx > fx_init + step * test_decr || (fx != fx)) 79 | { 80 | step_hi = step; 81 | } 82 | else 83 | { 84 | dg = grad.dot(drt); 85 | 86 | // Armijo condition is met 87 | if (param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_ARMIJO) 88 | break; 89 | 90 | if (dg < param.wolfe * dg_init) 91 | { 92 | step_lo = step; 93 | } 94 | else 95 | { 96 | // Regular Wolfe condition is met 97 | if (param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE) 98 | break; 99 | 100 | if (dg > -param.wolfe * dg_init) 101 | { 102 | step_hi = step; 103 | } 104 | else 105 | { 106 | // Strong Wolfe condition is met 107 | break; 108 | } 109 | } 110 | } 111 | 112 | assert(step_lo < step_hi); 113 | 114 | if (step < param.min_step) 115 | throw std::runtime_error("the line search step became smaller than the minimum value allowed"); 116 | 117 | if (step > param.max_step) 118 | throw std::runtime_error("the line search step became larger than the maximum value allowed"); 119 | 120 | // continue search in mid of current search range 121 | step = std::isinf(step_hi) ? 
2 * step : step_lo / 2 + step_hi / 2; 122 | } 123 | 124 | if (iter >= param.max_linesearch) 125 | throw std::runtime_error("the line search routine reached the maximum number of iterations"); 126 | } 127 | }; 128 | 129 | } // namespace LBFGSpp 130 | 131 | #endif // LBFGSPP_LINE_SEARCH_BRACKETING_H 132 | -------------------------------------------------------------------------------- /include/LBFGSpp/LineSearchMoreThuente.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2020-2025 Yixuan Qiu 2 | // Under MIT license 3 | 4 | #ifndef LBFGSPP_LINE_SEARCH_MORE_THUENTE_H 5 | #define LBFGSPP_LINE_SEARCH_MORE_THUENTE_H 6 | 7 | #include <Eigen/Core> 8 | #include <stdexcept> // std::invalid_argument, std::runtime_error 9 | #include "Param.h" 10 | 11 | namespace LBFGSpp { 12 | 13 | /// 14 | /// The line search algorithm by Moré and Thuente (1994), currently used for the L-BFGS-B algorithm. 15 | /// 16 | /// The target of this line search algorithm is to find a step size \f$\alpha\f$ that satisfies the strong Wolfe condition 17 | /// \f$f(x+\alpha d) \le f(x) + \alpha\mu g(x)^T d\f$ and \f$|g(x+\alpha d)^T d| \le \eta|g(x)^T d|\f$. 18 | /// Our implementation is a simplified version of the algorithm in [1]. We assume that \f$0<\mu<\eta<1\f$, while in [1] 19 | /// they do not assume \f$\eta>\mu\f$. As a result, the algorithm in [1] has two stages, but in our implementation we 20 | /// only need the first stage to guarantee the convergence. 21 | /// 22 | /// Reference: 23 | /// [1] Moré, J. J., & Thuente, D. J. (1994). Line search algorithms with guaranteed sufficient decrease. 24 | /// 25 | template <typename Scalar> 26 | class LineSearchMoreThuente 27 | { 28 | private: 29 | using Vector = Eigen::Matrix<Scalar, Eigen::Dynamic, 1>; 30 | 31 | // Minimizer of a quadratic function q(x) = c0 + c1 * x + c2 * x^2 32 | // that interpolates fa, ga, and fb, assuming the minimizer exists 33 | // For case I: fb >= fa and ga * (b - a) < 0 34 | static Scalar quadratic_minimizer(const Scalar& a, const Scalar& b, const Scalar& fa, const Scalar& ga, const Scalar& fb) 35 | { 36 | const Scalar ba = b - a; 37 | const Scalar w = Scalar(0.5) * ba * ga / (fa - fb + ba * ga); 38 | return a + w * ba; 39 | } 40 | 41 | // Minimizer of a quadratic function q(x) = c0 + c1 * x + c2 * x^2 42 | // that interpolates fa, ga and gb, assuming the minimizer exists 43 | // The result actually does not depend on fa 44 | // For case II: ga * (b - a) < 0, ga * gb < 0 45 | // For case III: ga * (b - a) < 0, ga * gb >= 0, |gb| <= |ga| 46 | static Scalar quadratic_minimizer(const Scalar& a, const Scalar& b, const Scalar& ga, const Scalar& gb) 47 | { 48 | const Scalar w = ga / (ga - gb); 49 | return a + w * (b - a); 50 | } 51 | 52 | // Local minimizer of a cubic function q(x) = c0 + c1 * x + c2 * x^2 + c3 * x^3 53 | // that interpolates fa, ga, fb and gb, assuming a != b 54 | // Also sets a flag indicating whether the minimizer exists 55 | static Scalar cubic_minimizer(const Scalar& a, const Scalar& b, const Scalar& fa, const Scalar& fb, 56 | const Scalar& ga, const Scalar& gb, bool& exists) 57 | { 58 | using std::abs; 59 | using std::sqrt; 60 | 61 | const Scalar apb = a + b; 62 | const Scalar ba = b - a; 63 | const Scalar ba2 = ba * ba; 64 | const Scalar fba = fb - fa; 65 | const Scalar gba = gb - ga; 66 | // z3 = c3 * (b-a)^3, z2 = c2 * (b-a)^3, z1 = c1 * (b-a)^3 67 | const Scalar z3 = (ga + gb) * ba - Scalar(2) * fba; 68 | const Scalar z2 = Scalar(0.5) * (gba * ba2 - Scalar(3) * apb * z3); 69 | const Scalar z1 = fba * ba2 - apb * z2 - (a * apb + b * b) * z3;
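// Consistency check of the scaling above (illustrative, not part of the original source):
// for q(x) = c0 + c1 * x + c2 * x^2 + c3 * x^3 with q(a) = fa, q(b) = fb, q'(a) = ga, q'(b) = gb,
// expanding (ga + gb) * (b - a) - 2 * (fb - fa) cancels the c0, c1, c2 terms and leaves
// c3 * (b - a)^3, which is exactly z3. Since z1, z2, z3 share the common factor (b - a)^3,
// the ratios u = z2 / (3 * z3) and v = z1 / z2 used below, and hence the computed roots of
// c1 + 2 * c2 * x + 3 * c3 * x^2 = 0, are unaffected by the scaling.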
70 | // std::cout << "z1 = " << z1 << ", z2 = " << z2 << ", z3 = " << z3 << std::endl; 71 | 72 | // If c3 = z/(b-a)^3 == 0, reduce to quadratic problem 73 | const Scalar eps = std::numeric_limits::epsilon(); 74 | if (abs(z3) < eps * abs(z2) || abs(z3) < eps * abs(z1)) 75 | { 76 | // Minimizer exists if c2 > 0 77 | exists = (z2 * ba > Scalar(0)); 78 | // Return the end point if the minimizer does not exist 79 | return exists ? (-Scalar(0.5) * z1 / z2) : b; 80 | } 81 | 82 | // Now we can assume z3 > 0 83 | // The minimizer is a solution to the equation c1 + 2*c2 * x + 3*c3 * x^2 = 0 84 | // roots = -(z2/z3) / 3 (+-) sqrt((z2/z3)^2 - 3 * (z1/z3)) / 3 85 | // 86 | // Let u = z2/(3z3) and v = z1/z2 87 | // The minimizer exists if v/u <= 1 88 | const Scalar u = z2 / (Scalar(3) * z3), v = z1 / z2; 89 | const Scalar vu = v / u; 90 | exists = (vu <= Scalar(1)); 91 | if (!exists) 92 | return b; 93 | 94 | // We need to find a numerically stable way to compute the roots, as z3 may still be small 95 | // 96 | // If |u| >= |v|, let w = 1 + sqrt(1-v/u), and then 97 | // r1 = -u * w, r2 = -v / w, r1 does not need to be the smaller one 98 | // 99 | // If |u| < |v|, we must have uv <= 0, and then 100 | // r = -u (+-) sqrt(delta), where 101 | // sqrt(delta) = sqrt(|u|) * sqrt(|v|) * sqrt(1-u/v) 102 | Scalar r1 = Scalar(0), r2 = Scalar(0); 103 | if (abs(u) >= abs(v)) 104 | { 105 | const Scalar w = Scalar(1) + sqrt(Scalar(1) - vu); 106 | r1 = -u * w; 107 | r2 = -v / w; 108 | } 109 | else 110 | { 111 | const Scalar sqrtd = sqrt(abs(u)) * sqrt(abs(v)) * sqrt(1 - u / v); 112 | r1 = -u - sqrtd; 113 | r2 = -u + sqrtd; 114 | } 115 | return (z3 * ba > Scalar(0)) ? ((std::max)(r1, r2)) : ((std::min)(r1, r2)); 116 | } 117 | 118 | // Select the next step size according to the current step sizes, 119 | // function values, and derivatives 120 | static Scalar step_selection( 121 | const Scalar& al, const Scalar& au, const Scalar& at, 122 | const Scalar& fl, const Scalar& fu, const Scalar& ft, 123 | const Scalar& gl, const Scalar& gu, const Scalar& gt) 124 | { 125 | using std::abs; 126 | 127 | if (al == au) 128 | return al; 129 | 130 | // If ft = Inf or gt = Inf, we return the middle point of al and at 131 | if (!std::isfinite(ft) || !std::isfinite(gt)) 132 | return (al + at) / Scalar(2); 133 | 134 | // ac: cubic interpolation of fl, ft, gl, gt 135 | // aq: quadratic interpolation of fl, gl, ft 136 | bool ac_exists; 137 | // std::cout << "al = " << al << ", at = " << at << ", fl = " << fl << ", ft = " << ft << ", gl = " << gl << ", gt = " << gt << std::endl; 138 | const Scalar ac = cubic_minimizer(al, at, fl, ft, gl, gt, ac_exists); 139 | const Scalar aq = quadratic_minimizer(al, at, fl, gl, ft); 140 | // std::cout << "ac = " << ac << ", aq = " << aq << std::endl; 141 | // Case 1: ft > fl 142 | if (ft > fl) 143 | { 144 | // This should not happen if ft > fl, but just to be safe 145 | if (!ac_exists) 146 | return aq; 147 | // Then use the scheme described in the paper 148 | return (abs(ac - al) < abs(aq - al)) ? ac : ((aq + ac) / Scalar(2)); 149 | } 150 | 151 | // as: quadratic interpolation of gl and gt 152 | const Scalar as = quadratic_minimizer(al, at, gl, gt); 153 | // Case 2: ft <= fl, gt * gl < 0 154 | if (gt * gl < Scalar(0)) 155 | return (abs(ac - at) >= abs(as - at)) ? ac : as; 156 | 157 | // Case 3: ft <= fl, gt * gl >= 0, |gt| < |gl| 158 | const Scalar deltal = Scalar(1.1), deltau = Scalar(0.66); 159 | if (abs(gt) < abs(gl)) 160 | { 161 | // We choose either ac or as 162 | // The case for ac: 1. 
It exists, and 163 | // 2. ac is farther than at from al, and 164 | // 3. ac is closer to at than as 165 | // Cases for as: otherwise 166 | const Scalar res = (ac_exists && 167 | (ac - at) * (at - al) > Scalar(0) && 168 | abs(ac - at) < abs(as - at)) ? 169 | ac : 170 | as; 171 | // Postprocessing the chosen step 172 | return (at > al) ? 173 | std::min(at + deltau * (au - at), res) : 174 | std::max(at + deltau * (au - at), res); 175 | } 176 | 177 | // Simple extrapolation if au, fu, or gu is infinity 178 | if ((!std::isfinite(au)) || (!std::isfinite(fu)) || (!std::isfinite(gu))) 179 | return at + deltal * (at - al); 180 | 181 | // ae: cubic interpolation of ft, fu, gt, gu 182 | bool ae_exists; 183 | const Scalar ae = cubic_minimizer(at, au, ft, fu, gt, gu, ae_exists); 184 | // Case 4: ft <= fl, gt * gl >= 0, |gt| >= |gl| 185 | // The following is not used in the paper, but it seems to be a reasonable safeguard 186 | return (at > al) ? 187 | std::min(at + deltau * (au - at), ae) : 188 | std::max(at + deltau * (au - at), ae); 189 | } 190 | 191 | public: 192 | /// 193 | /// Line search by Moré and Thuente (1994). 194 | /// 195 | /// \param f A function object such that `f(x, grad)` returns the 196 | /// objective function value at `x`, and overwrites `grad` with 197 | /// the gradient. 198 | /// \param param An `LBFGSParam` or `LBFGSBParam` object that stores the 199 | /// parameters of the solver. 200 | /// \param xp The current point. 201 | /// \param drt The current moving direction. 202 | /// \param step_max The upper bound for the step size that makes x feasible. 203 | /// \param step In: The initial step length. 204 | /// Out: The calculated step length. 205 | /// \param fx In: The objective function value at the current point. 206 | /// Out: The function value at the new point. 207 | /// \param grad In: The current gradient vector. 208 | /// Out: The gradient at the new point. 209 | /// \param dg In: The inner product between drt and grad. 210 | /// Out: The inner product between drt and the new gradient. 211 | /// \param x Out: The new point moved to. 
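/// A sketch of how a solver typically invokes this routine (hypothetical surrounding code,
/// not part of the original source):
///
///     Scalar step = Scalar(1);             // initial trial step size
///     Scalar fx = f(xp, grad);             // objective value and gradient at xp
///     Scalar dg = grad.dot(drt);           // directional derivative at xp
///     LineSearchMoreThuente<Scalar>::LineSearch(
///         f, param, xp, drt, step_max, step, fx, grad, dg, x);
///     // On return: x = xp + step * drt, fx = f(x), dg = grad.dot(drt)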
212 | /// 213 | template 214 | static void LineSearch(Foo& f, const SolverParam& param, 215 | const Vector& xp, const Vector& drt, const Scalar& step_max, 216 | Scalar& step, Scalar& fx, Vector& grad, Scalar& dg, Vector& x) 217 | { 218 | using std::abs; 219 | // std::cout << "========================= Entering line search =========================\n\n"; 220 | 221 | // Check the value of step 222 | if (step <= Scalar(0)) 223 | throw std::invalid_argument("'step' must be positive"); 224 | if (step > step_max) 225 | throw std::invalid_argument("'step' exceeds 'step_max'"); 226 | 227 | // Save the function value at the current x 228 | const Scalar fx_init = fx; 229 | // Projection of gradient on the search direction 230 | const Scalar dg_init = dg; 231 | 232 | // std::cout << "fx_init = " << fx_init << ", dg_init = " << dg_init << std::endl << std::endl; 233 | 234 | // Make sure d points to a descent direction 235 | if (dg_init >= Scalar(0)) 236 | throw std::logic_error("the moving direction does not decrease the objective function value"); 237 | 238 | // Tolerance for convergence test 239 | // Sufficient decrease 240 | const Scalar test_decr = param.ftol * dg_init; 241 | // Curvature 242 | const Scalar test_curv = -param.wolfe * dg_init; 243 | 244 | // The bracketing interval 245 | Scalar I_lo = Scalar(0), I_hi = std::numeric_limits::infinity(); 246 | Scalar fI_lo = Scalar(0), fI_hi = std::numeric_limits::infinity(); 247 | Scalar gI_lo = (Scalar(1) - param.ftol) * dg_init, gI_hi = std::numeric_limits::infinity(); 248 | // We also need to save x and grad for step=I_lo, since we want to return the best 249 | // step size along the path when strong Wolfe condition is not met 250 | Vector x_lo = xp, grad_lo = grad; 251 | Scalar fx_lo = fx_init, dg_lo = dg_init; 252 | 253 | // Function value and gradient at the current step size 254 | x.noalias() = xp + step * drt; 255 | fx = f(x, grad); 256 | dg = grad.dot(drt); 257 | 258 | // std::cout << "max_step = " << step_max << ", step = " << step << ", fx = " << fx << ", dg = " << dg << std::endl; 259 | 260 | // Convergence test 261 | if (fx <= fx_init + step * test_decr && abs(dg) <= test_curv) 262 | { 263 | // std::cout << "** Criteria met\n\n"; 264 | // std::cout << "========================= Leaving line search =========================\n\n"; 265 | return; 266 | } 267 | 268 | // Extrapolation factor 269 | const Scalar delta = Scalar(1.1); 270 | int iter; 271 | for (iter = 0; iter < param.max_linesearch; iter++) 272 | { 273 | // ft = psi(step) = f(xp + step * drt) - f(xp) - step * test_decr 274 | // gt = psi'(step) = dg - mu * dg_init 275 | // mu = param.ftol 276 | const Scalar ft = fx - fx_init - step * test_decr; 277 | const Scalar gt = dg - param.ftol * dg_init; 278 | 279 | // Update step size and bracketing interval 280 | Scalar new_step; 281 | if (ft > fI_lo) 282 | { 283 | // Case 1: ft > fl 284 | new_step = step_selection(I_lo, I_hi, step, fI_lo, fI_hi, ft, gI_lo, gI_hi, gt); 285 | // Sanity check: if the computed new_step is too small, typically due to 286 | // extremely large value of ft, switch to the middle point 287 | if (new_step <= param.min_step) 288 | new_step = (I_lo + step) / Scalar(2); 289 | 290 | I_hi = step; 291 | fI_hi = ft; 292 | gI_hi = gt; 293 | 294 | // std::cout << "Case 1: new step = " << new_step << std::endl; 295 | } 296 | else if (gt * (I_lo - step) > Scalar(0)) 297 | { 298 | // Case 2: ft <= fl, gt * (al - at) > 0 299 | // 300 | // Page 291 of Moré and Thuente (1994) suggests that 301 | // newat = min(at + delta * (at - 
al), amax), delta in [1.1, 4] 302 | new_step = std::min(step_max, step + delta * (step - I_lo)); 303 | 304 | // We can also consider the following scheme: 305 | // First let step_selection() decide a value, and then project to the range above 306 | // 307 | // new_step = step_selection(I_lo, I_hi, step, fI_lo, fI_hi, ft, gI_lo, gI_hi, gt); 308 | // const Scalar delta2 = Scalar(4) 309 | // const Scalar t1 = step + delta * (step - I_lo); 310 | // const Scalar t2 = step + delta2 * (step - I_lo); 311 | // const Scalar tl = std::min(t1, t2), tu = std::max(t1, t2); 312 | // new_step = std::min(tu, std::max(tl, new_step)); 313 | // new_step = std::min(step_max, new_step); 314 | 315 | I_lo = step; 316 | fI_lo = ft; 317 | gI_lo = gt; 318 | // Move x and grad to x_lo and grad_lo, respectively 319 | x_lo.swap(x); 320 | grad_lo.swap(grad); 321 | fx_lo = fx; 322 | dg_lo = dg; 323 | 324 | // std::cout << "Case 2: new step = " << new_step << std::endl; 325 | } 326 | else 327 | { 328 | // Case 3: ft <= fl, gt * (al - at) <= 0 329 | new_step = step_selection(I_lo, I_hi, step, fI_lo, fI_hi, ft, gI_lo, gI_hi, gt); 330 | 331 | I_hi = I_lo; 332 | fI_hi = fI_lo; 333 | gI_hi = gI_lo; 334 | 335 | I_lo = step; 336 | fI_lo = ft; 337 | gI_lo = gt; 338 | // Move x and grad to x_lo and grad_lo, respectively 339 | x_lo.swap(x); 340 | grad_lo.swap(grad); 341 | fx_lo = fx; 342 | dg_lo = dg; 343 | 344 | // std::cout << "Case 3: new step = " << new_step << std::endl; 345 | } 346 | 347 | // Case 1 and 3 are interpolations, whereas Case 2 is extrapolation 348 | // This means that Case 2 may return new_step = step_max, 349 | // and we need to decide whether to accept this value 350 | // 1. If both step and new_step equal to step_max, it means 351 | // step will have no further change, so we accept it 352 | // 2. 
Otherwise, we need to test the function value and gradient 353 | // on step_max, and decide later 354 | 355 | // In case step, new_step, and step_max are equal, directly return the computed x and fx 356 | if (step == step_max && new_step >= step_max) 357 | { 358 | // std::cout << "** Maximum step size reached\n\n"; 359 | // std::cout << "========================= Leaving line search =========================\n\n"; 360 | 361 | // Move {x, grad}_lo back before returning 362 | x.swap(x_lo); 363 | grad.swap(grad_lo); 364 | return; 365 | } 366 | // Otherwise, recompute x and fx based on new_step 367 | step = new_step; 368 | 369 | if (step < param.min_step) 370 | throw std::runtime_error("the line search step became smaller than the minimum value allowed"); 371 | 372 | if (step > param.max_step) 373 | throw std::runtime_error("the line search step became larger than the maximum value allowed"); 374 | 375 | // Update parameter, function value, and gradient 376 | x.noalias() = xp + step * drt; 377 | fx = f(x, grad); 378 | dg = grad.dot(drt); 379 | 380 | // std::cout << "step = " << step << ", fx = " << fx << ", dg = " << dg << std::endl; 381 | 382 | // Convergence test 383 | if (fx <= fx_init + step * test_decr && abs(dg) <= test_curv) 384 | { 385 | // std::cout << "** Criteria met\n\n"; 386 | // std::cout << "========================= Leaving line search =========================\n\n"; 387 | return; 388 | } 389 | 390 | // Now assume step = step_max, and we need to decide whether to 391 | // exit the line search (see the comments above regarding step_max) 392 | // If we reach here, it means this step size does not pass the convergence 393 | // test, so either the sufficient decrease condition or the curvature 394 | // condition is not met yet 395 | // 396 | // Typically the curvature condition is harder to meet, and it is 397 | // possible that no step size in [0, step_max] satisfies the condition 398 | // 399 | // But we need to make sure that its psi function value is smaller than 400 | // the best one so far. If not, go to the next iteration and find a better one 401 | if (step >= step_max) 402 | { 403 | const Scalar ft_bound = fx - fx_init - step * test_decr; 404 | if (ft_bound <= fI_lo) 405 | { 406 | // std::cout << "** Maximum step size reached\n\n"; 407 | // std::cout << "========================= Leaving line search =========================\n\n"; 408 | return; 409 | } 410 | } 411 | } 412 | 413 | // If we have used up all line search iterations, then the strong Wolfe condition 414 | // is not met. 
We choose not to raise an exception (unless no step satisfying
415 |         // sufficient decrease is found), but to return the best step size so far
416 |         if (iter >= param.max_linesearch)
417 |         {
418 |             // throw std::runtime_error("the line search routine reached the maximum number of iterations");
419 | 
420 |             // First test whether the last step is better than I_lo
421 |             // If yes, return the last step
422 |             const Scalar ft = fx - fx_init - step * test_decr;
423 |             if (ft <= fI_lo)
424 |                 return;
425 | 
426 |             // If not, then the best step size so far is I_lo, but it needs to be positive
427 |             if (I_lo <= Scalar(0))
428 |                 throw std::runtime_error("the line search routine is unable to sufficiently decrease the function value");
429 | 
430 |             // Return everything with _lo
431 |             step = I_lo;
432 |             fx = fx_lo;
433 |             dg = dg_lo;
434 |             // Move {x, grad}_lo back
435 |             x.swap(x_lo);
436 |             grad.swap(grad_lo);
437 |             return;
438 |         }
439 |     }
440 | };
441 | 
442 | }  // namespace LBFGSpp
443 | 
444 | #endif  // LBFGSPP_LINE_SEARCH_MORE_THUENTE_H
445 | 
--------------------------------------------------------------------------------
/include/LBFGSpp/LineSearchNocedalWright.h:
--------------------------------------------------------------------------------
 1 | // Copyright (C) 2016-2025 Yixuan Qiu
 2 | // Copyright (C) 2016-2025 Dirk Toewe
 3 | // Under MIT license
 4 | 
 5 | #ifndef LBFGSPP_LINE_SEARCH_NOCEDAL_WRIGHT_H
 6 | #define LBFGSPP_LINE_SEARCH_NOCEDAL_WRIGHT_H
 7 | 
 8 | #include <Eigen/Core>
 9 | #include <stdexcept>
10 | #include "Param.h"
11 | 
12 | namespace LBFGSpp {
13 | 
14 | ///
15 | /// A line search algorithm for the strong Wolfe condition. Implementation based on:
16 | ///
17 | ///   "Numerical Optimization" 2nd Edition,
18 | ///   Jorge Nocedal and Stephen J. Wright,
19 | ///   Chapter 3. Line Search Methods, page 60.
20 | ///
21 | template <typename Scalar>
22 | class LineSearchNocedalWright
23 | {
24 | private:
25 |     using Vector = Eigen::Matrix<Scalar, Eigen::Dynamic, 1>;
26 | 
27 |     // Use {fx_lo, fx_hi, dg_lo} to make a quadratic interpolation of
28 |     // the function, and the fitted quadratic function is used to
29 |     // estimate the minimum
30 |     static Scalar quad_interp(const Scalar& step_lo, const Scalar& step_hi,
31 |                               const Scalar& fx_lo, const Scalar& fx_hi, const Scalar& dg_lo)
32 |     {
33 |         using std::abs;
34 | 
35 |         // polynomial: p (x) = c0*(x - step)² + c1
36 |         // conditions: p (step_hi) = fx_hi
37 |         //             p (step_lo) = fx_lo
38 |         //             p'(step_lo) = dg_lo
39 | 
40 |         // We allow fx_hi to be Inf, so first compute a candidate for step size,
41 |         // and test whether NaN occurs
42 |         const Scalar fdiff = fx_hi - fx_lo;
43 |         const Scalar sdiff = step_hi - step_lo;
44 |         const Scalar smid = (step_hi + step_lo) / Scalar(2);
45 |         Scalar step_candid = fdiff * step_lo - smid * sdiff * dg_lo;
46 |         step_candid = step_candid / (fdiff - sdiff * dg_lo);
47 | 
48 |         // In some cases the interpolation is not a good choice
49 |         // This includes (a) NaN values; (b) too close to the end points; (c) outside the interval
50 |         // In such cases, a bisection search is used
51 |         const bool candid_nan = !(std::isfinite(step_candid));
52 |         const Scalar end_dist = std::min(abs(step_candid - step_lo), abs(step_candid - step_hi));
53 |         const bool near_end = end_dist < Scalar(0.01) * abs(sdiff);
54 |         const bool bisect = candid_nan ||
55 |             (step_candid <= std::min(step_lo, step_hi)) ||
56 |             (step_candid >= std::max(step_lo, step_hi)) ||
57 |             near_end;
58 |         const Scalar step = bisect ? smid : step_candid;
59 |         return step;
60 |     }
61 | 
62 | public:
63 |     ///
64 |     /// Line search by Nocedal and Wright (2006).
65 |     ///
66 |     /// \param f        A function object such that `f(x, grad)` returns the
67 |     ///                 objective function value at `x`, and overwrites `grad` with
68 |     ///                 the gradient.
69 |     /// \param param    Parameters for the L-BFGS algorithm.
70 |     /// \param xp       The current point.
71 |     /// \param drt      The current moving direction.
72 |     /// \param step_max The upper bound for the step size that makes x feasible.
73 |     ///                 Can be ignored for the L-BFGS solver.
74 |     /// \param step     In: The initial step length.
75 |     ///                 Out: The calculated step length.
76 |     /// \param fx       In: The objective function value at the current point.
77 |     ///                 Out: The function value at the new point.
78 |     /// \param grad     In: The current gradient vector.
79 |     ///                 Out: The gradient at the new point.
80 |     /// \param dg       In: The inner product between drt and grad.
81 |     ///                 Out: The inner product between drt and the new gradient.
82 |     /// \param x        Out: The new point moved to.
83 |     ///
84 |     template <typename Foo>
85 |     static void LineSearch(Foo& f, const LBFGSParam<Scalar>& param,
86 |                            const Vector& xp, const Vector& drt, const Scalar& step_max,
87 |                            Scalar& step, Scalar& fx, Vector& grad, Scalar& dg, Vector& x)
88 |     {
89 |         // Check the value of step
90 |         if (step <= Scalar(0))
91 |             throw std::invalid_argument("'step' must be positive");
92 | 
93 |         if (param.linesearch != LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE)
94 |             throw std::invalid_argument("'param.linesearch' must be 'LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE' for LineSearchNocedalWright");
95 | 
96 |         // To make this implementation more similar to the other line search
97 |         // methods in LBFGSpp, the symbol names from the literature
98 |         // ("Numerical Optimization") have been changed.
99 |         //
100 |         // Literature | LBFGSpp
101 |         // -----------|--------
102 |         // alpha      | step
103 |         // phi        | fx
104 |         // phi'       | dg
105 | 
106 |         // The expansion rate of the step size
107 |         const Scalar expansion = Scalar(2);
108 | 
109 |         // Save the function value at the current x
110 |         const Scalar fx_init = fx;
111 |         // Projection of gradient on the search direction
112 |         const Scalar dg_init = dg;
113 |         // Make sure drt points to a descent direction
114 |         if (dg_init > Scalar(0))
115 |             throw std::logic_error("the moving direction increases the objective function value");
116 | 
117 |         const Scalar test_decr = param.ftol * dg_init,    // Sufficient decrease
118 |                      test_curv = -param.wolfe * dg_init;  // Curvature
119 | 
120 |         // Ends of the line search range (step_lo > step_hi is allowed)
121 |         // We can also define dg_hi, but it will never be used
122 |         Scalar step_hi, fx_hi;
123 |         Scalar step_lo = Scalar(0), fx_lo = fx_init, dg_lo = dg_init;
124 |         // We also need to save x and grad for step=step_lo, since we want to return the best
125 |         // step size along the path when the strong Wolfe condition is not met
126 |         Vector x_lo = xp, grad_lo = grad;
127 | 
128 |         // STEP 1: Bracketing Phase
129 |         // Find a range guaranteed to contain a step satisfying strong Wolfe.
130 |         // The bracketing phase exits if one of the following conditions is satisfied:
131 |         // (1) Current step violates the sufficient decrease condition
132 |         // (2) Current fx >= previous fx
133 |         // (3) Current dg >= 0
134 |         // (4) Strong Wolfe condition is met
135 |         //
136 |         // (4) terminates the whole line search, and (1)-(3) go to the zoom phase
137 |         //
138 |         // See also:
139 |         //   "Numerical Optimization", "Algorithm 3.5 (Line Search Algorithm)".
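        // [Editor's note, not in the original source] To make the two tests
        // below concrete, assume the default parameters defined in Param.h
        // (ftol = 1e-4, wolfe = 0.9). Since dg_init < 0 here, a step size
        // passes the line search when
        //     fx - fx_init <= step * (1e-4 * dg_init)   (sufficient decrease)
        //     |dg| <= 0.9 * |dg_init|                   (curvature)
        // which is exactly what test_decr and test_curv defined above encode.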
140 | int iter = 0; 141 | for (;;) 142 | { 143 | // Evaluate the current step size 144 | x.noalias() = xp + step * drt; 145 | fx = f(x, grad); 146 | dg = grad.dot(drt); 147 | 148 | // Test the sufficient decrease condition 149 | if (fx - fx_init > step * test_decr || (Scalar(0) < step_lo && fx >= fx_lo)) 150 | { 151 | // Case (1) and (2) 152 | step_hi = step; 153 | fx_hi = fx; 154 | // dg_hi = dg; 155 | break; 156 | } 157 | // If reaching here, then the sufficient decrease condition is satisfied 158 | 159 | // Test the curvature condition 160 | if (std::abs(dg) <= test_curv) 161 | return; // Case (4) 162 | 163 | step_hi = step_lo; 164 | fx_hi = fx_lo; 165 | // dg_hi = dg_lo; 166 | step_lo = step; 167 | fx_lo = fx; 168 | dg_lo = dg; 169 | // Move x and grad to x_lo and grad_lo, respectively 170 | x_lo.swap(x); 171 | grad_lo.swap(grad); 172 | 173 | if (dg >= Scalar(0)) 174 | break; // Case (3) 175 | 176 | iter++; 177 | // If we have used up all line search iterations in the bracketing phase, 178 | // it means every new step decreases the objective function. Of course, 179 | // the strong Wolfe condition is not met, but we choose not to raise an 180 | // exception; instead, we return the best step size so far. This means that 181 | // we exit the line search with the most recent step size, which has the 182 | // smallest objective function value during the line search 183 | if (iter >= param.max_linesearch) 184 | { 185 | // throw std::runtime_error("the line search routine reached the maximum number of iterations"); 186 | 187 | // At this point we can guarantee that {step, fx, dg}=={step, fx, dg}_lo 188 | // But we need to move {x, grad}_lo back before returning 189 | x.swap(x_lo); 190 | grad.swap(grad_lo); 191 | return; 192 | } 193 | 194 | // If we still stay in the loop, it means we can expand the current step 195 | step *= expansion; 196 | } 197 | 198 | // STEP 2: Zoom Phase 199 | // Given a range (step_lo,step_hi) that is guaranteed to 200 | // contain a valid strong Wolfe step value, this method 201 | // finds such a value. 202 | // 203 | // If step_lo > 0, then step_lo is, among all step sizes generated so far and 204 | // satisfying the sufficient decrease condition, the one giving the smallest 205 | // objective function value. 206 | // 207 | // See also: 208 | // "Numerical Optimization", "Algorithm 3.6 (Zoom)". 
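        // [Editor's note, not in the original source] Worked derivation of the
        // interpolation step used by quad_interp() in the zoom loop below:
        // fitting p(a) = c0 * (a - s)^2 + c1 to the conditions
        //     p(step_lo) = fx_lo,  p'(step_lo) = dg_lo,  p(step_hi) = fx_hi
        // gives
        //     dg_lo = 2 * c0 * (step_lo - s),
        //     fdiff = fx_hi - fx_lo = 2 * c0 * sdiff * (smid - s).
        // Dividing the two equations eliminates c0, and solving for the vertex s yields
        //     s = (fdiff * step_lo - smid * sdiff * dg_lo) / (fdiff - sdiff * dg_lo),
        // which is the step_candid expression computed in quad_interp().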
209 |         for (;;)
210 |         {
211 |             // Use {fx_lo, fx_hi, dg_lo} to make a quadratic interpolation of
212 |             // the function, and the fitted quadratic function is used to
213 |             // estimate the minimum
214 |             step = quad_interp(step_lo, step_hi, fx_lo, fx_hi, dg_lo);
215 | 
216 |             // Evaluate the current step size
217 |             x.noalias() = xp + step * drt;
218 |             fx = f(x, grad);
219 |             dg = grad.dot(drt);
220 | 
221 |             // Test the sufficient decrease condition
222 |             if (fx - fx_init > step * test_decr || fx >= fx_lo)
223 |             {
224 |                 if (step == step_hi)
225 |                     throw std::runtime_error("the line search routine failed, possibly due to insufficient numeric precision");
226 | 
227 |                 step_hi = step;
228 |                 fx_hi = fx;
229 |                 // dg_hi = dg;
230 |             }
231 |             else
232 |             {
233 |                 // Test the curvature condition
234 |                 if (std::abs(dg) <= test_curv)
235 |                     return;
236 | 
237 |                 if (dg * (step_hi - step_lo) >= Scalar(0))
238 |                 {
239 |                     step_hi = step_lo;
240 |                     fx_hi = fx_lo;
241 |                     // dg_hi = dg_lo;
242 |                 }
243 | 
244 |                 if (step == step_lo)
245 |                     throw std::runtime_error("the line search routine failed, possibly due to insufficient numeric precision");
246 | 
247 |                 // If reaching here, then the current step satisfies the sufficient decrease condition
248 |                 step_lo = step;
249 |                 fx_lo = fx;
250 |                 dg_lo = dg;
251 |                 // Move x and grad to x_lo and grad_lo, respectively
252 |                 x_lo.swap(x);
253 |                 grad_lo.swap(grad);
254 |             }
255 | 
256 |             iter++;
257 |             // If we have used up all line search iterations in the zoom phase,
258 |             // then the strong Wolfe condition is not met. We choose not to raise an
259 |             // exception (unless no step satisfying sufficient decrease is found),
260 |             // but to return the best step size so far, i.e., step_lo
261 |             if (iter >= param.max_linesearch)
262 |             {
263 |                 // throw std::runtime_error("the line search routine reached the maximum number of iterations");
264 |                 if (step_lo <= Scalar(0))
265 |                     throw std::runtime_error("the line search routine failed, unable to sufficiently decrease the function value");
266 | 
267 |                 // Return everything with _lo
268 |                 step = step_lo;
269 |                 fx = fx_lo;
270 |                 dg = dg_lo;
271 |                 // Move {x, grad}_lo back
272 |                 x.swap(x_lo);
273 |                 grad.swap(grad_lo);
274 |                 return;
275 |             }
276 |         }
277 |     }
278 | };
279 | 
280 | }  // namespace LBFGSpp
281 | 
282 | #endif  // LBFGSPP_LINE_SEARCH_NOCEDAL_WRIGHT_H
283 | 
--------------------------------------------------------------------------------
/include/LBFGSpp/Param.h:
--------------------------------------------------------------------------------
 1 | // Copyright (C) 2016-2025 Yixuan Qiu
 2 | // Under MIT license
 3 | 
 4 | #ifndef LBFGSPP_PARAM_H
 5 | #define LBFGSPP_PARAM_H
 6 | 
 7 | #include <Eigen/Core>
 8 | #include <stdexcept>  // std::invalid_argument
 9 | 
10 | namespace LBFGSpp {
11 | 
12 | ///
13 | /// \defgroup Enumerations
14 | ///
15 | /// Enumeration types for line search.
16 | ///
17 | 
18 | ///
19 | /// \ingroup Enumerations
20 | ///
21 | /// The enumeration of line search termination conditions.
22 | ///
23 | enum LINE_SEARCH_TERMINATION_CONDITION
24 | {
25 |     ///
26 |     /// Backtracking method with the Armijo condition.
27 |     /// The backtracking method finds the step length such that it satisfies
28 |     /// the sufficient decrease (Armijo) condition,
29 |     /// \f$f(x + a \cdot d) \le f(x) + \beta' \cdot a \cdot g(x)^T d\f$,
30 |     /// where \f$x\f$ is the current point, \f$d\f$ is the current search direction,
31 |     /// \f$a\f$ is the step length, and \f$\beta'\f$ is the value specified by
32 |     /// \ref LBFGSParam::ftol. \f$f\f$ and \f$g\f$ are the function
33 |     /// and gradient values respectively.
34 |     ///
35 |     LBFGS_LINESEARCH_BACKTRACKING_ARMIJO = 1,
36 | 
37 |     ///
38 |     /// The backtracking method with the default (regular Wolfe) condition.
39 |     /// An alias of `LBFGS_LINESEARCH_BACKTRACKING_WOLFE`.
40 |     ///
41 |     LBFGS_LINESEARCH_BACKTRACKING = 2,
42 | 
43 |     ///
44 |     /// Backtracking method with regular Wolfe condition.
45 |     /// The backtracking method finds the step length such that it satisfies
46 |     /// both the Armijo condition (`LBFGS_LINESEARCH_BACKTRACKING_ARMIJO`)
47 |     /// and the curvature condition,
48 |     /// \f$g(x + a \cdot d)^T d \ge \beta \cdot g(x)^T d\f$, where \f$\beta\f$
49 |     /// is the value specified by \ref LBFGSParam::wolfe.
50 |     ///
51 |     LBFGS_LINESEARCH_BACKTRACKING_WOLFE = 2,
52 | 
53 |     ///
54 |     /// Backtracking method with strong Wolfe condition.
55 |     /// The backtracking method finds the step length such that it satisfies
56 |     /// both the Armijo condition (`LBFGS_LINESEARCH_BACKTRACKING_ARMIJO`)
57 |     /// and the following condition,
58 |     /// \f$\vert g(x + a \cdot d)^T d\vert \le \beta \cdot \vert g(x)^T d\vert\f$,
59 |     /// where \f$\beta\f$ is the value specified by \ref LBFGSParam::wolfe.
60 |     ///
61 |     LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 3
62 | };
63 | 
64 | ///
65 | /// Parameters to control the L-BFGS algorithm.
66 | ///
67 | template <typename Scalar>
68 | class LBFGSParam
69 | {
70 | public:
71 |     ///
72 |     /// The number of corrections to approximate the inverse Hessian matrix.
73 |     /// The L-BFGS routine stores the computation results of previous \ref m
74 |     /// iterations to approximate the inverse Hessian matrix of the current
75 |     /// iteration. This parameter controls the size of the limited memories
76 |     /// (corrections). The default value is \c 6. Values less than \c 3 are
77 |     /// not recommended. Large values will result in excessive computing time.
78 |     ///
79 |     int m;
80 |     ///
81 |     /// Absolute tolerance for convergence test.
82 |     /// This parameter determines the absolute accuracy \f$\epsilon_{abs}\f$
83 |     /// with which the solution is to be found. A minimization terminates when
84 |     /// \f$||g|| < \max\{\epsilon_{abs}, \epsilon_{rel}||x||\}\f$,
85 |     /// where \f$||\cdot||\f$ denotes the Euclidean (L2) norm. The default value is
86 |     /// \c 1e-5.
87 |     ///
88 |     Scalar epsilon;
89 |     ///
90 |     /// Relative tolerance for convergence test.
91 |     /// This parameter determines the relative accuracy \f$\epsilon_{rel}\f$
92 |     /// with which the solution is to be found. A minimization terminates when
93 |     /// \f$||g|| < \max\{\epsilon_{abs}, \epsilon_{rel}||x||\}\f$,
94 |     /// where \f$||\cdot||\f$ denotes the Euclidean (L2) norm. The default value is
95 |     /// \c 1e-5.
96 |     ///
97 |     Scalar epsilon_rel;
98 |     ///
99 |     /// Distance for delta-based convergence test.
100 |     /// This parameter determines the distance \f$d\f$ to compute the
101 |     /// rate of decrease of the objective function,
102 |     /// \f$f_{k-d}(x)-f_k(x)\f$, where \f$k\f$ is the current iteration
103 |     /// step. If the value of this parameter is zero, the delta-based convergence
104 |     /// test will not be performed. The default value is \c 0.
105 |     ///
106 |     int past;
107 |     ///
108 |     /// Delta for convergence test.
109 |     /// The algorithm stops when the following condition is met,
110 |     /// \f$|f_{k-d}(x)-f_k(x)|<\delta\cdot\max(1, |f_k(x)|, |f_{k-d}(x)|)\f$, where \f$f_k(x)\f$ is
111 |     /// the current function value, and \f$f_{k-d}(x)\f$ is the function value
112 |     /// \f$d\f$ iterations ago (specified by the \ref past parameter).
113 |     /// The default value is \c 0.
114 |     ///
115 |     Scalar delta;
116 |     ///
117 |     /// The maximum number of iterations.
118 |     /// The optimization process is terminated when the iteration count
119 |     /// exceeds this parameter. Setting this parameter to zero continues the
120 |     /// optimization process until convergence or an error occurs. The default value
121 |     /// is \c 0.
122 |     ///
123 |     int max_iterations;
124 |     ///
125 |     /// The line search termination condition.
126 |     /// This parameter specifies the line search termination condition that will be used
127 |     /// by the LBFGS routine. The default value is `LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE`.
128 |     ///
129 |     int linesearch;
130 |     ///
131 |     /// The maximum number of trials for the line search.
132 |     /// This parameter controls the number of function and gradient evaluations
133 |     /// per iteration for the line search routine. The default value is \c 20.
134 |     ///
135 |     int max_linesearch;
136 |     ///
137 |     /// The minimum step length allowed in the line search.
138 |     /// The default value is \c 1e-20. Usually this value does not need to be
139 |     /// modified.
140 |     ///
141 |     Scalar min_step;
142 |     ///
143 |     /// The maximum step length allowed in the line search.
144 |     /// The default value is \c 1e+20. Usually this value does not need to be
145 |     /// modified.
146 |     ///
147 |     Scalar max_step;
148 |     ///
149 |     /// A parameter to control the accuracy of the line search routine.
150 |     /// The default value is \c 1e-4. This parameter should be greater
151 |     /// than zero and smaller than \c 0.5.
152 |     ///
153 |     Scalar ftol;
154 |     ///
155 |     /// The coefficient for the Wolfe condition.
156 |     /// This parameter is valid only when the line-search
157 |     /// algorithm is used with the Wolfe condition.
158 |     /// The default value is \c 0.9. This parameter should be greater
159 |     /// than the \ref ftol parameter and smaller than \c 1.0.
160 |     ///
161 |     Scalar wolfe;
162 | 
163 | public:
164 |     ///
165 |     /// Constructor for L-BFGS parameters.
166 |     /// Default values for parameters will be set when the object is created.
167 |     ///
168 |     LBFGSParam()
169 |     {
170 |         // clang-format off
171 |         m              = 6;
172 |         epsilon        = Scalar(1e-5);
173 |         epsilon_rel    = Scalar(1e-5);
174 |         past           = 0;
175 |         delta          = Scalar(0);
176 |         max_iterations = 0;
177 |         linesearch     = LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE;
178 |         max_linesearch = 20;
179 |         min_step       = Scalar(1e-20);
180 |         max_step       = Scalar(1e+20);
181 |         ftol           = Scalar(1e-4);
182 |         wolfe          = Scalar(0.9);
183 |         // clang-format on
184 |     }
185 | 
186 |     ///
187 |     /// Checking the validity of L-BFGS parameters.
188 |     /// An `std::invalid_argument` exception will be thrown if some parameter
189 |     /// is invalid.
190 |     ///
191 |     inline void check_param() const
192 |     {
193 |         if (m <= 0)
194 |             throw std::invalid_argument("'m' must be positive");
195 |         if (epsilon < 0)
196 |             throw std::invalid_argument("'epsilon' must be non-negative");
197 |         if (epsilon_rel < 0)
198 |             throw std::invalid_argument("'epsilon_rel' must be non-negative");
199 |         if (past < 0)
200 |             throw std::invalid_argument("'past' must be non-negative");
201 |         if (delta < 0)
202 |             throw std::invalid_argument("'delta' must be non-negative");
203 |         if (max_iterations < 0)
204 |             throw std::invalid_argument("'max_iterations' must be non-negative");
205 |         if (linesearch < LBFGS_LINESEARCH_BACKTRACKING_ARMIJO ||
206 |             linesearch > LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE)
207 |             throw std::invalid_argument("unsupported line search termination condition");
208 |         if (max_linesearch <= 0)
209 |             throw std::invalid_argument("'max_linesearch' must be positive");
210 |         if (min_step < 0)
211 |             throw std::invalid_argument("'min_step' must be non-negative");
212 |         if (max_step < min_step)
213 |             throw std::invalid_argument("'max_step' must not be less than 'min_step'");
214 |         if (ftol <= 0 || ftol >= 0.5)
215 |             throw std::invalid_argument("'ftol' must satisfy 0 < ftol < 0.5");
216 |         if (wolfe <= ftol || wolfe >= 1)
217 |             throw std::invalid_argument("'wolfe' must satisfy ftol < wolfe < 1");
218 |     }
219 | };
220 | 
221 | ///
222 | /// Parameters to control the L-BFGS-B algorithm.
223 | ///
224 | template <typename Scalar>
225 | class LBFGSBParam
226 | {
227 | public:
228 |     ///
229 |     /// The number of corrections to approximate the inverse Hessian matrix.
230 |     /// The L-BFGS-B routine stores the computation results of previous \ref m
231 |     /// iterations to approximate the inverse Hessian matrix of the current
232 |     /// iteration. This parameter controls the size of the limited memories
233 |     /// (corrections). The default value is \c 6. Values less than \c 3 are
234 |     /// not recommended. Large values will result in excessive computing time.
235 |     ///
236 |     int m;
237 |     ///
238 |     /// Absolute tolerance for convergence test.
239 |     /// This parameter determines the absolute accuracy \f$\epsilon_{abs}\f$
240 |     /// with which the solution is to be found. A minimization terminates when
241 |     /// \f$||Pg||_{\infty} < \max\{\epsilon_{abs}, \epsilon_{rel}||x||\}\f$,
242 |     /// where \f$||x||\f$ denotes the Euclidean (L2) norm of \f$x\f$, and
243 |     /// \f$Pg=P(x-g,l,u)-x\f$ is the projected gradient. The default value is
244 |     /// \c 1e-5.
245 |     ///
246 |     Scalar epsilon;
247 |     ///
248 |     /// Relative tolerance for convergence test.
249 |     /// This parameter determines the relative accuracy \f$\epsilon_{rel}\f$
250 |     /// with which the solution is to be found. A minimization terminates when
251 |     /// \f$||Pg||_{\infty} < \max\{\epsilon_{abs}, \epsilon_{rel}||x||\}\f$,
252 |     /// where \f$||x||\f$ denotes the Euclidean (L2) norm of \f$x\f$, and
253 |     /// \f$Pg=P(x-g,l,u)-x\f$ is the projected gradient. The default value is
254 |     /// \c 1e-5.
255 |     ///
256 |     Scalar epsilon_rel;
257 |     ///
258 |     /// Distance for delta-based convergence test.
259 |     /// This parameter determines the distance \f$d\f$ to compute the
260 |     /// rate of decrease of the objective function,
261 |     /// \f$f_{k-d}(x)-f_k(x)\f$, where \f$k\f$ is the current iteration
262 |     /// step. If the value of this parameter is zero, the delta-based convergence
263 |     /// test will not be performed. The default value is \c 1.
264 |     ///
265 |     int past;
266 |     ///
267 |     /// Delta for convergence test.
268 |     /// The algorithm stops when the following condition is met,
269 |     /// \f$|f_{k-d}(x)-f_k(x)|<\delta\cdot\max(1, |f_k(x)|, |f_{k-d}(x)|)\f$, where \f$f_k(x)\f$ is
270 |     /// the current function value, and \f$f_{k-d}(x)\f$ is the function value
271 |     /// \f$d\f$ iterations ago (specified by the \ref past parameter).
272 |     /// The default value is \c 1e-10.
273 |     ///
274 |     Scalar delta;
275 |     ///
276 |     /// The maximum number of iterations.
277 |     /// The optimization process is terminated when the iteration count
278 |     /// exceeds this parameter. Setting this parameter to zero continues the
279 |     /// optimization process until convergence or an error occurs. The default value
280 |     /// is \c 0.
281 |     ///
282 |     int max_iterations;
283 |     ///
284 |     /// The maximum number of iterations in the subspace minimization.
285 |     /// This parameter controls the number of iterations in the subspace
286 |     /// minimization routine. The default value is \c 10.
287 |     ///
288 |     int max_submin;
289 |     ///
290 |     /// The maximum number of trials for the line search.
291 |     /// This parameter controls the number of function and gradient evaluations
292 |     /// per iteration for the line search routine. The default value is \c 20.
293 |     ///
294 |     int max_linesearch;
295 |     ///
296 |     /// The minimum step length allowed in the line search.
297 |     /// The default value is \c 1e-20. Usually this value does not need to be
298 |     /// modified.
299 |     ///
300 |     Scalar min_step;
301 |     ///
302 |     /// The maximum step length allowed in the line search.
303 |     /// The default value is \c 1e+20. Usually this value does not need to be
304 |     /// modified.
305 |     ///
306 |     Scalar max_step;
307 |     ///
308 |     /// A parameter to control the accuracy of the line search routine.
309 |     /// The default value is \c 1e-4. This parameter should be greater
310 |     /// than zero and smaller than \c 0.5.
311 |     ///
312 |     Scalar ftol;
313 |     ///
314 |     /// The coefficient for the Wolfe condition.
315 |     /// This parameter is valid only when the line-search
316 |     /// algorithm is used with the Wolfe condition.
317 |     /// The default value is \c 0.9. This parameter should be greater
318 |     /// than the \ref ftol parameter and smaller than \c 1.0.
319 |     ///
320 |     Scalar wolfe;
321 | 
322 | public:
323 |     ///
324 |     /// Constructor for L-BFGS-B parameters.
325 |     /// Default values for parameters will be set when the object is created.
326 |     ///
327 |     LBFGSBParam()
328 |     {
329 |         // clang-format off
330 |         m              = 6;
331 |         epsilon        = Scalar(1e-5);
332 |         epsilon_rel    = Scalar(1e-5);
333 |         past           = 1;
334 |         delta          = Scalar(1e-10);
335 |         max_iterations = 0;
336 |         max_submin     = 10;
337 |         max_linesearch = 20;
338 |         min_step       = Scalar(1e-20);
339 |         max_step       = Scalar(1e+20);
340 |         ftol           = Scalar(1e-4);
341 |         wolfe          = Scalar(0.9);
342 |         // clang-format on
343 |     }
344 | 
345 |     ///
346 |     /// Checking the validity of L-BFGS-B parameters.
347 |     /// An `std::invalid_argument` exception will be thrown if some parameter
348 |     /// is invalid.
349 |     ///
350 |     inline void check_param() const
351 |     {
352 |         if (m <= 0)
353 |             throw std::invalid_argument("'m' must be positive");
354 |         if (epsilon < 0)
355 |             throw std::invalid_argument("'epsilon' must be non-negative");
356 |         if (epsilon_rel < 0)
357 |             throw std::invalid_argument("'epsilon_rel' must be non-negative");
358 |         if (past < 0)
359 |             throw std::invalid_argument("'past' must be non-negative");
360 |         if (delta < 0)
361 |             throw std::invalid_argument("'delta' must be non-negative");
362 |         if (max_iterations < 0)
363 |             throw std::invalid_argument("'max_iterations' must be non-negative");
364 |         if (max_submin < 0)
365 |             throw std::invalid_argument("'max_submin' must be non-negative");
366 |         if (max_linesearch <= 0)
367 |             throw std::invalid_argument("'max_linesearch' must be positive");
368 |         if (min_step < 0)
369 |             throw std::invalid_argument("'min_step' must be non-negative");
370 |         if (max_step < min_step)
371 |             throw std::invalid_argument("'max_step' must not be less than 'min_step'");
372 |         if (ftol <= 0 || ftol >= 0.5)
373 |             throw std::invalid_argument("'ftol' must satisfy 0 < ftol < 0.5");
374 |         if (wolfe <= ftol || wolfe >= 1)
375 |             throw std::invalid_argument("'wolfe' must satisfy ftol < wolfe < 1");
376 |     }
377 | };
378 | 
379 | }  // namespace LBFGSpp
380 | 
381 | #endif  // LBFGSPP_PARAM_H
382 | 
--------------------------------------------------------------------------------
/include/LBFGSpp/SubspaceMin.h:
--------------------------------------------------------------------------------
 1 | // Copyright (C) 2020-2025 Yixuan Qiu
 2 | // Under MIT license
 3 | 
 4 | #ifndef LBFGSPP_SUBSPACE_MIN_H
 5 | #define LBFGSPP_SUBSPACE_MIN_H
 6 | 
 7 | #include <stdexcept>
 8 | #include <vector>
 9 | #include <Eigen/Core>
10 | #include "BFGSMat.h"
11 | 
12 | /// \cond
13 | 
14 | namespace LBFGSpp {
15 | 
16 | //
17 | // Subspace minimization procedure of the L-BFGS-B algorithm,
18 | // mainly for internal use.
19 | //
20 | // The target of subspace minimization is to minimize the quadratic function m(x)
21 | // over the free variables, subject to the bound condition.
22 | // Free variables stand for coordinates that are not at the boundary in xcp,
23 | // the generalized Cauchy point.
24 | //
25 | // In the classical implementation of L-BFGS-B [1], the minimization is done by first
26 | // ignoring the box constraints, followed by a line search. Our implementation is
27 | // an exact minimization subject to the bounds, based on the BOXCQP algorithm [2].
28 | //
29 | // Reference:
30 | // [1] R. H. Byrd, P. Lu, and J. Nocedal (1995). A limited memory algorithm for bound constrained optimization.
31 | // [2] C. Voglis and I. E. Lagaris (2004). BOXCQP: An algorithm for bound constrained convex quadratic problems.
32 | //
33 | template <typename Scalar>
34 | class SubspaceMin
35 | {
36 | private:
37 |     using Vector = Eigen::Matrix<Scalar, Eigen::Dynamic, 1>;
38 |     using Matrix = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>;
39 |     using IndexSet = std::vector<int>;
40 | 
41 |     // v[ind]
42 |     static Vector subvec(const Vector& v, const IndexSet& ind)
43 |     {
44 |         const int nsub = ind.size();
45 |         Vector res(nsub);
46 |         for (int i = 0; i < nsub; i++)
47 |             res[i] = v[ind[i]];
48 |         return res;
49 |     }
50 | 
51 |     // v[ind] = rhs
52 |     static void subvec_assign(Vector& v, const IndexSet& ind, const Vector& rhs)
53 |     {
54 |         const int nsub = ind.size();
55 |         for (int i = 0; i < nsub; i++)
56 |             v[ind[i]] = rhs[i];
57 |     }
58 | 
59 |     // Check whether the vector is within the bounds
60 |     static bool in_bounds(const Vector& x, const Vector& lb, const Vector& ub)
61 |     {
62 |         const int n = x.size();
63 |         for (int i = 0; i < n; i++)
64 |         {
65 |             if (x[i] < lb[i] || x[i] > ub[i])
66 |                 return false;
67 |         }
68 |         return true;
69 |     }
70 | 
71 |     // Test convergence of P set
72 |     static bool P_converged(const IndexSet& yP_set, const Vector& vecy, const Vector& vecl, const Vector& vecu)
73 |     {
74 |         const int nP = yP_set.size();
75 |         for (int i = 0; i < nP; i++)
76 |         {
77 |             const int coord = yP_set[i];
78 |             if (vecy[coord] < vecl[coord] || vecy[coord] > vecu[coord])
79 |                 return false;
80 |         }
81 |         return true;
82 |     }
83 | 
84 |     // Test convergence of L set
85 |     static bool L_converged(const IndexSet& yL_set, const Vector& lambda)
86 |     {
87 |         const int nL = yL_set.size();
88 |         for (int i = 0; i < nL; i++)
89 |         {
90 |             const int coord = yL_set[i];
91 |             if (lambda[coord] < Scalar(0))
92 |                 return false;
93 |         }
94 |         return true;
95 |     }
96 | 
97 |     // Test convergence of U set
98 |     static bool U_converged(const IndexSet& yU_set, const Vector& mu)
99 |     {
100 |         const int nU = yU_set.size();
101 |         for (int i = 0; i < nU; i++)
102 |         {
103 |             const int coord = yU_set[i];
104 |             if (mu[coord] < Scalar(0))
105 |                 return false;
106 |         }
107 |         return true;
108 |     }
109 | 
110 | public:
111 |     // bfgs:       An object that represents the BFGS approximation matrix.
112 |     // x0:         Current parameter vector.
113 |     // xcp:        Computed generalized Cauchy point.
114 |     // g:          Gradient at x0.
115 |     // lb:         Lower bounds for x.
116 |     // ub:         Upper bounds for x.
117 |     // Wd:         W'(xcp - x0)
118 |     // newact_set: Coordinates that newly become active during the GCP procedure.
119 |     // fv_set:     Free variable set.
120 |     // maxit:      Maximum number of iterations.
121 |     // drt:        The output direction vector, drt = xsm - x0.
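    // [Editor's note, not in the original source] Sketch of the BOXCQP
    // iteration implemented below. The subproblem is min_y 0.5*y'By + c'y
    // subject to l <= y <= u, whose KKT conditions read
    //     B*y + c - lambda + mu = 0,    lambda >= 0,    mu >= 0,
    //     lambda[i] > 0 only if y[i] = l[i],    mu[i] > 0 only if y[i] = u[i].
    // Each iteration guesses the active sets L (y at the lower bound), U (y at
    // the upper bound), and P (strictly interior), solves the equality part
    // for y[P], lambda[L], and mu[U], and stops once the guess is
    // self-consistent, which is what L_converged(), U_converged(), and
    // P_converged() above check.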
122 |     static void subspace_minimize(
123 |         const BFGSMat<Scalar>& bfgs, const Vector& x0, const Vector& xcp, const Vector& g,
124 |         const Vector& lb, const Vector& ub, const Vector& Wd, const IndexSet& newact_set, const IndexSet& fv_set, int maxit,
125 |         Vector& drt)
126 |     {
127 |         // std::cout << "========================= Entering subspace minimization =========================\n\n";
128 | 
129 |         // d = xcp - x0
130 |         drt.noalias() = xcp - x0;
131 |         // Size of free variables
132 |         const int nfree = fv_set.size();
133 |         // If there is no free variable, simply return drt
134 |         if (nfree < 1)
135 |         {
136 |             // std::cout << "========================= (Early) leaving subspace minimization =========================\n\n";
137 |             return;
138 |         }
139 | 
140 |         // std::cout << "New active set = [ "; for(std::size_t i = 0; i < newact_set.size(); i++) std::cout << newact_set[i] << " "; std::cout << "]\n";
141 |         // std::cout << "Free variable set = [ "; for(std::size_t i = 0; i < fv_set.size(); i++) std::cout << fv_set[i] << " "; std::cout << "]\n\n";
142 | 
143 |         // Extract the rows of W in the free variable set
144 |         Matrix WF = bfgs.Wb(fv_set);
145 |         // Compute F'BAb = -F'WMW'AA'd
146 |         Vector vecc(nfree);
147 |         bfgs.compute_FtBAb(WF, fv_set, newact_set, Wd, drt, vecc);
148 |         // Set the vector c=F'BAb+F'g for linear term, and vectors l and u for the new bounds
149 |         Vector vecl(nfree), vecu(nfree);
150 |         for (int i = 0; i < nfree; i++)
151 |         {
152 |             const int coord = fv_set[i];
153 |             vecl[i] = lb[coord] - x0[coord];
154 |             vecu[i] = ub[coord] - x0[coord];
155 |             vecc[i] += g[coord];
156 |         }
157 |         // Solve y = -inv(B[F, F]) * c
158 |         Vector vecy(nfree);
159 |         bfgs.solve_PtBP(WF, -vecc, vecy);
160 |         // Test feasibility
161 |         // If yes, then the solution has been found
162 |         if (in_bounds(vecy, vecl, vecu))
163 |         {
164 |             subvec_assign(drt, fv_set, vecy);
165 |             return;
166 |         }
167 |         // Otherwise, enter the iterations
168 | 
169 |         // Make a copy of y as a fallback solution
170 |         Vector yfallback = vecy;
171 |         // Dual variables
172 |         Vector lambda = Vector::Zero(nfree), mu = Vector::Zero(nfree);
173 | 
174 |         // Iterations
175 |         IndexSet L_set, U_set, P_set, yL_set, yU_set, yP_set;
176 |         L_set.reserve(nfree / 3);
177 |         yL_set.reserve(nfree / 3);
178 |         U_set.reserve(nfree / 3);
179 |         yU_set.reserve(nfree / 3);
180 |         P_set.reserve(nfree);
181 |         yP_set.reserve(nfree);
182 |         int k;
183 |         for (k = 0; k < maxit; k++)
184 |         {
185 |             // Construct the L, U, and P sets, and then update values
186 |             // Indices in original drt vector
187 |             L_set.clear();
188 |             U_set.clear();
189 |             P_set.clear();
190 |             // Indices in y
191 |             yL_set.clear();
192 |             yU_set.clear();
193 |             yP_set.clear();
194 |             for (int i = 0; i < nfree; i++)
195 |             {
196 |                 const int coord = fv_set[i];
197 |                 const Scalar li = vecl[i], ui = vecu[i];
198 |                 if ((vecy[i] < li) || (vecy[i] == li && lambda[i] >= Scalar(0)))
199 |                 {
200 |                     L_set.push_back(coord);
201 |                     yL_set.push_back(i);
202 |                     vecy[i] = li;
203 |                     mu[i] = Scalar(0);
204 |                 }
205 |                 else if ((vecy[i] > ui) || (vecy[i] == ui && mu[i] >= Scalar(0)))
206 |                 {
207 |                     U_set.push_back(coord);
208 |                     yU_set.push_back(i);
209 |                     vecy[i] = ui;
210 |                     lambda[i] = Scalar(0);
211 |                 }
212 |                 else
213 |                 {
214 |                     P_set.push_back(coord);
215 |                     yP_set.push_back(i);
216 |                     lambda[i] = Scalar(0);
217 |                     mu[i] = Scalar(0);
218 |                 }
219 |             }
220 | 
221 |             /* std::cout << "** Iter " << k << " **\n";
222 |                std::cout << "   L = [ "; for(std::size_t i = 0; i < L_set.size(); i++) std::cout << L_set[i] << " "; std::cout << "]\n";
223 |                std::cout << "   U = [ "; for(std::size_t i = 0; i < U_set.size(); i++) std::cout << U_set[i] << " "; std::cout << "]\n";
224 |                std::cout << "   P = [ "; for(std::size_t i = 0; i < P_set.size(); i++) std::cout << P_set[i] << " "; std::cout << "]\n\n"; */
225 | 
226 |             // Extract the rows of W in the P set
227 |             Matrix WP = bfgs.Wb(P_set);
228 |             // Solve y[P] = -inv(B[P, P]) * (B[P, L] * l[L] + B[P, U] * u[U] + c[P])
229 |             const int nP = P_set.size();
230 |             if (nP > 0)
231 |             {
232 |                 Vector rhs = subvec(vecc, yP_set);
233 |                 Vector lL = subvec(vecl, yL_set);
234 |                 Vector uU = subvec(vecu, yU_set);
235 |                 Vector tmp(nP);
236 |                 bool nonzero = bfgs.apply_PtBQv(WP, L_set, lL, tmp, true);
237 |                 if (nonzero)
238 |                     rhs.noalias() += tmp;
239 |                 nonzero = bfgs.apply_PtBQv(WP, U_set, uU, tmp, true);
240 |                 if (nonzero)
241 |                     rhs.noalias() += tmp;
242 | 
243 |                 bfgs.solve_PtBP(WP, -rhs, tmp);
244 |                 subvec_assign(vecy, yP_set, tmp);
245 |             }
246 | 
247 |             // Solve lambda[L] = B[L, F] * y + c[L]
248 |             const int nL = L_set.size();
249 |             const int nU = U_set.size();
250 |             Vector Fy;
251 |             if (nL > 0 || nU > 0)
252 |                 bfgs.apply_WtPv(fv_set, vecy, Fy);
253 |             if (nL > 0)
254 |             {
255 |                 Vector res;
256 |                 bfgs.apply_PtWMv(L_set, Fy, res, Scalar(-1));
257 |                 res.noalias() += subvec(vecc, yL_set) + bfgs.theta() * subvec(vecy, yL_set);
258 |                 subvec_assign(lambda, yL_set, res);
259 |             }
260 | 
261 |             // Solve mu[U] = -B[U, F] * y - c[U]
262 |             if (nU > 0)
263 |             {
264 |                 Vector negRes;
265 |                 bfgs.apply_PtWMv(U_set, Fy, negRes, Scalar(-1));
266 |                 negRes.noalias() += subvec(vecc, yU_set) + bfgs.theta() * subvec(vecy, yU_set);
267 |                 subvec_assign(mu, yU_set, -negRes);
268 |             }
269 | 
270 |             // Test convergence
271 |             if (L_converged(yL_set, lambda) && U_converged(yU_set, mu) && P_converged(yP_set, vecy, vecl, vecu))
272 |                 break;
273 |         }
274 | 
275 |         // If the iterations do not converge, try the projection
276 |         if (k >= maxit)
277 |         {
278 |             vecy.noalias() = vecy.cwiseMax(vecl).cwiseMin(vecu);
279 |             subvec_assign(drt, fv_set, vecy);
280 |             // Test whether drt is a descent direction
281 |             Scalar dg = drt.dot(g);
282 |             // If yes, return the result
283 |             if (dg <= -std::numeric_limits<Scalar>::epsilon())
284 |                 return;
285 | 
286 |             // If not, fall back to the projected unconstrained solution
287 |             vecy.noalias() = yfallback.cwiseMax(vecl).cwiseMin(vecu);
288 |             subvec_assign(drt, fv_set, vecy);
289 |             dg = drt.dot(g);
290 |             if (dg <= -std::numeric_limits<Scalar>::epsilon())
291 |                 return;
292 | 
293 |             // If still not, fall back to the unconstrained solution
294 |             subvec_assign(drt, fv_set, yfallback);
295 |             return;
296 |         }
297 | 
298 |         // std::cout << "** Minimization finished in " << k + 1 << " iteration(s) **\n\n";
299 |         // std::cout << "========================= Leaving subspace minimization =========================\n\n";
300 | 
301 |         subvec_assign(drt, fv_set, vecy);
302 |     }
303 | };
304 | 
305 | }  // namespace LBFGSpp
306 | 
307 | /// \endcond
308 | 
309 | #endif  // LBFGSPP_SUBSPACE_MIN_H
310 | 
--------------------------------------------------------------------------------
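Editor's addition: the parameter classes in Param.h are self-contained, so their use can be illustrated without the solver headers. The sketch below is not part of the repository; it exercises only the LBFGSParam interface shown above, and the build command and include paths are assumptions about a typical setup (Eigen must be on the include path, since Param.h includes <Eigen/Core>).

// build (assumed layout): g++ -I include -I /path/to/eigen example.cpp
#include <iostream>
#include <stdexcept>
#include <LBFGSpp/Param.h>

int main()
{
    LBFGSpp::LBFGSParam<double> param;   // defaults: m = 6, ftol = 1e-4, wolfe = 0.9, ...
    param.m = 10;                        // keep more correction pairs
    param.epsilon = 1e-6;                // tighter absolute gradient tolerance
    param.max_iterations = 100;          // 0 would mean "run until convergence or error"
    param.linesearch = LBFGSpp::LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE;

    try
    {
        param.check_param();             // throws std::invalid_argument on bad settings
        std::cout << "parameters are valid\n";
    }
    catch (const std::invalid_argument& e)
    {
        std::cout << "invalid parameter: " << e.what() << '\n';
    }
    return 0;
}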