├── .clang-format
├── .gitignore
├── AUTHORS.md
├── CHANGELOG.md
├── CMakeLists.txt
├── LICENSE.md
├── README.md
├── config.cmake.in
├── doxygen
│   └── Doxyfile
├── examples
│   ├── CMakeLists.txt
│   ├── example-quadratic.cpp
│   ├── example-rosenbrock-box.cpp
│   ├── example-rosenbrock-bracketing.cpp
│   ├── example-rosenbrock-comparison.cpp
│   └── example-rosenbrock.cpp
└── include
    ├── LBFGS.h
    ├── LBFGSB.h
    └── LBFGSpp
        ├── BFGSMat.h
        ├── BKLDLT.h
        ├── Cauchy.h
        ├── LineSearchBacktracking.h
        ├── LineSearchBracketing.h
        ├── LineSearchMoreThuente.h
        ├── LineSearchNocedalWright.h
        ├── Param.h
        └── SubspaceMin.h

--------------------------------------------------------------------------------
/.clang-format:
--------------------------------------------------------------------------------
---
Language: Cpp
# BasedOnStyle: WebKit
AccessModifierOffset: -4
AlignAfterOpenBracket: Align
AlignArrayOfStructures: None
AlignConsecutiveMacros: false
AlignConsecutiveAssignments: false
AlignConsecutiveBitFields: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlines: Left
AlignOperands: false
AlignTrailingComments: true
AllowAllArgumentsOnNextLine: true
AllowAllConstructorInitializersOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortEnumsOnASingleLine: true
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: All
AllowShortLambdasOnASingleLine: All
AllowShortIfStatementsOnASingleLine: Never
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: false
AlwaysBreakTemplateDeclarations: MultiLine
AttributeMacros:
  - __capability
BinPackArguments: true
BinPackParameters: true
BraceWrapping:
  AfterCaseLabel: true
  AfterClass: true
  AfterControlStatement: true
  AfterEnum: true
  AfterFunction: true
  AfterNamespace: false
  AfterObjCDeclaration: false
  AfterStruct: true
  AfterUnion: true
  AfterExternBlock: false
  BeforeCatch: true
  BeforeElse: true
  BeforeLambdaBody: false
  BeforeWhile: false
  IndentBraces: false
  SplitEmptyFunction: false
  SplitEmptyRecord: false
  SplitEmptyNamespace: false
BreakBeforeBinaryOperators: None
BreakBeforeConceptDeclarations: true
BreakBeforeBraces: Custom
BreakBeforeInheritanceComma: false
BreakInheritanceList: AfterColon
BreakBeforeTernaryOperators: false
BreakConstructorInitializersBeforeComma: false
BreakConstructorInitializers: AfterColon
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: true
ColumnLimit: 0
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DeriveLineEnding: true
DerivePointerAlignment: true
DisableFormat: false
EmptyLineAfterAccessModifier: Never
EmptyLineBeforeAccessModifier: LogicalBlock
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
ForEachMacros:
  - foreach
  - Q_FOREACH
  - BOOST_FOREACH
IfMacros:
  - KJ_IF_MAYBE
IncludeBlocks: Preserve
IncludeCategories:
  - Regex: '^"(llvm|llvm-c|clang|clang-c)/'
    Priority: 2
    SortPriority: 0
    CaseSensitive: false
  - Regex: '^(<|"(gtest|gmock|isl|json)/)'
    Priority: 3
    SortPriority: 0
    CaseSensitive: false
  - Regex: '.*'
    Priority: 1
    SortPriority: 0
    CaseSensitive: false
IncludeIsMainRegex: '(Test)?$'
IncludeIsMainSourceRegex: ''
IndentAccessModifiers: false
IndentCaseLabels: false
IndentCaseBlocks: false
IndentGotoLabels: true
IndentPPDirectives: None
IndentExternBlock: AfterExternBlock
IndentRequires: false
IndentWidth: 4
IndentWrappedFunctionNames: false
InsertTrailingCommas: None
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
LambdaBodyIndentation: Signature
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: Inner
ObjCBinPackProtocolList: Auto
ObjCBlockIndentWidth: 4
ObjCBreakBeforeNestedBlockParam: true
ObjCSpaceAfterProperty: true
ObjCSpaceBeforeProtocolList: true
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 19
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 60
PenaltyIndentedWhitespace: 0
PointerAlignment: Left
PPIndentWidth: -1
ReferenceAlignment: Pointer
ReflowComments: true
ShortNamespaceLines: 1
SortIncludes: false
SortJavaStaticImport: Before
SortUsingDeclarations: false
SpaceAfterCStyleCast: true
SpaceAfterLogicalNot: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCaseColon: false
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceAroundPointerQualifiers: Default
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyBlock: false
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: Never
SpacesInConditionalStatement: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInLineCommentPrefix:
  Minimum: 1
  Maximum: -1
SpacesInParentheses: false
SpacesInSquareBrackets: false
SpaceBeforeSquareBrackets: false
BitFieldColonSpacing: Both
Standard: c++03
StatementAttributeLikeMacros:
  - Q_EMIT
StatementMacros:
  - Q_UNUSED
  - QT_REQUIRE_VERSION
TabWidth: 8
UseCRLF: false
UseTab: Never
WhitespaceSensitiveMacros:
  - STRINGIZE
  - PP_STRINGIZE
  - BOOST_PP_STRINGIZE
  - NS_SWIFT_NAME
  - CF_SWIFT_NAME
...

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.o
*.out
include/Eigen/*
archive/*
issues/*
.settings/*
.project
.cproject
/Debug/
/Release/

--------------------------------------------------------------------------------
/AUTHORS.md:
--------------------------------------------------------------------------------
The LBFGS++ library was adapted from the libLBFGS library
(https://github.com/chokkan/liblbfgs), written by
Naoaki Okazaki <>.

The files

- `include/LBFGSpp/LineSearchBracketing.h`
- `include/LBFGSpp/LineSearchNocedalWright.h`

were contributed by Dirk Toewe <>.

Other parts of LBFGS++ were written by Yixuan Qiu <>.

--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
## [0.4.0] - 2025-04-20

### Added

- Added functions `final_approx_hessian()` and `final_approx_inverse_hessian()` to `LBFGSSolver`
  to retrieve the final approximate Hessian information
  ([#42](https://github.com/yixuan/LBFGSpp/issues/42), [#43](https://github.com/yixuan/LBFGSpp/issues/43))
- Added CMake scripts to build examples ([#38](https://github.com/yixuan/LBFGSpp/pull/38)),
  contributed by [@pjknowles](https://github.com/pjknowles)



## [0.3.0] - 2023-09-06

### Added

- Added functions `final_grad()` and `final_grad_norm()` to `LBFGSSolver`
  and `LBFGSBSolver` to retrieve the final gradient information
  ([#12](https://github.com/yixuan/LBFGSpp/issues/12))

### Changed

- `LBFGS++` now requires C++11
- The line search classes now have a unified API for both `LBFGSSolver` and `LBFGSBSolver`
- The Moré-Thuente line search algorithm `LineSearchMoreThuente` can now also be used
  in the L-BFGS solver `LBFGSSolver`
- Improved the numerical stability of `LineSearchNocedalWright`
  ([#27](https://github.com/yixuan/LBFGSpp/issues/27))
- Removed the unused variable `dg_hi` in `LineSearchNocedalWright`
  ([#35](https://github.com/yixuan/LBFGSpp/issues/35))
- Fixed some compiler warnings regarding shadowed variables
  ([#36](https://github.com/yixuan/LBFGSpp/issues/36))



## [0.2.0] - 2022-05-20

### Added

- Added a CMake script for installation ([#24](https://github.com/yixuan/LBFGSpp/pull/24)),
  contributed by [@steinmig](https://github.com/steinmig)

### Changed

- The default line search method for `LBFGSSolver` has been changed from `LineSearchBacktracking`
  to `LineSearchNocedalWright`, per the suggestion of [@mpayrits](https://github.com/mpayrits)
  ([#25](https://github.com/yixuan/LBFGSpp/pull/25))
- Fixed a few critical issues ([#9](https://github.com/yixuan/LBFGSpp/issues/9),
  [#15](https://github.com/yixuan/LBFGSpp/issues/15),
  [#21](https://github.com/yixuan/LBFGSpp/issues/21)), with big thanks to
  [@mpayrits](https://github.com/mpayrits) ([#25](https://github.com/yixuan/LBFGSpp/pull/25))
- Fixed one inconsistency with Moré and Thuente (1994) in the `LineSearchMoreThuente`
  line search algorithm, pointed out by [@mpayrits](https://github.com/mpayrits)
  ([#23](https://github.com/yixuan/LBFGSpp/issues/23))
- The source code is now formatted using [Clang-Format](https://clang.llvm.org/docs/ClangFormat.html)



## [0.1.0] - 2021-08-19

### Added

- Initial Github release

--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
cmake_minimum_required(VERSION 3.10)
project(lbfgspp VERSION 0.4.0 LANGUAGES CXX)

# + ----------------- +
# | BUILDING SETTINGS |
# + ----------------- +

if(NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE Release)
endif(NOT CMAKE_BUILD_TYPE)

# + ----------------- +
# | COMPILATION FLAGS |
# + ----------------- +

include(CheckCXXCompilerFlag)
check_cxx_compiler_flag(-Wall WALL_SUPPORTED)
if(${WALL_SUPPORTED})
    add_compile_options(-Wall)
endif()

# + --------------- +
# | LBFGSpp LIBRARY |
# + --------------- +

add_library(lbfgspp INTERFACE)

# + -------- +
# | INCLUDES |
# + -------- +

target_include_directories(lbfgspp INTERFACE
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
    $<INSTALL_INTERFACE:include>
)

# + ----------------------- +
# | FIND EXTERNAL LIBRARIES |
# + ----------------------- +

find_package(Eigen3 3.0 REQUIRED)
target_link_libraries(lbfgspp INTERFACE Eigen3::Eigen)
message("-- Eigen3 version: " ${EIGEN3_VERSION_STRING})

# + ------------ +
# | INSTALLATION |
# + ------------ +

# Copy headers folder
install(
    DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/
    DESTINATION include
)
# Create an export set
install(TARGETS lbfgspp EXPORT lbfgsppTargets)

include(CMakePackageConfigHelpers)
# Version file
write_basic_package_version_file(
    ${CMAKE_CURRENT_BINARY_DIR}/lbfgspp-config-version.cmake
    VERSION ${PROJECT_VERSION}
    COMPATIBILITY AnyNewerVersion
)
# Config file
configure_package_config_file(
    ${CMAKE_CURRENT_SOURCE_DIR}/config.cmake.in
    ${CMAKE_CURRENT_BINARY_DIR}/lbfgspp-config.cmake
    INSTALL_DESTINATION lib/cmake/lbfgspp
)
# Targets files
export(
    EXPORT lbfgsppTargets
    FILE ${CMAKE_CURRENT_BINARY_DIR}/lbfgspp-targets.cmake
)
install(
    EXPORT lbfgsppTargets
    FILE lbfgspp-targets.cmake
    DESTINATION lib/cmake/lbfgspp
)
install(
    FILES
        ${CMAKE_CURRENT_BINARY_DIR}/lbfgspp-config.cmake
        ${CMAKE_CURRENT_BINARY_DIR}/lbfgspp-config-version.cmake
    DESTINATION lib/cmake/lbfgspp
)

add_subdirectory(examples)

--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
## The MIT License

Copyright (c) 1990 Jorge Nocedal

Copyright (c) 2007-2010 Naoaki Okazaki

Copyright (c) 2016-2023 Yixuan Qiu

Copyright (c) 2018-2023 Dirk Toewe

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# LBFGS++

> **UPDATE on 2020-03-06**: **LBFGS++** now includes a new L-BFGS-B solver for
> box-constrained optimization problems. Check the example below for its usage.

**LBFGS++** is a header-only C++ library that implements the Limited-memory
BFGS algorithm (L-BFGS) for unconstrained minimization problems, and a modified
version of the L-BFGS-B algorithm for box-constrained ones.

The code for the L-BFGS solver is derived and modified from the
[libLBFGS](https://github.com/chokkan/liblbfgs)
library developed by [Naoaki Okazaki](http://www.chokkan.org/).

**LBFGS++** is implemented as a header-only C++ library, whose only dependency,
[Eigen](http://eigen.tuxfamily.org/), is also header-only.

## A Quick Example

To use **LBFGS++**, one needs to first define a functor to represent the
multivariate function to be minimized. It should return the objective function
value at a vector `x` and overwrite the vector `grad` with the gradient
evaluated at `x`. For example, we could define the
[Rosenbrock function](https://en.wikipedia.org/wiki/Rosenbrock_function) in the
following way:

```cpp
#include <Eigen/Core>
#include <iostream>
#include <LBFGS.h>

using Eigen::VectorXd;
using namespace LBFGSpp;

class Rosenbrock
{
private:
    int n;
public:
    Rosenbrock(int n_) : n(n_) {}
    double operator()(const VectorXd& x, VectorXd& grad)
    {
        double fx = 0.0;
        for(int i = 0; i < n; i += 2)
        {
            double t1 = 1.0 - x[i];
            double t2 = 10 * (x[i + 1] - x[i] * x[i]);
            grad[i + 1] = 20 * t2;
            grad[i] = -2.0 * (x[i] * grad[i + 1] + t1);
            fx += t1 * t1 + t2 * t2;
        }
        return fx;
    }
};
```

Then we just need to set up the parameters, create a solver object,
provide an initial guess, and run the minimization function.

```cpp
int main()
{
    const int n = 10;
    // Set up parameters
    LBFGSParam<double> param;
    param.epsilon = 1e-6;
    param.max_iterations = 100;

    // Create solver and function object
    LBFGSSolver<double> solver(param);
    Rosenbrock fun(n);

    // Initial guess
    VectorXd x = VectorXd::Zero(n);
    // x will be overwritten to be the best point found
    double fx;
    int niter = solver.minimize(fun, x, fx);

    std::cout << niter << " iterations" << std::endl;
    std::cout << "x = \n" << x.transpose() << std::endl;
    std::cout << "f(x) = " << fx << std::endl;

    return 0;
}
```

The example can then be compiled and run.

```bash
$ g++ -I/path/to/eigen -I/path/to/lbfgspp/include -O2 example.cpp
$ ./a.out
23 iterations
x =
1 1 1 1 1 1 1 1 1 1
f(x) = 1.87948e-19
```

You can also use a different line search algorithm by providing a second template parameter
to `LBFGSSolver`. For example, the code below illustrates the bracketing line search algorithm
(contributed by [@DirkToewe](https://github.com/DirkToewe)).

```cpp
int main()
{
    const int n = 10;
    // Set up parameters
    LBFGSParam<double> param;
    param.epsilon = 1e-6;
    param.max_iterations = 100;

    // Create solver and function object
    // The second template parameter selects the line search algorithm
    LBFGSSolver<double, LineSearchBracketing> solver(param);
    Rosenbrock fun(n);

    // Initial guess
    VectorXd x = VectorXd::Zero(n);
    // x will be overwritten to be the best point found
    double fx;
    int niter = solver.minimize(fun, x, fx);

    std::cout << niter << " iterations" << std::endl;
    std::cout << "x = \n" << x.transpose() << std::endl;
    std::cout << "f(x) = " << fx << std::endl;

    return 0;
}
```

## Box-constrained Problem

If the parameters to be optimized have simple bounds, then the
L-BFGS-**B** solver class `LBFGSBSolver` can be used.
The code is very similar to that of `LBFGSSolver`. Below is the same Rosenbrock
example, but we require all variables to be between 2 and 4.

```cpp
#include <Eigen/Core>
#include <iostream>
#include <LBFGSB.h>  // Note the different header file

using Eigen::VectorXd;
using namespace LBFGSpp;

class Rosenbrock
{
private:
    int n;
public:
    Rosenbrock(int n_) : n(n_) {}
    double operator()(const VectorXd& x, VectorXd& grad)
    {
        double fx = 0.0;
        for(int i = 0; i < n; i += 2)
        {
            double t1 = 1.0 - x[i];
            double t2 = 10 * (x[i + 1] - x[i] * x[i]);
            grad[i + 1] = 20 * t2;
            grad[i] = -2.0 * (x[i] * grad[i + 1] + t1);
            fx += t1 * t1 + t2 * t2;
        }
        return fx;
    }
};

int main()
{
    const int n = 10;
    // Set up parameters
    LBFGSBParam<double> param;  // New parameter class
    param.epsilon = 1e-6;
    param.max_iterations = 100;

    // Create solver and function object
    LBFGSBSolver<double> solver(param);  // New solver class
    Rosenbrock fun(n);

    // Bounds
    VectorXd lb = VectorXd::Constant(n, 2.0);
    VectorXd ub = VectorXd::Constant(n, 4.0);

    // Initial guess
    VectorXd x = VectorXd::Constant(n, 3.0);

    // x will be overwritten to be the best point found
    double fx;
    int niter = solver.minimize(fun, x, fx, lb, ub);

    std::cout << niter << " iterations" << std::endl;
    std::cout << "x = \n" << x.transpose() << std::endl;
    std::cout << "f(x) = " << fx << std::endl;

    return 0;
}
```

Note that we also allow infinite values for the lower and upper bounds.
In such cases one can define `ub[i] = std::numeric_limits<double>::infinity()`,
for example. (A short sketch of one-sided bounds is given at the end of this
README.)

## Documentation

The [API reference](https://lbfgspp.statr.me/doc/) page contains the documentation
of **LBFGS++** generated by [Doxygen](https://www.doxygen.nl/).

## License

**LBFGS++** is an open source project under the MIT license.
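
## Appendix: One-sided Bounds

As noted in the box-constrained example above, a bound may be infinite on one
side. The snippet below is a minimal sketch, not an additional API: it reuses
the `Rosenbrock` functor, the `solver` object, and the dimension `n` from that
example, and keeps every variable above 2 while leaving it unbounded from above.

```cpp
// Lower bounds at 2, upper bounds at +infinity
// (std::numeric_limits comes from <limits>, if not already pulled in)
VectorXd lb = VectorXd::Constant(n, 2.0);
VectorXd ub = VectorXd::Constant(n, std::numeric_limits<double>::infinity());

VectorXd x = VectorXd::Constant(n, 3.0);  // feasible initial guess
double fx;
int niter = solver.minimize(fun, x, fx, lb, ub);
```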

--------------------------------------------------------------------------------
/config.cmake.in:
--------------------------------------------------------------------------------
# Dependencies
include(CMakeFindDependencyMacro)

find_dependency(Eigen3 3.0 REQUIRED)

include(${CMAKE_CURRENT_LIST_DIR}/lbfgspp-targets.cmake)

--------------------------------------------------------------------------------
/examples/CMakeLists.txt:
--------------------------------------------------------------------------------
foreach (source example-quadratic.cpp example-rosenbrock-box.cpp example-rosenbrock-bracketing.cpp example-rosenbrock-comparison.cpp example-rosenbrock.cpp)
    get_filename_component(example ${source} NAME_WLE)
    add_executable(${example} ${source})
    set_property(TARGET ${example} PROPERTY CXX_STANDARD 17)
    target_link_libraries(${example} PRIVATE lbfgspp Eigen3::Eigen)
endforeach ()

--------------------------------------------------------------------------------
/examples/example-quadratic.cpp:
--------------------------------------------------------------------------------
#include <Eigen/Core>
#include <iostream>
#include <LBFGS.h>

using Eigen::VectorXd;
using Eigen::MatrixXd;
using namespace LBFGSpp;

double foo(const VectorXd& x, VectorXd& grad)
{
    const int n = x.size();
    VectorXd d(n);
    for(int i = 0; i < n; i++)
        d[i] = i;

    double f = (x - d).squaredNorm();
    grad.noalias() = 2.0 * (x - d);
    return f;
}

int main()
{
    const int n = 10;
    LBFGSParam<double> param;
    LBFGSSolver<double> solver(param);

    VectorXd x = VectorXd::Zero(n);
    double fx;
    int niter = solver.minimize(foo, x, fx);

    std::cout << niter << " iterations" << std::endl;
    std::cout << "x = \n" << x.transpose() << std::endl;
    std::cout << "f(x) = " << fx << std::endl;

    return 0;
}

--------------------------------------------------------------------------------
/examples/example-rosenbrock-box.cpp:
--------------------------------------------------------------------------------
#include <Eigen/Core>
#include <iostream>
#include <LBFGSB.h>

using namespace LBFGSpp;

typedef double Scalar;
typedef Eigen::Matrix<Scalar, Eigen::Dynamic, 1> Vector;

// Example from the roptim R package
// f(x) = (x[0] - 1)^2 + 4 * (x[1] - x[0]^2)^2 + ...
//        + 4 * (x[end] - x[end - 1]^2)^2
class Rosenbrock
{
private:
    int n;
public:
    Rosenbrock(int n_) : n(n_) {}
    Scalar operator()(const Vector& x, Vector& grad)
    {
        Scalar fx = (x[0] - 1.0) * (x[0] - 1.0);
        grad[0] = 2 * (x[0] - 1) + 16 * (x[0] * x[0] - x[1]) * x[0];
        for(int i = 1; i < n; i++)
        {
            fx += 4 * std::pow(x[i] - x[i - 1] * x[i - 1], 2);
            if(i == n - 1)
            {
                grad[i] = 8 * (x[i] - x[i - 1] * x[i - 1]);
            } else {
                grad[i] = 8 * (x[i] - x[i - 1] * x[i - 1]) + 16 * (x[i] * x[i] - x[i + 1]) * x[i];
            }
        }
        return fx;
    }
};

int main()
{
    const int n = 25;
    LBFGSBParam<Scalar> param;
    LBFGSBSolver<Scalar> solver(param);
    Rosenbrock fun(n);

    // Variable bounds
    Vector lb = Vector::Constant(n, 2.0);
    Vector ub = Vector::Constant(n, 4.0);
    // The third variable is unbounded
    lb[2] = -std::numeric_limits<Scalar>::infinity();
    ub[2] = std::numeric_limits<Scalar>::infinity();
    // Initial values
    Vector x = Vector::Constant(n, 3.0);
    // Make some initial values at the bounds
    x[0] = x[1] = 2.0;
    x[5] = x[7] = 4.0;

    Scalar fx;
    int niter = solver.minimize(fun, x, fx, lb, ub);

    std::cout << niter << " iterations" << std::endl;
    std::cout << "x = \n" << x.transpose() << std::endl;
    std::cout << "f(x) = " << fx << std::endl;
    std::cout << "grad = " << solver.final_grad().transpose() << std::endl;
    std::cout << "projected grad norm = " << solver.final_grad_norm() << std::endl;

    return 0;
}

--------------------------------------------------------------------------------
/examples/example-rosenbrock-bracketing.cpp:
--------------------------------------------------------------------------------
#include <Eigen/Core>
#include <iostream>
#include <LBFGS.h>

using Eigen::VectorXd;
using Eigen::MatrixXd;
using namespace LBFGSpp;

class Rosenbrock
{
private:
    int n;
public:
    Rosenbrock(int n_) : n(n_) {}
    double operator()(const VectorXd& x, VectorXd& grad)
    {
        double fx = 0.0;
        for(int i = 0; i < n; i += 2)
        {
            double t1 = 1.0 - x[i];
            double t2 = 10 * (x[i + 1] - x[i] * x[i]);
            grad[i + 1] = 20 * t2;
            grad[i] = -2.0 * (x[i] * grad[i + 1] + t1);
            fx += t1 * t1 + t2 * t2;
        }
        assert( ! std::isnan(fx) );
        return fx;
    }
};

int main()
{
    LBFGSParam<double> param;
    LBFGSSolver<double, LineSearchBracketing> solver(param);

    for( int n=2; n <= 16; n += 2 )
    {
        std::cout << "n = " << n << std::endl;
        Rosenbrock fun(n);
        for( int test=0; test < 1024; test++ )
        {
            VectorXd x = VectorXd::Random(n);
            double fx;
            int niter = solver.minimize(fun, x, fx);

            assert( ( (x.array() - 1.0).abs() < 1e-4 ).all() );
        }
        std::cout << "Test passed!"
                  << std::endl << std::endl;
    }

    return 0;
}

--------------------------------------------------------------------------------
/examples/example-rosenbrock-comparison.cpp:
--------------------------------------------------------------------------------
#include <Eigen/Core>
#include <iostream>
#include <LBFGS.h>

using Eigen::VectorXd;
using Eigen::MatrixXd;
using namespace LBFGSpp;

class Rosenbrock
{
private:
    int n;
    ptrdiff_t ncalls;

public:
    Rosenbrock(int n_) : n(n_), ncalls(0) {}
    double operator()(const VectorXd& x, VectorXd& grad)
    {
        // std::cout << x << std::endl;
        ncalls += 1;

        double fx = 0.0;
        for(int i = 0; i < n; i += 2)
        {
            double t1 = 1.0 - x[i];
            double t2 = 10 * (x[i + 1] - x[i] * x[i]);
            grad[i + 1] = 20 * t2;
            grad[i] = -2.0 * (x[i] * grad[i + 1] + t1);
            fx += t1 * t1 + t2 * t2;
        }
        assert( ! std::isnan(fx) );
        return fx;
    }

    ptrdiff_t get_ncalls() const {
        return ncalls;
    }
};

int main()
{
    LBFGSParam<double> param;
    param.linesearch = LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE;
    param.max_linesearch = 256;

    LBFGSSolver<double, LineSearchBacktracking>  solver_backtrack(param);
    LBFGSSolver<double, LineSearchBracketing>    solver_bracket  (param);
    LBFGSSolver<double, LineSearchNocedalWright> solver_nocedal  (param);
    LBFGSSolver<double, LineSearchMoreThuente>   solver_more     (param);

    const int tests_per_n = 1024;

    for( int n=2; n <= 24; n += 2 )
    {
        std::cout << "n = " << n << std::endl;
        Rosenbrock fun_backtrack(n),
                   fun_bracket  (n),
                   fun_nocedal  (n),
                   fun_more     (n);
        int niter_backtrack = 0,
            niter_bracket   = 0,
            niter_nocedal   = 0,
            niter_more      = 0;
        for( int test=0; test < tests_per_n; test++ )
        {
            VectorXd x, x0 = VectorXd::Random(n);

            double fx;

            x = x0; niter_backtrack += solver_backtrack.minimize(fun_backtrack, x, fx); assert( ( (x.array() - 1.0).abs() < 1e-4 ).all() );
            x = x0; niter_bracket   += solver_bracket  .minimize(fun_bracket  , x, fx); assert( ( (x.array() - 1.0).abs() < 1e-4 ).all() );
            x = x0; niter_nocedal   += solver_nocedal  .minimize(fun_nocedal  , x, fx); assert( ( (x.array() - 1.0).abs() < 1e-4 ).all() );
            x = x0; niter_more      += solver_more     .minimize(fun_more     , x, fx); assert( ( (x.array() - 1.0).abs() < 1e-4 ).all() );
        }
        std::cout << "  Average #calls:" << std::endl;
        std::cout << "    LineSearchBacktracking : " << (fun_backtrack.get_ncalls() / tests_per_n) << " calls, " << (niter_backtrack / tests_per_n) << " iterations" << std::endl;
        std::cout << "    LineSearchBracketing   : " << (fun_bracket  .get_ncalls() / tests_per_n) << " calls, " << (niter_bracket   / tests_per_n) << " iterations" << std::endl;
        std::cout << "    LineSearchNocedalWright: " << (fun_nocedal  .get_ncalls() / tests_per_n) << " calls, " << (niter_nocedal   / tests_per_n) << " iterations" << std::endl;
        std::cout << "    LineSearchMoreThuente  : " << (fun_more     .get_ncalls() / tests_per_n) << " calls, " << (niter_more      / tests_per_n) << " iterations" << std::endl;
    }

    return 0;
}

--------------------------------------------------------------------------------
/examples/example-rosenbrock.cpp:
--------------------------------------------------------------------------------
#include <Eigen/Core>
#include <iostream>
#include <LBFGS.h>

using Eigen::VectorXf;
using Eigen::MatrixXf;
using namespace LBFGSpp;

class Rosenbrock
{
private:
    int n;
public:
    Rosenbrock(int n_) : n(n_) {}
    float operator()(const VectorXf& x, VectorXf& grad)
    {
        float fx = 0.0;
        for(int i = 0; i < n; i += 2)
        {
            float t1 = 1.0 - x[i];
            float t2 = 10 * (x[i + 1] - x[i] * x[i]);
            grad[i + 1] = 20 * t2;
            grad[i] = -2.0 * (x[i] * grad[i + 1] + t1);
            fx += t1 * t1 + t2 * t2;
        }
        return fx;
    }
};

int main()
{
    const int n = 10;
    LBFGSParam<float> param;
    LBFGSSolver<float> solver(param);
    Rosenbrock fun(n);

    VectorXf x = VectorXf::Zero(n);
    float fx;
    int niter = solver.minimize(fun, x, fx);

    std::cout << niter << " iterations" << std::endl;
    std::cout << "x = \n" << x.transpose() << std::endl;
    std::cout << "f(x) = " << fx << std::endl;
    std::cout << "grad = " << solver.final_grad().transpose() << std::endl;
    std::cout << "||grad|| = " << solver.final_grad_norm() << std::endl;
    std::cout << "approx_hess = \n" << solver.final_approx_hessian() << std::endl;
    std::cout << "approx_inv_hess = \n" << solver.final_approx_inverse_hessian() << std::endl;

    return 0;
}

--------------------------------------------------------------------------------
/include/LBFGS.h:
--------------------------------------------------------------------------------
// Copyright (C) 2016-2025 Yixuan Qiu
// Under MIT license

#ifndef LBFGSPP_LBFGS_H
#define LBFGSPP_LBFGS_H

#include <Eigen/Core>
#include "LBFGSpp/Param.h"
#include "LBFGSpp/BFGSMat.h"
#include "LBFGSpp/LineSearchBacktracking.h"
#include "LBFGSpp/LineSearchBracketing.h"
#include "LBFGSpp/LineSearchNocedalWright.h"
#include "LBFGSpp/LineSearchMoreThuente.h"

namespace LBFGSpp {

///
/// L-BFGS solver for unconstrained numerical optimization
///
template <typename Scalar,
          template <class> class LineSearch = LineSearchNocedalWright>
class LBFGSSolver
{
private:
    using Vector = Eigen::Matrix<Scalar, Eigen::Dynamic, 1>;
    using Matrix = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>;
    using MapVec = Eigen::Map<Vector>;

    const LBFGSParam<Scalar>& m_param;  // Parameters to control the LBFGS algorithm
    BFGSMat<Scalar> m_bfgs;             // Approximation to the Hessian matrix
    Vector m_fx;                        // History of the objective function values
    Vector m_xp;                        // Old x
    Vector m_grad;                      // New gradient
    Scalar m_gnorm;                     // Norm of the gradient
    Vector m_gradp;                     // Old gradient
    Vector m_drt;                       // Moving direction

    // Reset internal variables
    // n: dimension of the vector to be optimized
    inline void reset(int n)
    {
        const int m = m_param.m;
        m_bfgs.reset(n, m);
        m_xp.resize(n);
        m_grad.resize(n);
        m_gradp.resize(n);
        m_drt.resize(n);
        if (m_param.past > 0)
            m_fx.resize(m_param.past);
    }

public:
    ///
    /// Constructor for the L-BFGS solver.
    ///
    /// \param param An object of \ref LBFGSParam to store parameters for the
    ///        algorithm
    ///
    LBFGSSolver(const LBFGSParam<Scalar>& param) :
        m_param(param)
    {
        m_param.check_param();
    }

    ///
    /// Minimizing a multivariate function using the L-BFGS algorithm.
    /// Exceptions will be thrown if an error occurs.
    ///
    /// \param f  A function object such that `f(x, grad)` returns the
    ///           objective function value at `x`, and overwrites `grad` with
    ///           the gradient.
    /// \param x  In: An initial guess of the optimal point. Out: The best point
    ///           found.
    /// \param fx Out: The objective function value at `x`.
    ///
    /// \return Number of iterations used.
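    ///
    /// A typical call, shown as a sketch (it assumes a functor `fun` and a
    /// solver configured as in the README; the names are illustrative and
    /// not part of the API):
    /// \code
    /// LBFGSParam<double> param;
    /// LBFGSSolver<double> solver(param);
    /// Eigen::VectorXd x = Eigen::VectorXd::Zero(n);  // n = problem dimension
    /// double fx;                                     // receives f(x)
    /// int niter = solver.minimize(fun, x, fx);
    /// \endcode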
    ///
    template <typename Foo>
    inline int minimize(Foo& f, Vector& x, Scalar& fx)
    {
        using std::abs;

        // Dimension of the vector
        const int n = x.size();
        reset(n);

        // The length of lag for objective function value to test convergence
        const int fpast = m_param.past;

        // Evaluate function and compute gradient
        fx = f(x, m_grad);
        m_gnorm = m_grad.norm();
        if (fpast > 0)
            m_fx[0] = fx;

        // std::cout << "x0 = " << x.transpose() << std::endl;
        // std::cout << "f(x0) = " << fx << ", ||grad|| = " << m_gnorm << std::endl << std::endl;

        // Early exit if the initial x is already a minimizer
        if (m_gnorm <= m_param.epsilon || m_gnorm <= m_param.epsilon_rel * x.norm())
        {
            return 1;
        }

        // Initial direction
        m_drt.noalias() = -m_grad;
        // Initial step size
        Scalar step = Scalar(1) / m_drt.norm();
        // Tolerance for s'y >= eps * (y'y)
        constexpr Scalar eps = std::numeric_limits<Scalar>::epsilon();
        // s and y vectors
        Vector vecs(n), vecy(n);

        // Number of iterations used
        int k = 1;
        for (;;)
        {
            // std::cout << "Iter " << k << " begins" << std::endl << std::endl;

            // Save the current x and gradient
            m_xp.noalias() = x;
            m_gradp.noalias() = m_grad;
            Scalar dg = m_grad.dot(m_drt);
            const Scalar step_max = m_param.max_step;

            // Line search to update x, fx and gradient
            LineSearch<Scalar>::LineSearch(f, m_param, m_xp, m_drt, step_max, step, fx, m_grad, dg, x);

            // New gradient norm
            m_gnorm = m_grad.norm();

            // std::cout << "Iter " << k << " finished line search" << std::endl;
            // std::cout << "   x = " << x.transpose() << std::endl;
            // std::cout << "   f(x) = " << fx << ", ||grad|| = " << m_gnorm << std::endl << std::endl;

            // Convergence test -- gradient
            if (m_gnorm <= m_param.epsilon || m_gnorm <= m_param.epsilon_rel * x.norm())
            {
                return k;
            }
            // Convergence test -- objective function value
            if (fpast > 0)
            {
                const Scalar fxd = m_fx[k % fpast];
                if (k >= fpast && abs(fxd - fx) <= m_param.delta * std::max(std::max(abs(fx), abs(fxd)), Scalar(1)))
                    return k;

                m_fx[k % fpast] = fx;
            }
            // Maximum number of iterations
            if (m_param.max_iterations != 0 && k >= m_param.max_iterations)
            {
                return k;
            }

            // Update s and y
            // s_{k+1} = x_{k+1} - x_k
            // y_{k+1} = g_{k+1} - g_k
            vecs.noalias() = x - m_xp;
            vecy.noalias() = m_grad - m_gradp;
            if (vecs.dot(vecy) > eps * vecy.squaredNorm())
                m_bfgs.add_correction(vecs, vecy);

            // Recursive formula to compute d = -H * g
            m_bfgs.apply_Hv(m_grad, -Scalar(1), m_drt);

            // Reset step = 1.0 as initial guess for the next line search
            step = Scalar(1);
            k++;
        }

        return k;
    }

    ///
    /// Returning the gradient vector on the last iterate.
    /// Typically used to debug and test convergence.
    /// Should only be called after the `minimize()` function.
    ///
    /// \return A const reference to the gradient vector.
    ///
    const Vector& final_grad() const { return m_grad; }

    ///
    /// Returning the Euclidean norm of the final gradient.
    ///
    Scalar final_grad_norm() const { return m_gnorm; }

    ///
    /// Returning the approximate Hessian matrix on the last iterate.
    ///
    Matrix final_approx_hessian() const { return m_bfgs.get_Bmat(); }

    ///
    /// Returning the approximate inverse Hessian matrix on the last iterate.
    ///
    Matrix final_approx_inverse_hessian() const { return m_bfgs.get_Hmat(); }
};

}  // namespace LBFGSpp

#endif  // LBFGSPP_LBFGS_H

--------------------------------------------------------------------------------
/include/LBFGSB.h:
--------------------------------------------------------------------------------
// Copyright (C) 2020-2025 Yixuan Qiu
// Under MIT license

#ifndef LBFGSPP_LBFGSB_H
#define LBFGSPP_LBFGSB_H

#include <stdexcept>  // std::invalid_argument
#include <vector>
#include <Eigen/Core>
#include "LBFGSpp/Param.h"
#include "LBFGSpp/BFGSMat.h"
#include "LBFGSpp/Cauchy.h"
#include "LBFGSpp/SubspaceMin.h"
#include "LBFGSpp/LineSearchMoreThuente.h"

namespace LBFGSpp {

///
/// L-BFGS-B solver for box-constrained numerical optimization
///
template <typename Scalar,
          template <class> class LineSearch = LineSearchMoreThuente>
class LBFGSBSolver
{
private:
    using Vector = Eigen::Matrix<Scalar, Eigen::Dynamic, 1>;
    using Matrix = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>;
    using MapVec = Eigen::Map<Vector>;
    using IndexSet = std::vector<int>;

    const LBFGSBParam<Scalar>& m_param;  // Parameters to control the LBFGS algorithm
    BFGSMat<Scalar, true> m_bfgs;        // Approximation to the Hessian matrix
    Vector m_fx;                         // History of the objective function values
    Vector m_xp;                         // Old x
    Vector m_grad;                       // New gradient
    Scalar m_projgnorm;                  // Projected gradient norm
    Vector m_gradp;                      // Old gradient
    Vector m_drt;                        // Moving direction

    // Reset internal variables
    // n: dimension of the vector to be optimized
    inline void reset(int n)
    {
        const int m = m_param.m;
        m_bfgs.reset(n, m);
        m_xp.resize(n);
        m_grad.resize(n);
        m_gradp.resize(n);
        m_drt.resize(n);
        if (m_param.past > 0)
            m_fx.resize(m_param.past);
    }

    // Project the vector x to the bound constraint set
    static void force_bounds(Vector& x, const Vector& lb, const Vector& ub)
    {
        x.noalias() = x.cwiseMax(lb).cwiseMin(ub);
    }

    // Norm of the projected gradient
    // ||P(x-g, l, u) - x||_inf
    static Scalar proj_grad_norm(const Vector& x, const Vector& g, const Vector& lb, const Vector& ub)
    {
        return ((x - g).cwiseMax(lb).cwiseMin(ub) - x).cwiseAbs().maxCoeff();
    }

    // The maximum step size alpha such that x0 + alpha * d stays within the bounds
    static Scalar max_step_size(const Vector& x0, const Vector& drt, const Vector& lb, const Vector& ub)
    {
        const int n = x0.size();
        Scalar step = std::numeric_limits<Scalar>::infinity();

        for (int i = 0; i < n; i++)
        {
            if (drt[i] > Scalar(0))
            {
                step = std::min(step, (ub[i] - x0[i]) / drt[i]);
            }
            else if (drt[i] < Scalar(0))
            {
                step = std::min(step, (lb[i] - x0[i]) / drt[i]);
            }
        }

        return step;
    }

public:
    ///
    /// Constructor for the L-BFGS-B solver.
    ///
    /// \param param An object of \ref LBFGSBParam to store parameters for the
    ///        algorithm
    ///
    LBFGSBSolver(const LBFGSBParam<Scalar>& param) :
        m_param(param)
    {
        m_param.check_param();
    }

    ///
    /// Minimizing a multivariate function subject to box constraints, using the L-BFGS-B algorithm.
    /// Exceptions will be thrown if an error occurs.
    ///
    /// \param f  A function object such that `f(x, grad)` returns the
    ///           objective function value at `x`, and overwrites `grad` with
    ///           the gradient.
    /// \param x  In: An initial guess of the optimal point. Out: The best point
    ///           found.
    /// \param fx Out: The objective function value at `x`.
    /// \param lb Lower bounds for `x`.
    /// \param ub Upper bounds for `x`.
    ///
    /// \return Number of iterations used.
    ///
    template <typename Foo>
    inline int minimize(Foo& f, Vector& x, Scalar& fx, const Vector& lb, const Vector& ub)
    {
        using std::abs;

        // Dimension of the vector
        const int n = x.size();
        if (lb.size() != n || ub.size() != n)
            throw std::invalid_argument("'lb' and 'ub' must have the same size as 'x'");

        // Check whether the initial vector is within the bounds
        // If not, project to the feasible set
        force_bounds(x, lb, ub);

        // Initialization
        reset(n);

        // The length of lag for objective function value to test convergence
        const int fpast = m_param.past;

        // Evaluate function and compute gradient
        fx = f(x, m_grad);
        m_projgnorm = proj_grad_norm(x, m_grad, lb, ub);
        if (fpast > 0)
            m_fx[0] = fx;

        // std::cout << "x0 = " << x.transpose() << std::endl;
        // std::cout << "f(x0) = " << fx << ", ||proj_grad|| = " << m_projgnorm << std::endl << std::endl;

        // Early exit if the initial x is already a minimizer
        if (m_projgnorm <= m_param.epsilon || m_projgnorm <= m_param.epsilon_rel * x.norm())
        {
            return 1;
        }

        // Compute generalized Cauchy point
        Vector xcp(n), vecc;
        IndexSet newact_set, fv_set;
        Cauchy<Scalar>::get_cauchy_point(m_bfgs, x, m_grad, lb, ub, xcp, vecc, newact_set, fv_set);

        /* Vector gcp(n);
        Scalar fcp = f(xcp, gcp);
        Scalar projgcpnorm = proj_grad_norm(xcp, gcp, lb, ub);
        std::cout << "xcp = " << xcp.transpose() << std::endl;
        std::cout << "f(xcp) = " << fcp << ", ||proj_grad|| = " << projgcpnorm << std::endl << std::endl; */

        // Initial direction
        m_drt.noalias() = xcp - x;
        m_drt.normalize();
        // Tolerance for s'y >= eps * (y'y)
        constexpr Scalar eps = std::numeric_limits<Scalar>::epsilon();
        // s and y vectors
        Vector vecs(n), vecy(n);
        // Number of iterations used
        int k = 1;
        for (;;)
        {
            // Save the current x and gradient
            m_xp.noalias() = x;
            m_gradp.noalias() = m_grad;
            Scalar dg = m_grad.dot(m_drt);

            // Maximum step size to make x feasible
            Scalar step_max = max_step_size(x, m_drt, lb, ub);

            // In some cases, the direction returned by the subspace minimization procedure
            // in the previous iteration is pathological, leading to issues such as
            // step_max~=0 and dg>=0. If this happens, we use xcp-x as the search direction,
            // and reset the BFGS matrix. This is because xsm (the subspace minimizer)
            // heavily depends on the BFGS matrix.
            // If xsm is corrupted, then we may suspect there is something wrong
            // in the BFGS matrix, and it is safer to reset the matrix.
            // In contrast, xcp is obtained from a line search, which tends to be more robust
            if (dg >= Scalar(0) || step_max <= m_param.min_step)
            {
                // Reset search direction
                m_drt.noalias() = xcp - x;
                // Reset BFGS matrix
                m_bfgs.reset(n, m_param.m);
                // Recompute dg and step_max
                dg = m_grad.dot(m_drt);
                step_max = max_step_size(x, m_drt, lb, ub);
            }

            // Line search to update x, fx and gradient
            step_max = std::min(m_param.max_step, step_max);
            Scalar step = Scalar(1);
            step = std::min(step, step_max);
            LineSearch<Scalar>::LineSearch(f, m_param, m_xp, m_drt, step_max, step, fx, m_grad, dg, x);

            // New projected gradient norm
            m_projgnorm = proj_grad_norm(x, m_grad, lb, ub);

            /* std::cout << "** Iteration " << k << std::endl;
            std::cout << "   x = " << x.transpose() << std::endl;
            std::cout << "   f(x) = " << fx << ", ||proj_grad|| = " << m_projgnorm << std::endl << std::endl; */

            // Convergence test -- gradient
            if (m_projgnorm <= m_param.epsilon || m_projgnorm <= m_param.epsilon_rel * x.norm())
            {
                return k;
            }
            // Convergence test -- objective function value
            if (fpast > 0)
            {
                const Scalar fxd = m_fx[k % fpast];
                if (k >= fpast && abs(fxd - fx) <= m_param.delta * std::max(std::max(abs(fx), abs(fxd)), Scalar(1)))
                    return k;

                m_fx[k % fpast] = fx;
            }
            // Maximum number of iterations
            if (m_param.max_iterations != 0 && k >= m_param.max_iterations)
            {
                return k;
            }

            // Update s and y
            // s_{k+1} = x_{k+1} - x_k
            // y_{k+1} = g_{k+1} - g_k
            vecs.noalias() = x - m_xp;
            vecy.noalias() = m_grad - m_gradp;
            if (vecs.dot(vecy) > eps * vecy.squaredNorm())
                m_bfgs.add_correction(vecs, vecy);

            force_bounds(x, lb, ub);
            Cauchy<Scalar>::get_cauchy_point(m_bfgs, x, m_grad, lb, ub, xcp, vecc, newact_set, fv_set);

            /*Vector gcp(n);
            Scalar fcp = f(xcp, gcp);
            Scalar projgcpnorm = proj_grad_norm(xcp, gcp, lb, ub);
            std::cout << "xcp = " << xcp.transpose() << std::endl;
            std::cout << "f(xcp) = " << fcp << ", ||proj_grad|| = " << projgcpnorm << std::endl << std::endl;*/

            SubspaceMin<Scalar>::subspace_minimize(m_bfgs, x, xcp, m_grad, lb, ub,
                                                   vecc, newact_set, fv_set, m_param.max_submin, m_drt);

            /*Vector gsm(n);
            Scalar fsm = f(x + m_drt, gsm);
            Scalar projgsmnorm = proj_grad_norm(x + m_drt, gsm, lb, ub);
            std::cout << "xsm = " << (x + m_drt).transpose() << std::endl;
            std::cout << "f(xsm) = " << fsm << ", ||proj_grad|| = " << projgsmnorm << std::endl << std::endl;*/

            k++;
        }

        return k;
    }

    ///
    /// Returning the gradient vector on the last iterate.
    /// Typically used to debug and test convergence.
    /// Should only be called after the `minimize()` function.
    ///
    /// \return A const reference to the gradient vector.
    ///
    const Vector& final_grad() const { return m_grad; }

    ///
    /// Returning the infinity norm of the final projected gradient.
    /// The projected gradient is defined as \f$P(x-g,l,u)-x\f$, where \f$P(v,l,u)\f$ stands for
    /// the projection of a vector \f$v\f$ onto the box specified by the lower bound vector \f$l\f$ and
    /// upper bound vector \f$u\f$.
    ///
    Scalar final_grad_norm() const { return m_projgnorm; }
};

}  // namespace LBFGSpp

#endif  // LBFGSPP_LBFGSB_H

--------------------------------------------------------------------------------
/include/LBFGSpp/BFGSMat.h:
--------------------------------------------------------------------------------
// Copyright (C) 2020-2025 Yixuan Qiu
// Under MIT license

#ifndef LBFGSPP_BFGS_MAT_H
#define LBFGSPP_BFGS_MAT_H

#include <vector>
#include <Eigen/Core>
#include <Eigen/LU>
#include "BKLDLT.h"

/// \cond

namespace LBFGSpp {

//
// An *implicit* representation of the BFGS approximation to the Hessian matrix
//
// B = theta * I - W * M * W' -- approximation to Hessian matrix, see [2]
// H = inv(B)                 -- approximation to inverse Hessian matrix, see [2]
//
// Reference:
// [1] D. C. Liu and J. Nocedal (1989). On the limited memory BFGS method for large scale optimization.
// [2] R. H. Byrd, P. Lu, and J. Nocedal (1995). A limited memory algorithm for bound constrained optimization.
//
template <typename Scalar, bool LBFGSB = false>
class BFGSMat
{
private:
    using Vector = Eigen::Matrix<Scalar, Eigen::Dynamic, 1>;
    using Matrix = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>;
    using RefConstVec = Eigen::Ref<const Vector>;
    using IndexSet = std::vector<int>;

    int m_m;         // Maximum number of correction vectors
    Scalar m_theta;  // theta * I is the initial approximation to the Hessian matrix
    Matrix m_s;      // History of the s vectors
    Matrix m_y;      // History of the y vectors
    Vector m_ys;     // History of the s'y values
    Vector m_alpha;  // Temporary values used in computing H * v
    int m_ncorr;     // Number of correction vectors in the history, m_ncorr <= m
    int m_ptr;       // A pointer to locate the most recent history, 1 <= m_ptr <= m
                     // Details: s and y vectors are stored in cyclic order.
                     // For example, if the current s-vector is stored in m_s[, m-1],
                     // then in the next iteration m_s[, 0] will be overwritten.
                     // m_s[, m_ptr-1] points to the most recent history (if ncorr > 0),
                     // and m_s[, m_ptr % m] points to the location that will be
                     // overwritten next time.
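                     //
                     // A concrete trace, added for illustration only (a
                     // hypothetical run, not data kept by the class): with
                     // m = 3, after the corrections s0, ..., s4 have been
                     // added, the columns of m_s hold [s3, s4, s2], with
                     // m_ncorr = 3 and m_ptr = 2. Hence m_s.col(m_ptr - 1) = s4
                     // is the most recent vector, and m_s.col(m_ptr % m) = s2
                     // is the one overwritten by the next add_correction().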

    //========== The following members are only used in L-BFGS-B algorithm ==========//
    Matrix m_permMinv;             // Permutated M inverse
    BKLDLT<Scalar> m_permMsolver;  // Represents the permutated M matrix

public:
    // Constructor
    BFGSMat() {}

    // Reset internal variables
    // n: dimension of the vector to be optimized
    // m: maximum number of corrections to approximate the Hessian matrix
    inline void reset(int n, int m)
    {
        m_m = m;
        m_theta = Scalar(1);
        m_s.resize(n, m);
        m_y.resize(n, m);
        m_ys.resize(m);
        m_alpha.resize(m);
        m_ncorr = 0;
        m_ptr = m;  // This makes sure that m_ptr % m == 0 in the first step

        if (LBFGSB)
        {
            m_permMinv.resize(2 * m, 2 * m);
            m_permMinv.setZero();
            m_permMinv.diagonal().setOnes();
        }
    }

    // Add correction vectors to the BFGS matrix
    inline void add_correction(const RefConstVec& s, const RefConstVec& y)
    {
        const int loc = m_ptr % m_m;

        m_s.col(loc).noalias() = s;
        m_y.col(loc).noalias() = y;

        // ys = y's = 1/rho
        const Scalar ys = m_s.col(loc).dot(m_y.col(loc));
        m_ys[loc] = ys;

        m_theta = m_y.col(loc).squaredNorm() / ys;

        if (m_ncorr < m_m)
            m_ncorr++;

        m_ptr = loc + 1;

        if (LBFGSB)
        {
            // Minv = [-D  L'        ]
            //        [ L  theta*S'S ]

            // Copy -D
            // Let S=[s[0], ..., s[m-1]], Y=[y[0], ..., y[m-1]]
            // D = [s[0]'y[0], ..., s[m-1]'y[m-1]]
            m_permMinv(loc, loc) = -ys;

            // Update S'S
            // We only store S'S in Minv, and multiply theta when the LDLT decomposition is performed
            Vector Ss = m_s.leftCols(m_ncorr).transpose() * m_s.col(loc);
            m_permMinv.block(m_m + loc, m_m, 1, m_ncorr).noalias() = Ss.transpose();
            m_permMinv.block(m_m, m_m + loc, m_ncorr, 1).noalias() = Ss;

            // Compute L
            // L = [          0                                     ]
            //     [  s[1]'y[0]            0                        ]
            //     [  s[2]'y[0]    s[2]'y[1]                        ]
            //     ...
            //     [s[m-1]'y[0] ... ... ... ... ... s[m-1]'y[m-2]  0]
            //
            // L_next = [        0                                 ]
            //          [s[2]'y[1]          0                      ]
            //          [s[3]'y[1]  s[3]'y[2]                      ]
            //          ...
            //          [s[m]'y[1] ... ... ... ... s[m]'y[m-1]  0  ]
            const int len = m_ncorr - 1;
            // First zero out the column of oldest y
            if (m_ncorr >= m_m)
                m_permMinv.block(m_m, loc, m_m, 1).setZero();
            // Compute the row associated with new s
            // The current row is loc
            // End with column (loc + m - 1) % m
            // Length is len
            int yloc = (loc + m_m - 1) % m_m;
            for (int i = 0; i < len; i++)
            {
                m_permMinv(m_m + loc, yloc) = m_s.col(loc).dot(m_y.col(yloc));
                yloc = (yloc + m_m - 1) % m_m;
            }

            // Matrix LDLT factorization
            m_permMinv.block(m_m, m_m, m_m, m_m) *= m_theta;
            m_permMsolver.compute(m_permMinv);
            m_permMinv.block(m_m, m_m, m_m, m_m) /= m_theta;
        }
    }

    // Explicitly form the B matrix
    inline Matrix get_Bmat() const
    {
        // Initial approximation theta * I
        const int n = m_s.rows();
        Matrix B = m_theta * Matrix::Identity(n, n);
        if (m_ncorr < 1)
            return B;

        // Construct W matrix, W = [Y, theta * S]
        // Y = [y0, y1, ..., yc]
        // S = [s0, s1, ..., sc]
        // We first set W = [Y, S], since later we still need Y and S matrices
        // After computing Minv, we rescale the S part in W
        Matrix W(n, 2 * m_ncorr);
        // r = m_ptr - 1 points to the most recent element,
        // (r + 1) % m_ncorr points to the oldest element
        int j = m_ptr % m_ncorr;
        for (int i = 0; i < m_ncorr; i++)
        {
            W.col(i).noalias() = m_y.col(j);
            W.col(m_ncorr + i).noalias() = m_s.col(j);
            j = (j + 1) % m_m;
        }
        // Now Y = W[:, :c], S = W[:, c:]

        // Construct Minv matrix, Minv = [-D  L'          ]
        //                               [ L  theta * S'S ]

        // D = diag(y0's0, ..., yc'sc)
        Matrix Minv(2 * m_ncorr, 2 * m_ncorr);
        Minv.topLeftCorner(m_ncorr, m_ncorr).setZero();
        Vector ys = W.leftCols(m_ncorr).cwiseProduct(W.rightCols(m_ncorr)).colwise().sum().transpose();
        Minv.diagonal().head(m_ncorr).noalias() = -ys;
        // L = [          0                                     ]
        //     [  s[1]'y[0]            0                        ]
        //     [  s[2]'y[0]    s[2]'y[1]                        ]
        //     ...
        //     [s[c-1]'y[0] ... ... ... ... ... s[c-1]'y[c-2]  0]
        Minv.bottomLeftCorner(m_ncorr, m_ncorr).setZero();
        for (int i = 0; i < m_ncorr - 1; i++)
        {
            // Number of terms for this column
            const int nterm = m_ncorr - i - 1;
            // S[:, -nterm:]'Y[:, j]
            Minv.col(i).tail(nterm).noalias() = W.rightCols(nterm).transpose() * W.col(i);
        }
        // The symmetric block
        Minv.topRightCorner(m_ncorr, m_ncorr).noalias() = Minv.bottomLeftCorner(m_ncorr, m_ncorr).transpose();
        // theta * S'S
        Minv.bottomRightCorner(m_ncorr, m_ncorr).noalias() = m_theta * W.rightCols(m_ncorr).transpose() * W.rightCols(m_ncorr);

        // Set the true W matrix
        W.rightCols(m_ncorr).array() *= m_theta;

        // Compute B = theta * I - W * M * W'
        Eigen::PartialPivLU<Matrix> M_solver(Minv);
        B.noalias() -= W * M_solver.solve(W.transpose());
        return B;
    }

    // Explicitly form the H matrix
    inline Matrix get_Hmat() const
    {
        // Initial approximation 1/theta * I
        const int n = m_s.rows();
        Matrix H = (Scalar(1) / m_theta) * Matrix::Identity(n, n);
        if (m_ncorr < 1)
            return H;

        // Construct W matrix, W = [1/theta * Y, S]
        // Y = [y0, y1, ..., yc]
        // S = [s0, s1, ..., sc]
        // We first set W = [Y, S], since later we still need Y and S matrices
        // After computing M, we rescale the Y part in W
        Matrix W(n, 2 * m_ncorr);
        // p = m_ptr - 1 points to the most recent element,
        // (p + 1) % m_ncorr points to the oldest element
        int j = m_ptr % m_ncorr;
        for (int i = 0; i < m_ncorr; i++)
        {
            W.col(i).noalias() = m_y.col(j);
            W.col(m_ncorr + i).noalias() = m_s.col(j);
            j = (j + 1) % m_m;
        }
        // Now Y = W[:, :c], S = W[:, c:]

        // Construct M matrix, M = [ 0        -inv(R)                           ]
        //                         [-inv(R)'   inv(R)'(D + 1/theta * Y'Y)inv(R) ]
        // D = diag(y0's0, ..., yc'sc)
        Matrix M(2 * m_ncorr, 2 * m_ncorr);
        // First use M[:c, :c] to store R
        // R = [s[0]'y[0]  s[0]'y[1] ... s[0]'y[c-1]   ]
        //     [        0  s[1]'y[1] ... s[1]'y[c-1]   ]
        //     ...
        //     [        0          0 ... s[c-1]'y[c-1] ]
        for (int i = 0; i < m_ncorr; i++)
        {
            M.col(i).head(i + 1).noalias() = W.middleCols(m_ncorr, i + 1).transpose() * W.col(i);
        }
        // Compute inv(R)
        Matrix Rinv = M.topLeftCorner(m_ncorr, m_ncorr).template triangularView<Eigen::Upper>().solve(Matrix::Identity(m_ncorr, m_ncorr));
        // Zero out the top left block
        M.topLeftCorner(m_ncorr, m_ncorr).setZero();
        // Set the top right block
        M.topRightCorner(m_ncorr, m_ncorr).noalias() = -Rinv;
        // The symmetric block
        M.bottomLeftCorner(m_ncorr, m_ncorr).noalias() = -Rinv.transpose();
        // 1/theta * Y'Y
        Matrix block = (Scalar(1) / m_theta) * W.leftCols(m_ncorr).transpose() * W.leftCols(m_ncorr);
        // D + 1/theta * Y'Y
        Vector ys = W.leftCols(m_ncorr).cwiseProduct(W.rightCols(m_ncorr)).colwise().sum().transpose();
        block.diagonal().array() += ys.array();
        // The bottom right block
        M.bottomRightCorner(m_ncorr, m_ncorr).noalias() = Rinv.transpose() * block * Rinv;

        // Set the true W matrix
        W.leftCols(m_ncorr).array() *= (Scalar(1) / m_theta);

        // Compute H = 1/theta * I + W * M * W'
        H.noalias() += W * M * W.transpose();
        return H;
    }

    // Recursive formula to compute a * H * v, where a is a scalar, and v is [n x 1]
    // H0 = (1/theta) * I is the initial approximation to H
    // Algorithm 7.4 of Nocedal, J., & Wright, S. (2006). Numerical optimization.
    inline void apply_Hv(const Vector& v, const Scalar& a, Vector& res)
    {
        res.resize(v.size());

        // L-BFGS two-loop recursion

        // Loop 1
        res.noalias() = a * v;
        int j = m_ptr % m_m;
        for (int i = 0; i < m_ncorr; i++)
        {
            j = (j + m_m - 1) % m_m;
            m_alpha[j] = m_s.col(j).dot(res) / m_ys[j];
            res.noalias() -= m_alpha[j] * m_y.col(j);
        }

        // Apply initial H0
        res /= m_theta;

        // Loop 2
        for (int i = 0; i < m_ncorr; i++)
        {
            const Scalar beta = m_y.col(j).dot(res) / m_ys[j];
            res.noalias() += (m_alpha[j] - beta) * m_s.col(j);
            j = (j + 1) % m_m;
        }
    }

    //========== The following functions are only used in L-BFGS-B algorithm ==========//

    // Return the value of theta
    inline Scalar theta() const { return m_theta; }

    // Return current number of correction vectors
    inline int num_corrections() const { return m_ncorr; }

    // W = [Y, theta * S]
    // W [n x (2*ncorr)], v [n x 1], res [(2*ncorr) x 1]
    // res preserves the ordering of Y and S columns
    inline void apply_Wtv(const Vector& v, Vector& res) const
    {
        res.resize(2 * m_ncorr);
        res.head(m_ncorr).noalias() = m_y.leftCols(m_ncorr).transpose() * v;
        res.tail(m_ncorr).noalias() = m_theta * m_s.leftCols(m_ncorr).transpose() * v;
    }

    // The b-th row of the W matrix
    // Preserves the ordering of Y and S columns
    // Return as a column vector
    inline Vector Wb(int b) const
    {
        Vector res(2 * m_ncorr);
        for (int j = 0; j < m_ncorr; j++)
        {
            res[j] = m_y(b, j);
            res[m_ncorr + j] = m_s(b, j);
        }
        res.tail(m_ncorr) *= m_theta;
        return res;
    }

    // Extract rows of W
    inline Matrix Wb(const IndexSet& b) const
    {
        const int nb = b.size();
        const int* bptr = b.data();
        Matrix res(nb, 2 * m_ncorr);

        for (int j = 0; j < m_ncorr; j++)
        {
            const Scalar* Yptr = &m_y(0, j);
&m_y(0, j); 347 | const Scalar* Sptr = &m_s(0, j); 348 | Scalar* resYptr = res.data() + j * nb; 349 | Scalar* resSptr = resYptr + m_ncorr * nb; 350 | for (int i = 0; i < nb; i++) 351 | { 352 | const int row = bptr[i]; 353 | resYptr[i] = Yptr[row]; 354 | resSptr[i] = Sptr[row]; 355 | } 356 | } 357 | return res; 358 | } 359 | 360 | // M is [(2*ncorr) x (2*ncorr)], v is [(2*ncorr) x 1] 361 | inline void apply_Mv(const Vector& v, Vector& res) const 362 | { 363 | res.resize(2 * m_ncorr); 364 | if (m_ncorr < 1) 365 | return; 366 | 367 | Vector vpadding = Vector::Zero(2 * m_m); 368 | vpadding.head(m_ncorr).noalias() = v.head(m_ncorr); 369 | vpadding.segment(m_m, m_ncorr).noalias() = v.tail(m_ncorr); 370 | 371 | // Solve linear equation 372 | m_permMsolver.solve_inplace(vpadding); 373 | 374 | res.head(m_ncorr).noalias() = vpadding.head(m_ncorr); 375 | res.tail(m_ncorr).noalias() = vpadding.segment(m_m, m_ncorr); 376 | } 377 | 378 | // Compute W'Pv 379 | // W [n x (2*ncorr)], v [nP x 1], res [(2*ncorr) x 1] 380 | // res preserves the ordering of Y and S columns 381 | // Returns false if the result is known to be zero 382 | inline bool apply_WtPv(const IndexSet& P_set, const Vector& v, Vector& res, bool test_zero = false) const 383 | { 384 | const int* Pptr = P_set.data(); 385 | const Scalar* vptr = v.data(); 386 | int nP = P_set.size(); 387 | 388 | // Remove zeros in v to save computation 389 | IndexSet P_reduced; 390 | std::vector v_reduced; 391 | if (test_zero) 392 | { 393 | P_reduced.reserve(nP); 394 | for (int i = 0; i < nP; i++) 395 | { 396 | if (vptr[i] != Scalar(0)) 397 | { 398 | P_reduced.push_back(Pptr[i]); 399 | v_reduced.push_back(vptr[i]); 400 | } 401 | } 402 | Pptr = P_reduced.data(); 403 | vptr = v_reduced.data(); 404 | nP = P_reduced.size(); 405 | } 406 | 407 | res.resize(2 * m_ncorr); 408 | if (m_ncorr < 1 || nP < 1) 409 | { 410 | res.setZero(); 411 | return false; 412 | } 413 | 414 | for (int j = 0; j < m_ncorr; j++) 415 | { 416 | Scalar resy = Scalar(0), ress = Scalar(0); 417 | const Scalar* yptr = &m_y(0, j); 418 | const Scalar* sptr = &m_s(0, j); 419 | for (int i = 0; i < nP; i++) 420 | { 421 | const int row = Pptr[i]; 422 | resy += yptr[row] * vptr[i]; 423 | ress += sptr[row] * vptr[i]; 424 | } 425 | res[j] = resy; 426 | res[m_ncorr + j] = ress; 427 | } 428 | res.tail(m_ncorr) *= m_theta; 429 | return true; 430 | } 431 | 432 | // Compute s * P'WMv 433 | // Assume that v[2*ncorr x 1] has the same ordering (permutation) as W and M 434 | // Returns false if the result is known to be zero 435 | inline bool apply_PtWMv(const IndexSet& P_set, const Vector& v, Vector& res, const Scalar& scale) const 436 | { 437 | const int nP = P_set.size(); 438 | res.resize(nP); 439 | res.setZero(); 440 | if (m_ncorr < 1 || nP < 1) 441 | return false; 442 | 443 | Vector Mv; 444 | apply_Mv(v, Mv); 445 | // WP * Mv 446 | Mv.tail(m_ncorr) *= m_theta; 447 | for (int j = 0; j < m_ncorr; j++) 448 | { 449 | const Scalar* yptr = &m_y(0, j); 450 | const Scalar* sptr = &m_s(0, j); 451 | const Scalar Mvy = Mv[j], Mvs = Mv[m_ncorr + j]; 452 | for (int i = 0; i < nP; i++) 453 | { 454 | const int row = P_set[i]; 455 | res[i] += Mvy * yptr[row] + Mvs * sptr[row]; 456 | } 457 | } 458 | res *= scale; 459 | return true; 460 | } 461 | // If the P'W matrix has been explicitly formed, do a direct matrix multiplication 462 | inline bool apply_PtWMv(const Matrix& WP, const Vector& v, Vector& res, const Scalar& scale) const 463 | { 464 | const int nP = WP.rows(); 465 | res.resize(nP); 466 | if (m_ncorr < 1 || nP < 1) 467 | 
{ 468 | res.setZero(); 469 | return false; 470 | } 471 | 472 | Vector Mv; 473 | apply_Mv(v, Mv); 474 | // WP * Mv 475 | Mv.tail(m_ncorr) *= m_theta; 476 | res.noalias() = scale * (WP * Mv); 477 | return true; 478 | } 479 | 480 | // Compute F'BAb = -(F'W)M(W'AA'd) 481 | // W'd is known, and AA'+FF'=I, so W'AA'd = W'd - W'FF'd 482 | // Usually d contains many zeros, so we fist compute number of nonzero elements in A set and F set, 483 | // denoted as nnz_act and nnz_fv, respectively 484 | // If nnz_act is smaller, compute W'AA'd = WA' (A'd) directly 485 | // If nnz_fv is smaller, compute W'AA'd = W'd - WF' * (F'd) 486 | inline void compute_FtBAb( 487 | const Matrix& WF, const IndexSet& fv_set, const IndexSet& newact_set, const Vector& Wd, const Vector& drt, 488 | Vector& res) const 489 | { 490 | const int nact = newact_set.size(); 491 | const int nfree = WF.rows(); 492 | res.resize(nfree); 493 | if (m_ncorr < 1 || nact < 1 || nfree < 1) 494 | { 495 | res.setZero(); 496 | return; 497 | } 498 | 499 | // W'AA'd 500 | Vector rhs(2 * m_ncorr); 501 | if (nact <= nfree) 502 | { 503 | // Construct A'd 504 | Vector Ad(nfree); 505 | for (int i = 0; i < nact; i++) 506 | Ad[i] = drt[newact_set[i]]; 507 | apply_WtPv(newact_set, Ad, rhs); 508 | } 509 | else 510 | { 511 | // Construct F'd 512 | Vector Fd(nfree); 513 | for (int i = 0; i < nfree; i++) 514 | Fd[i] = drt[fv_set[i]]; 515 | // Compute W'AA'd = W'd - WF' * (F'd) 516 | rhs.noalias() = WF.transpose() * Fd; 517 | rhs.tail(m_ncorr) *= m_theta; 518 | rhs.noalias() = Wd - rhs; 519 | } 520 | 521 | apply_PtWMv(WF, rhs, res, Scalar(-1)); 522 | } 523 | 524 | // Compute inv(P'BP) * v 525 | // P represents an index set 526 | // inv(P'BP) * v = v / theta + WP * inv(inv(M) - WP' * WP / theta) * WP' * v / theta^2 527 | // 528 | // v is [nP x 1] 529 | inline void solve_PtBP(const Matrix& WP, const Vector& v, Vector& res) const 530 | { 531 | const int nP = WP.rows(); 532 | res.resize(nP); 533 | if (m_ncorr < 1 || nP < 1) 534 | { 535 | res.noalias() = v / m_theta; 536 | return; 537 | } 538 | 539 | // Compute the matrix in the middle (only the lower triangular part is needed) 540 | // Remember that W = [Y, theta * S], but we do not store theta in WP 541 | Matrix mid(2 * m_ncorr, 2 * m_ncorr); 542 | // [0:(ncorr - 1), 0:(ncorr - 1)] 543 | for (int j = 0; j < m_ncorr; j++) 544 | { 545 | mid.col(j).segment(j, m_ncorr - j).noalias() = m_permMinv.col(j).segment(j, m_ncorr - j) - 546 | WP.block(0, j, nP, m_ncorr - j).transpose() * WP.col(j) / m_theta; 547 | } 548 | // [ncorr:(2 * ncorr - 1), 0:(ncorr - 1)] 549 | mid.block(m_ncorr, 0, m_ncorr, m_ncorr).noalias() = m_permMinv.block(m_m, 0, m_ncorr, m_ncorr) - 550 | WP.rightCols(m_ncorr).transpose() * WP.leftCols(m_ncorr); 551 | // [ncorr:(2 * ncorr - 1), ncorr:(2 * ncorr - 1)] 552 | for (int j = 0; j < m_ncorr; j++) 553 | { 554 | mid.col(m_ncorr + j).segment(m_ncorr + j, m_ncorr - j).noalias() = m_theta * 555 | (m_permMinv.col(m_m + j).segment(m_m + j, m_ncorr - j) - WP.rightCols(m_ncorr - j).transpose() * WP.col(m_ncorr + j)); 556 | } 557 | // Factorization 558 | BKLDLT midsolver(mid); 559 | // Compute the final result 560 | Vector WPv = WP.transpose() * v; 561 | WPv.tail(m_ncorr) *= m_theta; 562 | midsolver.solve_inplace(WPv); 563 | WPv.tail(m_ncorr) *= m_theta; 564 | res.noalias() = v / m_theta + (WP * WPv) / (m_theta * m_theta); 565 | } 566 | 567 | // Compute P'BQv, where P and Q are two mutually exclusive index selection operators 568 | // P'BQv = -WP * M * WQ' * v 569 | // Returns false if the result is known to be 
zero 570 | inline bool apply_PtBQv(const Matrix& WP, const IndexSet& Q_set, const Vector& v, Vector& res, bool test_zero = false) const 571 | { 572 | const int nP = WP.rows(); 573 | const int nQ = Q_set.size(); 574 | res.resize(nP); 575 | if (m_ncorr < 1 || nP < 1 || nQ < 1) 576 | { 577 | res.setZero(); 578 | return false; 579 | } 580 | 581 | Vector WQtv; 582 | bool nonzero = apply_WtPv(Q_set, v, WQtv, test_zero); 583 | if (!nonzero) 584 | { 585 | res.setZero(); 586 | return false; 587 | } 588 | 589 | Vector MWQtv; 590 | apply_Mv(WQtv, MWQtv); 591 | MWQtv.tail(m_ncorr) *= m_theta; 592 | res.noalias() = -WP * MWQtv; 593 | return true; 594 | } 595 | // If the Q'W matrix has been explicitly formed, do a direct matrix multiplication 596 | inline bool apply_PtBQv(const Matrix& WP, const Matrix& WQ, const Vector& v, Vector& res) const 597 | { 598 | const int nP = WP.rows(); 599 | const int nQ = WQ.rows(); 600 | res.resize(nP); 601 | if (m_ncorr < 1 || nP < 1 || nQ < 1) 602 | { 603 | res.setZero(); 604 | return false; 605 | } 606 | 607 | // Remember that W = [Y, theta * S], so we need to multiply theta to the second half 608 | Vector WQtv = WQ.transpose() * v; 609 | WQtv.tail(m_ncorr) *= m_theta; 610 | Vector MWQtv; 611 | apply_Mv(WQtv, MWQtv); 612 | MWQtv.tail(m_ncorr) *= m_theta; 613 | res.noalias() = -WP * MWQtv; 614 | return true; 615 | } 616 | }; 617 | 618 | } // namespace LBFGSpp 619 | 620 | /// \endcond 621 | 622 | #endif // LBFGSPP_BFGS_MAT_H 623 | -------------------------------------------------------------------------------- /include/LBFGSpp/BKLDLT.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2020-2025 Yixuan Qiu 2 | // Under MIT license 3 | 4 | #ifndef LBFGSPP_BK_LDLT_H 5 | #define LBFGSPP_BK_LDLT_H 6 | 7 | #include <vector> 8 | #include <stdexcept> 9 | #include <Eigen/Core> 10 | 11 | /// \cond 12 | 13 | namespace LBFGSpp { 14 | 15 | enum COMPUTATION_INFO 16 | { 17 | SUCCESSFUL = 0, 18 | NOT_COMPUTED, 19 | NUMERICAL_ISSUE 20 | }; 21 | 22 | // Bunch-Kaufman LDLT decomposition 23 | // References: 24 | // 1. Bunch, J. R., & Kaufman, L. (1977). Some stable methods for calculating inertia and solving symmetric linear systems. 25 | // Mathematics of computation, 31(137), 163-179. 26 | // 2. Golub, G. H., & Van Loan, C. F. (2012). Matrix computations (Vol. 3). JHU press. Section 4.4. 27 | // 3. Bunch-Parlett diagonal pivoting 28 | // 4. Ashcraft, C., Grimes, R. G., & Lewis, J. G. (1998). Accurate symmetric indefinite linear equation solvers. 29 | // SIAM Journal on Matrix Analysis and Applications, 20(2), 513-561.
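// Illustrative usage sketch for this class (hypothetical data, not part of the original source):
// factorize a small symmetric matrix and solve a linear system with it.
//
//     Eigen::MatrixXd A(3, 3);
//     A << 4, 1, 2,
//          1, 3, 0,
//          2, 0, 5;
//     Eigen::VectorXd b = Eigen::VectorXd::Ones(3);
//     LBFGSpp::BKLDLT<double> solver(A, Eigen::Lower);
//     if (solver.info() == LBFGSpp::SUCCESSFUL)
//     {
//         Eigen::VectorXd x = solver.solve(b);  // x satisfies A * x = b
//     }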
30 | template <typename Scalar = double> 31 | class BKLDLT 32 | { 33 | private: 34 | using Index = Eigen::Index; 35 | using Matrix = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>; 36 | using Vector = Eigen::Matrix<Scalar, Eigen::Dynamic, 1>; 37 | using MapVec = Eigen::Map<Vector>; 38 | using MapConstVec = Eigen::Map<const Vector>; 39 | 40 | using IntVector = Eigen::Matrix<Index, Eigen::Dynamic, 1>; 41 | using GenericVector = Eigen::Ref<Vector>; 42 | using GenericMatrix = Eigen::Ref<Matrix>; 43 | using ConstGenericMatrix = const Eigen::Ref<const Matrix>; 44 | using ConstGenericVector = const Eigen::Ref<const Vector>; 45 | 46 | Index m_n; 47 | Vector m_data; // storage for a lower-triangular matrix 48 | std::vector<Scalar*> m_colptr; // pointers to columns 49 | IntVector m_perm; // [-2, -1, 3, 1, 4, 5]: 0 <-> 2, 1 <-> 1, 2 <-> 3, 3 <-> 1, 4 <-> 4, 5 <-> 5 50 | std::vector<std::pair<Index, Index> > m_permc; // compressed version of m_perm: [(0, 2), (2, 3), (3, 1)] 51 | 52 | bool m_computed; 53 | int m_info; 54 | 55 | // Access to elements 56 | // Pointer to the k-th column 57 | Scalar* col_pointer(Index k) { return m_colptr[k]; } 58 | // A[i, j] -> m_colptr[j][i - j], i >= j 59 | Scalar& coeff(Index i, Index j) { return m_colptr[j][i - j]; } 60 | const Scalar& coeff(Index i, Index j) const { return m_colptr[j][i - j]; } 61 | // A[i, i] -> m_colptr[i][0] 62 | Scalar& diag_coeff(Index i) { return m_colptr[i][0]; } 63 | const Scalar& diag_coeff(Index i) const { return m_colptr[i][0]; } 64 | 65 | // Compute column pointers 66 | void compute_pointer() 67 | { 68 | m_colptr.clear(); 69 | m_colptr.reserve(m_n); 70 | Scalar* head = m_data.data(); 71 | 72 | for (Index i = 0; i < m_n; i++) 73 | { 74 | m_colptr.push_back(head); 75 | head += (m_n - i); 76 | } 77 | } 78 | 79 | // Copy mat - shift * I to m_data 80 | void copy_data(ConstGenericMatrix& mat, int uplo, const Scalar& shift) 81 | { 82 | if (uplo == Eigen::Lower) 83 | { 84 | for (Index j = 0; j < m_n; j++) 85 | { 86 | const Scalar* begin = &mat.coeffRef(j, j); 87 | const Index len = m_n - j; 88 | std::copy(begin, begin + len, col_pointer(j)); 89 | diag_coeff(j) -= shift; 90 | } 91 | } 92 | else 93 | { 94 | Scalar* dest = m_data.data(); 95 | for (Index i = 0; i < m_n; i++) 96 | { 97 | for (Index j = i; j < m_n; j++, dest++) 98 | { 99 | *dest = mat.coeff(i, j); 100 | } 101 | diag_coeff(i) -= shift; 102 | } 103 | } 104 | } 105 | 106 | // Compute compressed permutations 107 | void compress_permutation() 108 | { 109 | for (Index i = 0; i < m_n; i++) 110 | { 111 | // Recover the permutation action 112 | const Index perm = (m_perm[i] >= 0) ?
(m_perm[i]) : (-m_perm[i] - 1); 113 | if (perm != i) 114 | m_permc.push_back(std::make_pair(i, perm)); 115 | } 116 | } 117 | 118 | // Working on the A[k:end, k:end] submatrix 119 | // Exchange k <-> r 120 | // Assume r >= k 121 | void pivoting_1x1(Index k, Index r) 122 | { 123 | // No permutation 124 | if (k == r) 125 | { 126 | m_perm[k] = r; 127 | return; 128 | } 129 | 130 | // A[k, k] <-> A[r, r] 131 | std::swap(diag_coeff(k), diag_coeff(r)); 132 | 133 | // A[(r+1):end, k] <-> A[(r+1):end, r] 134 | std::swap_ranges(&coeff(r + 1, k), col_pointer(k + 1), &coeff(r + 1, r)); 135 | 136 | // A[(k+1):(r-1), k] <-> A[r, (k+1):(r-1)] 137 | Scalar* src = &coeff(k + 1, k); 138 | for (Index j = k + 1; j < r; j++, src++) 139 | { 140 | std::swap(*src, coeff(r, j)); 141 | } 142 | 143 | m_perm[k] = r; 144 | } 145 | 146 | // Working on the A[k:end, k:end] submatrix 147 | // Exchange [k+1, k] <-> [r, p] 148 | // Assume p >= k, r >= k+1 149 | void pivoting_2x2(Index k, Index r, Index p) 150 | { 151 | pivoting_1x1(k, p); 152 | pivoting_1x1(k + 1, r); 153 | 154 | // A[k+1, k] <-> A[r, k] 155 | std::swap(coeff(k + 1, k), coeff(r, k)); 156 | 157 | // Use negative signs to indicate a 2x2 block 158 | // Also minus one to distinguish a negative zero from a positive zero 159 | m_perm[k] = -m_perm[k] - 1; 160 | m_perm[k + 1] = -m_perm[k + 1] - 1; 161 | } 162 | 163 | // A[r1, c1:c2] <-> A[r2, c1:c2] 164 | // Assume r2 >= r1 > c2 >= c1 165 | void interchange_rows(Index r1, Index r2, Index c1, Index c2) 166 | { 167 | if (r1 == r2) 168 | return; 169 | 170 | for (Index j = c1; j <= c2; j++) 171 | { 172 | std::swap(coeff(r1, j), coeff(r2, j)); 173 | } 174 | } 175 | 176 | // lambda = |A[r, k]| = max{|A[k+1, k]|, ..., |A[end, k]|} 177 | // Largest (in magnitude) off-diagonal element in the first column of the current reduced matrix 178 | // r is the row index 179 | // Assume k < end 180 | Scalar find_lambda(Index k, Index& r) 181 | { 182 | using std::abs; 183 | 184 | const Scalar* head = col_pointer(k); // => A[k, k] 185 | const Scalar* end = col_pointer(k + 1); 186 | // Start with r=k+1, lambda=A[k+1, k] 187 | r = k + 1; 188 | Scalar lambda = abs(head[1]); 189 | // Scan remaining elements 190 | for (const Scalar* ptr = head + 2; ptr < end; ptr++) 191 | { 192 | const Scalar abs_elem = abs(*ptr); 193 | if (lambda < abs_elem) 194 | { 195 | lambda = abs_elem; 196 | r = k + (ptr - head); 197 | } 198 | } 199 | 200 | return lambda; 201 | } 202 | 203 | // sigma = |A[p, r]| = max {|A[k, r]|, ..., |A[end, r]|} \ {A[r, r]} 204 | // Largest (in magnitude) off-diagonal element in the r-th column of the current reduced matrix 205 | // p is the row index 206 | // Assume k < r < end 207 | Scalar find_sigma(Index k, Index r, Index& p) 208 | { 209 | using std::abs; 210 | 211 | // First search A[r+1, r], ..., A[end, r], which has the same task as find_lambda() 212 | // If r == end, we skip this search 213 | Scalar sigma = Scalar(-1); 214 | if (r < m_n - 1) 215 | sigma = find_lambda(r, p); 216 | 217 | // Then search A[k, r], ..., A[r-1, r], which maps to A[r, k], ..., A[r, r-1] 218 | for (Index j = k; j < r; j++) 219 | { 220 | const Scalar abs_elem = abs(coeff(r, j)); 221 | if (sigma < abs_elem) 222 | { 223 | sigma = abs_elem; 224 | p = j; 225 | } 226 | } 227 | 228 | return sigma; 229 | } 230 | 231 | // Generate permutations and apply to A 232 | // Return true if the resulting pivoting is 1x1, and false if 2x2 233 | bool permutate_mat(Index k, const Scalar& alpha) 234 | { 235 | using std::abs; 236 | 237 | Index r = k, p = k; 238 | const 
Scalar lambda = find_lambda(k, r); 239 | 240 | // If lambda=0, no need to interchange 241 | if (lambda > Scalar(0)) 242 | { 243 | const Scalar abs_akk = abs(diag_coeff(k)); 244 | // If |A[k, k]| >= alpha * lambda, no need to interchange 245 | if (abs_akk < alpha * lambda) 246 | { 247 | const Scalar sigma = find_sigma(k, r, p); 248 | 249 | // If sigma * |A[k, k]| >= alpha * lambda^2, no need to interchange 250 | if (sigma * abs_akk < alpha * lambda * lambda) 251 | { 252 | if (abs_akk >= alpha * sigma) 253 | { 254 | // Permutation on A 255 | pivoting_1x1(k, r); 256 | 257 | // Permutation on L 258 | interchange_rows(k, r, 0, k - 1); 259 | return true; 260 | } 261 | else 262 | { 263 | // There are two versions of permutation here 264 | // 1. A[k+1, k] <-> A[r, k] 265 | // 2. A[k+1, k] <-> A[r, p], where p >= k and r >= k+1 266 | // 267 | // Version 1 and 2 are used by Ref[1] and Ref[2], respectively 268 | 269 | // Version 1 implementation 270 | p = k; 271 | 272 | // Version 2 implementation 273 | // [r, p] and [p, r] are symmetric, but we need to make sure 274 | // p >= k and r >= k+1, so it is safe to always make r > p 275 | // One exception is when min{r,p} == k+1, in which case we make 276 | // r = k+1, so that only one permutation needs to be performed 277 | /* const Index rp_min = std::min(r, p); 278 | const Index rp_max = std::max(r, p); 279 | if(rp_min == k + 1) 280 | { 281 | r = rp_min; p = rp_max; 282 | } else { 283 | r = rp_max; p = rp_min; 284 | } */ 285 | 286 | // Right now we use Version 1 since it reduces the overhead of interchange 287 | 288 | // Permutation on A 289 | pivoting_2x2(k, r, p); 290 | // Permutation on L 291 | interchange_rows(k, p, 0, k - 1); 292 | interchange_rows(k + 1, r, 0, k - 1); 293 | return false; 294 | } 295 | } 296 | } 297 | } 298 | 299 | return true; 300 | } 301 | 302 | // E = [e11, e12] 303 | // [e21, e22] 304 | // Overwrite E with inv(E) 305 | void inverse_inplace_2x2(Scalar& e11, Scalar& e21, Scalar& e22) const 306 | { 307 | // inv(E) = [d11, d12], d11 = e22/delta, d21 = -e21/delta, d22 = e11/delta 308 | // [d21, d22] 309 | const Scalar delta = e11 * e22 - e21 * e21; 310 | std::swap(e11, e22); 311 | e11 /= delta; 312 | e22 /= delta; 313 | e21 = -e21 / delta; 314 | } 315 | 316 | // Return value is the status, SUCCESSFUL/NUMERICAL_ISSUE 317 | int gaussian_elimination_1x1(Index k) 318 | { 319 | // D = 1 / A[k, k] 320 | const Scalar akk = diag_coeff(k); 321 | // Return NUMERICAL_ISSUE if not invertible 322 | if (akk == Scalar(0)) 323 | return NUMERICAL_ISSUE; 324 | 325 | diag_coeff(k) = Scalar(1) / akk; 326 | 327 | // B -= l * l' / A[k, k], B := A[(k+1):end, (k+1):end], l := L[(k+1):end, k] 328 | Scalar* lptr = col_pointer(k) + 1; 329 | const Index ldim = m_n - k - 1; 330 | MapVec l(lptr, ldim); 331 | for (Index j = 0; j < ldim; j++) 332 | { 333 | MapVec(col_pointer(j + k + 1), ldim - j).noalias() -= (lptr[j] / akk) * l.tail(ldim - j); 334 | } 335 | 336 | // l /= A[k, k] 337 | l /= akk; 338 | 339 | return SUCCESSFUL; 340 | } 341 | 342 | // Return value is the status, SUCCESSFUL/NUMERICAL_ISSUE 343 | int gaussian_elimination_2x2(Index k) 344 | { 345 | // D = inv(E) 346 | Scalar& e11 = diag_coeff(k); 347 | Scalar& e21 = coeff(k + 1, k); 348 | Scalar& e22 = diag_coeff(k + 1); 349 | // Return NUMERICAL_ISSUE if not invertible 350 | if (e11 * e22 - e21 * e21 == Scalar(0)) 351 | return NUMERICAL_ISSUE; 352 | 353 | inverse_inplace_2x2(e11, e21, e22); 354 | 355 | // X = l * inv(E), l := L[(k+2):end, k:(k+1)] 356 | Scalar* l1ptr = &coeff(k + 2, k); 357 | Scalar* 
l2ptr = &coeff(k + 2, k + 1); 358 | const Index ldim = m_n - k - 2; 359 | MapVec l1(l1ptr, ldim), l2(l2ptr, ldim); 360 | 361 | Eigen::Matrix X(ldim, 2); 362 | X.col(0).noalias() = l1 * e11 + l2 * e21; 363 | X.col(1).noalias() = l1 * e21 + l2 * e22; 364 | 365 | // B -= l * inv(E) * l' = X * l', B = A[(k+2):end, (k+2):end] 366 | for (Index j = 0; j < ldim; j++) 367 | { 368 | MapVec(col_pointer(j + k + 2), ldim - j).noalias() -= (X.col(0).tail(ldim - j) * l1ptr[j] + X.col(1).tail(ldim - j) * l2ptr[j]); 369 | } 370 | 371 | // l = X 372 | l1.noalias() = X.col(0); 373 | l2.noalias() = X.col(1); 374 | 375 | return SUCCESSFUL; 376 | } 377 | 378 | public: 379 | BKLDLT() : 380 | m_n(0), m_computed(false), m_info(NOT_COMPUTED) 381 | {} 382 | 383 | // Factorize mat - shift * I 384 | BKLDLT(ConstGenericMatrix& mat, int uplo = Eigen::Lower, const Scalar& shift = Scalar(0)) : 385 | m_n(mat.rows()), m_computed(false), m_info(NOT_COMPUTED) 386 | { 387 | compute(mat, uplo, shift); 388 | } 389 | 390 | void compute(ConstGenericMatrix& mat, int uplo = Eigen::Lower, const Scalar& shift = Scalar(0)) 391 | { 392 | using std::abs; 393 | 394 | m_n = mat.rows(); 395 | if (m_n != mat.cols()) 396 | throw std::invalid_argument("BKLDLT: matrix must be square"); 397 | 398 | m_perm.setLinSpaced(m_n, 0, m_n - 1); 399 | m_permc.clear(); 400 | 401 | // Copy data 402 | m_data.resize((m_n * (m_n + 1)) / 2); 403 | compute_pointer(); 404 | copy_data(mat, uplo, shift); 405 | 406 | const Scalar alpha = (1.0 + std::sqrt(17.0)) / 8.0; 407 | Index k = 0; 408 | for (k = 0; k < m_n - 1; k++) 409 | { 410 | // 1. Interchange rows and columns of A, and save the result to m_perm 411 | bool is_1x1 = permutate_mat(k, alpha); 412 | 413 | // 2. Gaussian elimination 414 | if (is_1x1) 415 | { 416 | m_info = gaussian_elimination_1x1(k); 417 | } 418 | else 419 | { 420 | m_info = gaussian_elimination_2x2(k); 421 | k++; 422 | } 423 | 424 | // 3. Check status 425 | if (m_info != SUCCESSFUL) 426 | break; 427 | } 428 | // Invert the last 1x1 block if it exists 429 | if (k == m_n - 1) 430 | { 431 | const Scalar akk = diag_coeff(k); 432 | if (akk == Scalar(0)) 433 | m_info = NUMERICAL_ISSUE; 434 | 435 | diag_coeff(k) = Scalar(1) / diag_coeff(k); 436 | } 437 | 438 | compress_permutation(); 439 | 440 | m_computed = true; 441 | } 442 | 443 | // Solve Ax=b 444 | void solve_inplace(GenericVector b) const 445 | { 446 | if (!m_computed) 447 | throw std::logic_error("BKLDLT: need to call compute() first"); 448 | 449 | // PAP' = LDL' 450 | // 1. b -> Pb 451 | Scalar* x = b.data(); 452 | MapVec res(x, m_n); 453 | Index npermc = m_permc.size(); 454 | for (Index i = 0; i < npermc; i++) 455 | { 456 | std::swap(x[m_permc[i].first], x[m_permc[i].second]); 457 | } 458 | 459 | // 2. Lz = Pb 460 | // If m_perm[end] < 0, then end with m_n - 3, otherwise end with m_n - 2 461 | const Index end = (m_perm[m_n - 1] < 0) ? (m_n - 3) : (m_n - 2); 462 | for (Index i = 0; i <= end; i++) 463 | { 464 | const Index b1size = m_n - i - 1; 465 | const Index b2size = b1size - 1; 466 | if (m_perm[i] >= 0) 467 | { 468 | MapConstVec l(&coeff(i + 1, i), b1size); 469 | res.segment(i + 1, b1size).noalias() -= l * x[i]; 470 | } 471 | else 472 | { 473 | MapConstVec l1(&coeff(i + 2, i), b2size); 474 | MapConstVec l2(&coeff(i + 2, i + 1), b2size); 475 | res.segment(i + 2, b2size).noalias() -= (l1 * x[i] + l2 * x[i + 1]); 476 | i++; 477 | } 478 | } 479 | 480 | // 3. 
Dw = z 481 | for (Index i = 0; i < m_n; i++) 482 | { 483 | const Scalar e11 = diag_coeff(i); 484 | if (m_perm[i] >= 0) 485 | { 486 | x[i] *= e11; 487 | } 488 | else 489 | { 490 | const Scalar e21 = coeff(i + 1, i), e22 = diag_coeff(i + 1); 491 | const Scalar wi = x[i] * e11 + x[i + 1] * e21; 492 | x[i + 1] = x[i] * e21 + x[i + 1] * e22; 493 | x[i] = wi; 494 | i++; 495 | } 496 | } 497 | 498 | // 4. L'y = w 499 | // If m_perm[end] < 0, then start with m_n - 3, otherwise start with m_n - 2 500 | Index i = (m_perm[m_n - 1] < 0) ? (m_n - 3) : (m_n - 2); 501 | for (; i >= 0; i--) 502 | { 503 | const Index ldim = m_n - i - 1; 504 | MapConstVec l(&coeff(i + 1, i), ldim); 505 | x[i] -= res.segment(i + 1, ldim).dot(l); 506 | 507 | if (m_perm[i] < 0) 508 | { 509 | MapConstVec l2(&coeff(i + 1, i - 1), ldim); 510 | x[i - 1] -= res.segment(i + 1, ldim).dot(l2); 511 | i--; 512 | } 513 | } 514 | 515 | // 5. x = P'y 516 | for (i = npermc - 1; i >= 0; i--) 517 | { 518 | std::swap(x[m_permc[i].first], x[m_permc[i].second]); 519 | } 520 | } 521 | 522 | Vector solve(ConstGenericVector& b) const 523 | { 524 | Vector res = b; 525 | solve_inplace(res); 526 | return res; 527 | } 528 | 529 | int info() const { return m_info; } 530 | }; 531 | 532 | } // namespace LBFGSpp 533 | 534 | /// \endcond 535 | 536 | #endif // LBFGSPP_BK_LDLT_H 537 | -------------------------------------------------------------------------------- /include/LBFGSpp/Cauchy.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2020-2025 Yixuan Qiu 2 | // Under MIT license 3 | 4 | #ifndef LBFGSPP_CAUCHY_H 5 | #define LBFGSPP_CAUCHY_H 6 | 7 | #include <vector> 8 | #include <Eigen/Core> 9 | #include "BFGSMat.h" 10 | 11 | /// \cond 12 | 13 | namespace LBFGSpp { 14 | 15 | // 16 | // Class to compute the generalized Cauchy point (GCP) for the L-BFGS-B algorithm, 17 | // mainly for internal use. 18 | // 19 | // The target of the GCP procedure is to find a step size t such that 20 | // x(t) = x0 - t * g is a local minimum of the quadratic function m(x), 21 | // where m(x) is a local approximation to the objective function. 22 | // 23 | // First determine a sequence of break points t0=0, t1, t2, ..., tn. 24 | // On each interval [t[i-1], t[i]], x is changing linearly. 25 | // After passing a break point, one or more coordinates of x will be fixed at the bounds. 26 | // We search the first local minimum of m(x) by examining the intervals [t[i-1], t[i]] sequentially. 27 | // 28 | // Reference: 29 | // [1] R. H. Byrd, P. Lu, and J. Nocedal (1995). A limited memory algorithm for bound constrained optimization. 30 | // 31 | template <typename Scalar> 32 | class ArgSort 33 | { 34 | private: 35 | using Vector = Eigen::Matrix<Scalar, Eigen::Dynamic, 1>; 36 | using IndexSet = std::vector<int>; 37 | 38 | const Scalar* values; 39 | 40 | public: 41 | ArgSort(const Vector& value_vec) : 42 | values(value_vec.data()) 43 | {} 44 | 45 | inline bool operator()(int key1, int key2) { return values[key1] < values[key2]; } 46 | inline void sort_key(IndexSet& key_vec) const 47 | { 48 | std::sort(key_vec.begin(), key_vec.end(), *this); 49 | } 50 | }; 51 | 52 | template <typename Scalar> 53 | class Cauchy 54 | { 55 | private: 56 | typedef Eigen::Matrix<Scalar, Eigen::Dynamic, 1> Vector; 57 | typedef Eigen::Matrix<int, Eigen::Dynamic, 1> IntVector; 58 | typedef Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic> Matrix; 59 | typedef std::vector<int> IndexSet; 60 | 61 | // Find the smallest index i such that brk[ord[i]] > t, assuming brk[ord] is already sorted. 62 | // If the return value equals n, then all values are <= t.
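// Illustrative sketch of the ArgSort helper above (hypothetical numbers, not part of the original source):
//
//     Eigen::VectorXd brk(4);
//     brk << 0.7, 0.1, 2.5, 1.3;
//     std::vector<int> ord = {0, 1, 2, 3};
//     ArgSort<double> sorting(brk);
//     sorting.sort_key(ord);
//     // ord is now {1, 0, 3, 2}, so brk[ord[0]] <= brk[ord[1]] <= brk[ord[2]] <= brk[ord[3]]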
63 | static int search_greater(const Vector& brk, const IndexSet& ord, const Scalar& t, int start = 0) 64 | { 65 | const int nord = ord.size(); 66 | int i; 67 | for (i = start; i < nord; i++) 68 | { 69 | if (brk[ord[i]] > t) 70 | break; 71 | } 72 | 73 | return i; 74 | } 75 | 76 | public: 77 | // bfgs: An object that represents the BFGS approximation matrix. 78 | // x0: Current parameter vector. 79 | // g: Gradient at x0. 80 | // lb: Lower bounds for x. 81 | // ub: Upper bounds for x. 82 | // xcp: The output generalized Cauchy point. 83 | // vecc: c = W'(xcp - x0), used in the subspace minimization routine. 84 | // newact_set: Coordinates that newly become active during the GCP procedure. 85 | // fv_set: Free variable set. 86 | static void get_cauchy_point( 87 | const BFGSMat& bfgs, const Vector& x0, const Vector& g, const Vector& lb, const Vector& ub, 88 | Vector& xcp, Vector& vecc, IndexSet& newact_set, IndexSet& fv_set) 89 | { 90 | // std::cout << "========================= Entering GCP search =========================\n\n"; 91 | 92 | // Initialization 93 | const int n = x0.size(); 94 | xcp.resize(n); 95 | xcp.noalias() = x0; 96 | vecc.resize(2 * bfgs.num_corrections()); 97 | vecc.setZero(); 98 | newact_set.clear(); 99 | newact_set.reserve(n); 100 | fv_set.clear(); 101 | fv_set.reserve(n); 102 | 103 | // Construct break points 104 | Vector brk(n), vecd(n); 105 | // If brk[i] == 0, i belongs to active set 106 | // If brk[i] == Inf, i belongs to free variable set 107 | // Others are currently undecided 108 | IndexSet ord; 109 | ord.reserve(n); 110 | const Scalar inf = std::numeric_limits::infinity(); 111 | for (int i = 0; i < n; i++) 112 | { 113 | if (lb[i] == ub[i]) 114 | brk[i] = Scalar(0); 115 | else if (g[i] < Scalar(0)) 116 | brk[i] = (x0[i] - ub[i]) / g[i]; 117 | else if (g[i] > Scalar(0)) 118 | brk[i] = (x0[i] - lb[i]) / g[i]; 119 | else 120 | brk[i] = inf; 121 | 122 | const bool iszero = (brk[i] == Scalar(0)); 123 | vecd[i] = iszero ? Scalar(0) : -g[i]; 124 | 125 | if (brk[i] == inf) 126 | fv_set.push_back(i); 127 | else if (!iszero) 128 | ord.push_back(i); 129 | } 130 | 131 | // Sort indices of break points 132 | ArgSort sorting(brk); 133 | sorting.sort_key(ord); 134 | 135 | // Break points `brko := brk[ord]` are in increasing order 136 | // `ord` contains the coordinates that define the corresponding break points 137 | // brk[i] == 0 <=> The i-th coordinate is on the boundary 138 | const int nord = ord.size(); 139 | const int nfree = fv_set.size(); 140 | if ((nfree < 1) && (nord < 1)) 141 | { 142 | /* std::cout << "** All coordinates at boundary **\n"; 143 | std::cout << "\n========================= Leaving GCP search =========================\n\n"; */ 144 | return; 145 | } 146 | 147 | // First interval: [il=0, iu=brk[ord[0]]] 148 | // In case ord is empty, we take iu=Inf 149 | 150 | // p = W'd, c = 0 151 | Vector vecp; 152 | bfgs.apply_Wtv(vecd, vecp); 153 | // f' = -d'd 154 | Scalar fp = -vecd.squaredNorm(); 155 | // f'' = -theta * f' - p'Mp 156 | Vector cache; 157 | bfgs.apply_Mv(vecp, cache); // cache = Mp 158 | Scalar fpp = -bfgs.theta() * fp - vecp.dot(cache); 159 | 160 | // Theoretical step size to move 161 | Scalar deltatmin = -fp / fpp; 162 | 163 | // Limit on the current interval 164 | Scalar il = Scalar(0); 165 | // We have excluded the case that max(brk) <= 0 166 | int b = 0; 167 | Scalar iu = (nord < 1) ? 
inf : brk[ord[b]]; 168 | Scalar deltat = iu - il; 169 | 170 | /* int iter = 0; 171 | std::cout << "** Iter " << iter << " **\n"; 172 | std::cout << " fp = " << fp << ", fpp = " << fpp << ", deltatmin = " << deltatmin << std::endl; 173 | std::cout << " il = " << il << ", iu = " << iu << ", deltat = " << deltat << std::endl; */ 174 | 175 | // If deltatmin >= deltat, we need to do the following things: 176 | // 1. Update vecc 177 | // 2. Since we are going to cross iu, the coordinates that define iu become active 178 | // 3. Update some quantities on these new active coordinates (xcp, vecd, vecp) 179 | // 4. Move to the next interval and compute the new deltatmin 180 | bool crossed_all = false; 181 | const int ncorr = bfgs.num_corrections(); 182 | Vector wact(2 * ncorr); 183 | while (deltatmin >= deltat) 184 | { 185 | // Step 1 186 | vecc.noalias() += deltat * vecp; 187 | 188 | // Step 2 189 | // First check how many coordinates will be active when we cross the previous iu 190 | // b is the smallest number such that brko[b] == iu 191 | // Let bp be the largest number such that brko[bp] == iu 192 | // Then coordinates ord[b] to ord[bp] will be active 193 | const int act_begin = b; 194 | const int act_end = search_greater(brk, ord, iu, b) - 1; 195 | 196 | // If nfree == 0 and act_end == nord-1, then we have crossed all coordinates 197 | // We only need to update xcp from ord[b] to ord[bp], and then exit 198 | if ((nfree == 0) && (act_end == nord - 1)) 199 | { 200 | // std::cout << "** [ "; 201 | for (int i = act_begin; i <= act_end; i++) 202 | { 203 | const int act = ord[i]; 204 | xcp[act] = (vecd[act] > Scalar(0)) ? ub[act] : lb[act]; 205 | newact_set.push_back(act); 206 | // std::cout << act + 1 << " "; 207 | } 208 | // std::cout << "] become active **\n\n"; 209 | // std::cout << "** All break points visited **\n\n"; 210 | 211 | crossed_all = true; 212 | break; 213 | } 214 | 215 | // Step 3 216 | // Update xcp and d on active coordinates 217 | // std::cout << "** [ "; 218 | fp += deltat * fpp; 219 | for (int i = act_begin; i <= act_end; i++) 220 | { 221 | const int act = ord[i]; 222 | xcp[act] = (vecd[act] > Scalar(0)) ? ub[act] : lb[act]; 223 | // z = xcp - x0 224 | const Scalar zact = xcp[act] - x0[act]; 225 | const Scalar gact = g[act]; 226 | const Scalar ggact = gact * gact; 227 | wact.noalias() = bfgs.Wb(act); 228 | bfgs.apply_Mv(wact, cache); // cache = Mw 229 | fp += ggact + bfgs.theta() * gact * zact - gact * cache.dot(vecc); 230 | fpp -= (bfgs.theta() * ggact + 2 * gact * cache.dot(vecp) + ggact * cache.dot(wact)); 231 | vecp.noalias() += gact * wact; 232 | vecd[act] = Scalar(0); 233 | newact_set.push_back(act); 234 | // std::cout << act + 1 << " "; 235 | } 236 | // std::cout << "] become active **\n\n"; 237 | 238 | // Step 4 239 | // Theoretical step size to move 240 | deltatmin = -fp / fpp; 241 | // Update interval bound 242 | il = iu; 243 | b = act_end + 1; 244 | // If we have visited all finite-valued break points, and have not exited earlier, 245 | // then the next iu will be infinity. 
Simply exit the loop now 246 | if (b >= nord) 247 | break; 248 | iu = brk[ord[b]]; 249 | // Width of the current interval 250 | deltat = iu - il; 251 | 252 | /* iter++; 253 | std::cout << "** Iter " << iter << " **\n"; 254 | std::cout << " fp = " << fp << ", fpp = " << fpp << ", deltatmin = " << deltatmin << std::endl; 255 | std::cout << " il = " << il << ", iu = " << iu << ", deltat = " << deltat << std::endl; */ 256 | } 257 | 258 | // In some rare cases fpp is numerically zero, making deltatmin equal to Inf 259 | // If this happens, force fpp to be the machine precision 260 | const Scalar eps = std::numeric_limits<Scalar>::epsilon(); 261 | if (fpp < eps) 262 | deltatmin = -fp / eps; 263 | 264 | // Last step 265 | if (!crossed_all) 266 | { 267 | deltatmin = std::max(deltatmin, Scalar(0)); 268 | vecc.noalias() += deltatmin * vecp; 269 | const Scalar tfinal = il + deltatmin; 270 | // Update xcp on free variable coordinates 271 | for (int i = 0; i < nfree; i++) 272 | { 273 | const int coord = fv_set[i]; 274 | xcp[coord] = x0[coord] + tfinal * vecd[coord]; 275 | } 276 | for (int i = b; i < nord; i++) 277 | { 278 | const int coord = ord[i]; 279 | xcp[coord] = x0[coord] + tfinal * vecd[coord]; 280 | fv_set.push_back(coord); 281 | } 282 | } 283 | // std::cout << "\n========================= Leaving GCP search =========================\n\n"; 284 | } 285 | }; 286 | 287 | } // namespace LBFGSpp 288 | 289 | /// \endcond 290 | 291 | #endif // LBFGSPP_CAUCHY_H 292 | -------------------------------------------------------------------------------- /include/LBFGSpp/LineSearchBacktracking.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2016-2025 Yixuan Qiu 2 | // Under MIT license 3 | 4 | #ifndef LBFGSPP_LINE_SEARCH_BACKTRACKING_H 5 | #define LBFGSPP_LINE_SEARCH_BACKTRACKING_H 6 | 7 | #include <Eigen/Core> 8 | #include <stdexcept> // std::runtime_error 9 | #include "Param.h" 10 | 11 | namespace LBFGSpp { 12 | 13 | /// 14 | /// The backtracking line search algorithm for L-BFGS. Mainly for internal use. 15 | /// 16 | template <typename Scalar> 17 | class LineSearchBacktracking 18 | { 19 | private: 20 | using Vector = Eigen::Matrix<Scalar, Eigen::Dynamic, 1>; 21 | 22 | public: 23 | /// 24 | /// Line search by backtracking. 25 | /// 26 | /// \param f A function object such that `f(x, grad)` returns the 27 | /// objective function value at `x`, and overwrites `grad` with 28 | /// the gradient. 29 | /// \param param Parameters for the L-BFGS algorithm. 30 | /// \param xp The current point. 31 | /// \param drt The current moving direction. 32 | /// \param step_max The upper bound for the step size that makes x feasible. 33 | /// Can be ignored for the L-BFGS solver. 34 | /// \param step In: The initial step length. 35 | /// Out: The calculated step length. 36 | /// \param fx In: The objective function value at the current point. 37 | /// Out: The function value at the new point. 38 | /// \param grad In: The current gradient vector. 39 | /// Out: The gradient at the new point. 40 | /// \param dg In: The inner product between drt and grad. 41 | /// Out: The inner product between drt and the new gradient. 42 | /// \param x Out: The new point moved to.
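/// A minimal sketch of a function object compatible with the `f(x, grad)` interface assumed
/// above (hypothetical quadratic objective, not part of the original source):
///
///     struct Quadratic
///     {
///         double operator()(const Eigen::VectorXd& x, Eigen::VectorXd& grad)
///         {
///             grad.noalias() = 2.0 * x;  // gradient of f(x) = ||x||^2
///             return x.squaredNorm();
///         }
///     };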
43 | /// 44 | template 45 | static void LineSearch(Foo& f, const LBFGSParam& param, 46 | const Vector& xp, const Vector& drt, const Scalar& step_max, 47 | Scalar& step, Scalar& fx, Vector& grad, Scalar& dg, Vector& x) 48 | { 49 | // Decreasing and increasing factors 50 | const Scalar dec = 0.5; 51 | const Scalar inc = 2.1; 52 | 53 | // Check the value of step 54 | if (step <= Scalar(0)) 55 | throw std::invalid_argument("'step' must be positive"); 56 | 57 | // Save the function value at the current x 58 | const Scalar fx_init = fx; 59 | // Projection of gradient on the search direction 60 | const Scalar dg_init = grad.dot(drt); 61 | // Make sure d points to a descent direction 62 | if (dg_init > 0) 63 | throw std::logic_error("the moving direction increases the objective function value"); 64 | 65 | const Scalar test_decr = param.ftol * dg_init; 66 | Scalar width; 67 | 68 | int iter; 69 | for (iter = 0; iter < param.max_linesearch; iter++) 70 | { 71 | // x_{k+1} = x_k + step * d_k 72 | x.noalias() = xp + step * drt; 73 | // Evaluate this candidate 74 | fx = f(x, grad); 75 | 76 | if (fx > fx_init + step * test_decr || (fx != fx)) 77 | { 78 | width = dec; 79 | } 80 | else 81 | { 82 | dg = grad.dot(drt); 83 | 84 | // Armijo condition is met 85 | if (param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_ARMIJO) 86 | break; 87 | 88 | if (dg < param.wolfe * dg_init) 89 | { 90 | width = inc; 91 | } 92 | else 93 | { 94 | // Regular Wolfe condition is met 95 | if (param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE) 96 | break; 97 | 98 | if (dg > -param.wolfe * dg_init) 99 | { 100 | width = dec; 101 | } 102 | else 103 | { 104 | // Strong Wolfe condition is met 105 | break; 106 | } 107 | } 108 | } 109 | 110 | if (step < param.min_step) 111 | throw std::runtime_error("the line search step became smaller than the minimum value allowed"); 112 | 113 | if (step > param.max_step) 114 | throw std::runtime_error("the line search step became larger than the maximum value allowed"); 115 | 116 | step *= width; 117 | } 118 | 119 | if (iter >= param.max_linesearch) 120 | throw std::runtime_error("the line search routine reached the maximum number of iterations"); 121 | } 122 | }; 123 | 124 | } // namespace LBFGSpp 125 | 126 | #endif // LBFGSPP_LINE_SEARCH_BACKTRACKING_H 127 | -------------------------------------------------------------------------------- /include/LBFGSpp/LineSearchBracketing.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2016-2025 Yixuan Qiu 2 | // Copyright (C) 2016-2025 Dirk Toewe 3 | // Under MIT license 4 | 5 | #ifndef LBFGSPP_LINE_SEARCH_BRACKETING_H 6 | #define LBFGSPP_LINE_SEARCH_BRACKETING_H 7 | 8 | #include 9 | #include // std::runtime_error 10 | #include "Param.h" 11 | 12 | namespace LBFGSpp { 13 | 14 | /// 15 | /// The bracketing line search algorithm for L-BFGS. Mainly for internal use. 16 | /// 17 | template 18 | class LineSearchBracketing 19 | { 20 | private: 21 | using Vector = Eigen::Matrix; 22 | 23 | public: 24 | /// 25 | /// Line search by bracketing. Similar to the backtracking line search 26 | /// except that it actively maintains an upper and lower bound of the 27 | /// current search range. 28 | /// 29 | /// \param f A function object such that `f(x, grad)` returns the 30 | /// objective function value at `x`, and overwrites `grad` with 31 | /// the gradient. 32 | /// \param param Parameters for the L-BFGS algorithm. 33 | /// \param xp The current point. 34 | /// \param drt The current moving direction. 
35 | /// \param step_max The upper bound for the step size that makes x feasible. 36 | /// Can be ignored for the L-BFGS solver. 37 | /// \param step In: The initial step length. 38 | /// Out: The calculated step length. 39 | /// \param fx In: The objective function value at the current point. 40 | /// Out: The function value at the new point. 41 | /// \param grad In: The current gradient vector. 42 | /// Out: The gradient at the new point. 43 | /// \param dg In: The inner product between drt and grad. 44 | /// Out: The inner product between drt and the new gradient. 45 | /// \param x Out: The new point moved to. 46 | /// 47 | template 48 | static void LineSearch(Foo& f, const LBFGSParam& param, 49 | const Vector& xp, const Vector& drt, const Scalar& step_max, 50 | Scalar& step, Scalar& fx, Vector& grad, Scalar& dg, Vector& x) 51 | { 52 | // Check the value of step 53 | if (step <= Scalar(0)) 54 | throw std::invalid_argument("'step' must be positive"); 55 | 56 | // Save the function value at the current x 57 | const Scalar fx_init = fx; 58 | // Projection of gradient on the search direction 59 | const Scalar dg_init = grad.dot(drt); 60 | // Make sure d points to a descent direction 61 | if (dg_init > 0) 62 | throw std::logic_error("the moving direction increases the objective function value"); 63 | 64 | const Scalar test_decr = param.ftol * dg_init; 65 | 66 | // Upper and lower end of the current line search range 67 | Scalar step_lo = 0, 68 | step_hi = std::numeric_limits::infinity(); 69 | 70 | int iter; 71 | for (iter = 0; iter < param.max_linesearch; iter++) 72 | { 73 | // x_{k+1} = x_k + step * d_k 74 | x.noalias() = xp + step * drt; 75 | // Evaluate this candidate 76 | fx = f(x, grad); 77 | 78 | if (fx > fx_init + step * test_decr || (fx != fx)) 79 | { 80 | step_hi = step; 81 | } 82 | else 83 | { 84 | dg = grad.dot(drt); 85 | 86 | // Armijo condition is met 87 | if (param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_ARMIJO) 88 | break; 89 | 90 | if (dg < param.wolfe * dg_init) 91 | { 92 | step_lo = step; 93 | } 94 | else 95 | { 96 | // Regular Wolfe condition is met 97 | if (param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE) 98 | break; 99 | 100 | if (dg > -param.wolfe * dg_init) 101 | { 102 | step_hi = step; 103 | } 104 | else 105 | { 106 | // Strong Wolfe condition is met 107 | break; 108 | } 109 | } 110 | } 111 | 112 | assert(step_lo < step_hi); 113 | 114 | if (step < param.min_step) 115 | throw std::runtime_error("the line search step became smaller than the minimum value allowed"); 116 | 117 | if (step > param.max_step) 118 | throw std::runtime_error("the line search step became larger than the maximum value allowed"); 119 | 120 | // continue search in mid of current search range 121 | step = std::isinf(step_hi) ? 
2 * step : step_lo / 2 + step_hi / 2; 122 | } 123 | 124 | if (iter >= param.max_linesearch) 125 | throw std::runtime_error("the line search routine reached the maximum number of iterations"); 126 | } 127 | }; 128 | 129 | } // namespace LBFGSpp 130 | 131 | #endif // LBFGSPP_LINE_SEARCH_BRACKETING_H 132 | -------------------------------------------------------------------------------- /include/LBFGSpp/LineSearchMoreThuente.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2020-2025 Yixuan Qiu 2 | // Under MIT license 3 | 4 | #ifndef LBFGSPP_LINE_SEARCH_MORE_THUENTE_H 5 | #define LBFGSPP_LINE_SEARCH_MORE_THUENTE_H 6 | 7 | #include <Eigen/Core> 8 | #include <stdexcept> // std::invalid_argument, std::runtime_error 9 | #include "Param.h" 10 | 11 | namespace LBFGSpp { 12 | 13 | /// 14 | /// The line search algorithm by Moré and Thuente (1994), currently used for the L-BFGS-B algorithm. 15 | /// 16 | /// The target of this line search algorithm is to find a step size \f$\alpha\f$ that satisfies the strong Wolfe condition 17 | /// \f$f(x+\alpha d) \le f(x) + \alpha\mu g(x)^T d\f$ and \f$|g(x+\alpha d)^T d| \le \eta|g(x)^T d|\f$. 18 | /// Our implementation is a simplified version of the algorithm in [1]. We assume that \f$0<\mu<\eta<1\f$, while in [1] 19 | /// they do not assume \f$\eta>\mu\f$. As a result, the algorithm in [1] has two stages, but in our implementation we 20 | /// only need the first stage to guarantee the convergence. 21 | /// 22 | /// Reference: 23 | /// [1] Moré, J. J., & Thuente, D. J. (1994). Line search algorithms with guaranteed sufficient decrease. 24 | /// 25 | template <typename Scalar> 26 | class LineSearchMoreThuente 27 | { 28 | private: 29 | using Vector = Eigen::Matrix<Scalar, Eigen::Dynamic, 1>; 30 | 31 | // Minimizer of a quadratic function q(x) = c0 + c1 * x + c2 * x^2 32 | // that interpolates fa, ga, and fb, assuming the minimizer exists 33 | // For case I: fb >= fa and ga * (b - a) < 0 34 | static Scalar quadratic_minimizer(const Scalar& a, const Scalar& b, const Scalar& fa, const Scalar& ga, const Scalar& fb) 35 | { 36 | const Scalar ba = b - a; 37 | const Scalar w = Scalar(0.5) * ba * ga / (fa - fb + ba * ga); 38 | return a + w * ba; 39 | } 40 | 41 | // Minimizer of a quadratic function q(x) = c0 + c1 * x + c2 * x^2 42 | // that interpolates fa, ga and gb, assuming the minimizer exists 43 | // The result actually does not depend on fa 44 | // For case II: ga * (b - a) < 0, ga * gb < 0 45 | // For case III: ga * (b - a) < 0, ga * gb >= 0, |gb| <= |ga| 46 | static Scalar quadratic_minimizer(const Scalar& a, const Scalar& b, const Scalar& ga, const Scalar& gb) 47 | { 48 | const Scalar w = ga / (ga - gb); 49 | return a + w * (b - a); 50 | } 51 | 52 | // Local minimizer of a cubic function q(x) = c0 + c1 * x + c2 * x^2 + c3 * x^3 53 | // that interpolates fa, ga, fb and gb, assuming a != b 54 | // Also sets a flag indicating whether the minimizer exists 55 | static Scalar cubic_minimizer(const Scalar& a, const Scalar& b, const Scalar& fa, const Scalar& fb, 56 | const Scalar& ga, const Scalar& gb, bool& exists) 57 | { 58 | using std::abs; 59 | using std::sqrt; 60 | 61 | const Scalar apb = a + b; 62 | const Scalar ba = b - a; 63 | const Scalar ba2 = ba * ba; 64 | const Scalar fba = fb - fa; 65 | const Scalar gba = gb - ga; 66 | // z3 = c3 * (b-a)^3, z2 = c2 * (b-a)^3, z1 = c1 * (b-a)^3 67 | const Scalar z3 = (ga + gb) * ba - Scalar(2) * fba; 68 | const Scalar z2 = Scalar(0.5) * (gba * ba2 - Scalar(3) * apb * z3); 69 | const Scalar z1 = fba * ba2 - apb * z2 - (a * apb + b * b) * z3;
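// Consistency check of the scaling above (illustrative, not part of the original source):
// for q(x) = c0 + c1 * x + c2 * x^2 + c3 * x^3 with q(a) = fa, q(b) = fb, q'(a) = ga, q'(b) = gb,
// expanding (ga + gb) * (b - a) - 2 * (fb - fa) cancels the c0, c1, c2 terms and leaves
// c3 * (b - a)^3, which is exactly z3. Since z1, z2, z3 share the common factor (b - a)^3,
// the ratios u = z2 / (3 * z3) and v = z1 / z2 used below, and hence the computed roots of
// c1 + 2 * c2 * x + 3 * c3 * x^2 = 0, are unaffected by the scaling.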
70 | // std::cout << "z1 = " << z1 << ", z2 = " << z2 << ", z3 = " << z3 << std::endl; 71 | 72 | // If c3 = z/(b-a)^3 == 0, reduce to quadratic problem 73 | const Scalar eps = std::numeric_limits::epsilon(); 74 | if (abs(z3) < eps * abs(z2) || abs(z3) < eps * abs(z1)) 75 | { 76 | // Minimizer exists if c2 > 0 77 | exists = (z2 * ba > Scalar(0)); 78 | // Return the end point if the minimizer does not exist 79 | return exists ? (-Scalar(0.5) * z1 / z2) : b; 80 | } 81 | 82 | // Now we can assume z3 > 0 83 | // The minimizer is a solution to the equation c1 + 2*c2 * x + 3*c3 * x^2 = 0 84 | // roots = -(z2/z3) / 3 (+-) sqrt((z2/z3)^2 - 3 * (z1/z3)) / 3 85 | // 86 | // Let u = z2/(3z3) and v = z1/z2 87 | // The minimizer exists if v/u <= 1 88 | const Scalar u = z2 / (Scalar(3) * z3), v = z1 / z2; 89 | const Scalar vu = v / u; 90 | exists = (vu <= Scalar(1)); 91 | if (!exists) 92 | return b; 93 | 94 | // We need to find a numerically stable way to compute the roots, as z3 may still be small 95 | // 96 | // If |u| >= |v|, let w = 1 + sqrt(1-v/u), and then 97 | // r1 = -u * w, r2 = -v / w, r1 does not need to be the smaller one 98 | // 99 | // If |u| < |v|, we must have uv <= 0, and then 100 | // r = -u (+-) sqrt(delta), where 101 | // sqrt(delta) = sqrt(|u|) * sqrt(|v|) * sqrt(1-u/v) 102 | Scalar r1 = Scalar(0), r2 = Scalar(0); 103 | if (abs(u) >= abs(v)) 104 | { 105 | const Scalar w = Scalar(1) + sqrt(Scalar(1) - vu); 106 | r1 = -u * w; 107 | r2 = -v / w; 108 | } 109 | else 110 | { 111 | const Scalar sqrtd = sqrt(abs(u)) * sqrt(abs(v)) * sqrt(1 - u / v); 112 | r1 = -u - sqrtd; 113 | r2 = -u + sqrtd; 114 | } 115 | return (z3 * ba > Scalar(0)) ? ((std::max)(r1, r2)) : ((std::min)(r1, r2)); 116 | } 117 | 118 | // Select the next step size according to the current step sizes, 119 | // function values, and derivatives 120 | static Scalar step_selection( 121 | const Scalar& al, const Scalar& au, const Scalar& at, 122 | const Scalar& fl, const Scalar& fu, const Scalar& ft, 123 | const Scalar& gl, const Scalar& gu, const Scalar& gt) 124 | { 125 | using std::abs; 126 | 127 | if (al == au) 128 | return al; 129 | 130 | // If ft = Inf or gt = Inf, we return the middle point of al and at 131 | if (!std::isfinite(ft) || !std::isfinite(gt)) 132 | return (al + at) / Scalar(2); 133 | 134 | // ac: cubic interpolation of fl, ft, gl, gt 135 | // aq: quadratic interpolation of fl, gl, ft 136 | bool ac_exists; 137 | // std::cout << "al = " << al << ", at = " << at << ", fl = " << fl << ", ft = " << ft << ", gl = " << gl << ", gt = " << gt << std::endl; 138 | const Scalar ac = cubic_minimizer(al, at, fl, ft, gl, gt, ac_exists); 139 | const Scalar aq = quadratic_minimizer(al, at, fl, gl, ft); 140 | // std::cout << "ac = " << ac << ", aq = " << aq << std::endl; 141 | // Case 1: ft > fl 142 | if (ft > fl) 143 | { 144 | // This should not happen if ft > fl, but just to be safe 145 | if (!ac_exists) 146 | return aq; 147 | // Then use the scheme described in the paper 148 | return (abs(ac - al) < abs(aq - al)) ? ac : ((aq + ac) / Scalar(2)); 149 | } 150 | 151 | // as: quadratic interpolation of gl and gt 152 | const Scalar as = quadratic_minimizer(al, at, gl, gt); 153 | // Case 2: ft <= fl, gt * gl < 0 154 | if (gt * gl < Scalar(0)) 155 | return (abs(ac - at) >= abs(as - at)) ? ac : as; 156 | 157 | // Case 3: ft <= fl, gt * gl >= 0, |gt| < |gl| 158 | const Scalar deltal = Scalar(1.1), deltau = Scalar(0.66); 159 | if (abs(gt) < abs(gl)) 160 | { 161 | // We choose either ac or as 162 | // The case for ac: 1. 
It exists, and 163 | // 2. ac is farther than at from al, and 164 | // 3. ac is closer to at than as 165 | // Cases for as: otherwise 166 | const Scalar res = (ac_exists && 167 | (ac - at) * (at - al) > Scalar(0) && 168 | abs(ac - at) < abs(as - at)) ? 169 | ac : 170 | as; 171 | // Postprocessing the chosen step 172 | return (at > al) ? 173 | std::min(at + deltau * (au - at), res) : 174 | std::max(at + deltau * (au - at), res); 175 | } 176 | 177 | // Simple extrapolation if au, fu, or gu is infinity 178 | if ((!std::isfinite(au)) || (!std::isfinite(fu)) || (!std::isfinite(gu))) 179 | return at + deltal * (at - al); 180 | 181 | // ae: cubic interpolation of ft, fu, gt, gu 182 | bool ae_exists; 183 | const Scalar ae = cubic_minimizer(at, au, ft, fu, gt, gu, ae_exists); 184 | // Case 4: ft <= fl, gt * gl >= 0, |gt| >= |gl| 185 | // The following is not used in the paper, but it seems to be a reasonable safeguard 186 | return (at > al) ? 187 | std::min(at + deltau * (au - at), ae) : 188 | std::max(at + deltau * (au - at), ae); 189 | } 190 | 191 | public: 192 | /// 193 | /// Line search by Moré and Thuente (1994). 194 | /// 195 | /// \param f A function object such that `f(x, grad)` returns the 196 | /// objective function value at `x`, and overwrites `grad` with 197 | /// the gradient. 198 | /// \param param An `LBFGSParam` or `LBFGSBParam` object that stores the 199 | /// parameters of the solver. 200 | /// \param xp The current point. 201 | /// \param drt The current moving direction. 202 | /// \param step_max The upper bound for the step size that makes x feasible. 203 | /// \param step In: The initial step length. 204 | /// Out: The calculated step length. 205 | /// \param fx In: The objective function value at the current point. 206 | /// Out: The function value at the new point. 207 | /// \param grad In: The current gradient vector. 208 | /// Out: The gradient at the new point. 209 | /// \param dg In: The inner product between drt and grad. 210 | /// Out: The inner product between drt and the new gradient. 211 | /// \param x Out: The new point moved to. 
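/// A sketch of how a solver typically invokes this routine (hypothetical surrounding code,
/// not part of the original source):
///
///     Scalar step = Scalar(1);             // initial trial step size
///     Scalar fx = f(xp, grad);             // objective value and gradient at xp
///     Scalar dg = grad.dot(drt);           // directional derivative at xp
///     LineSearchMoreThuente<Scalar>::LineSearch(
///         f, param, xp, drt, step_max, step, fx, grad, dg, x);
///     // On return: x = xp + step * drt, fx = f(x), dg = grad.dot(drt)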
212 | /// 213 | template 214 | static void LineSearch(Foo& f, const SolverParam& param, 215 | const Vector& xp, const Vector& drt, const Scalar& step_max, 216 | Scalar& step, Scalar& fx, Vector& grad, Scalar& dg, Vector& x) 217 | { 218 | using std::abs; 219 | // std::cout << "========================= Entering line search =========================\n\n"; 220 | 221 | // Check the value of step 222 | if (step <= Scalar(0)) 223 | throw std::invalid_argument("'step' must be positive"); 224 | if (step > step_max) 225 | throw std::invalid_argument("'step' exceeds 'step_max'"); 226 | 227 | // Save the function value at the current x 228 | const Scalar fx_init = fx; 229 | // Projection of gradient on the search direction 230 | const Scalar dg_init = dg; 231 | 232 | // std::cout << "fx_init = " << fx_init << ", dg_init = " << dg_init << std::endl << std::endl; 233 | 234 | // Make sure d points to a descent direction 235 | if (dg_init >= Scalar(0)) 236 | throw std::logic_error("the moving direction does not decrease the objective function value"); 237 | 238 | // Tolerance for convergence test 239 | // Sufficient decrease 240 | const Scalar test_decr = param.ftol * dg_init; 241 | // Curvature 242 | const Scalar test_curv = -param.wolfe * dg_init; 243 | 244 | // The bracketing interval 245 | Scalar I_lo = Scalar(0), I_hi = std::numeric_limits::infinity(); 246 | Scalar fI_lo = Scalar(0), fI_hi = std::numeric_limits::infinity(); 247 | Scalar gI_lo = (Scalar(1) - param.ftol) * dg_init, gI_hi = std::numeric_limits::infinity(); 248 | // We also need to save x and grad for step=I_lo, since we want to return the best 249 | // step size along the path when strong Wolfe condition is not met 250 | Vector x_lo = xp, grad_lo = grad; 251 | Scalar fx_lo = fx_init, dg_lo = dg_init; 252 | 253 | // Function value and gradient at the current step size 254 | x.noalias() = xp + step * drt; 255 | fx = f(x, grad); 256 | dg = grad.dot(drt); 257 | 258 | // std::cout << "max_step = " << step_max << ", step = " << step << ", fx = " << fx << ", dg = " << dg << std::endl; 259 | 260 | // Convergence test 261 | if (fx <= fx_init + step * test_decr && abs(dg) <= test_curv) 262 | { 263 | // std::cout << "** Criteria met\n\n"; 264 | // std::cout << "========================= Leaving line search =========================\n\n"; 265 | return; 266 | } 267 | 268 | // Extrapolation factor 269 | const Scalar delta = Scalar(1.1); 270 | int iter; 271 | for (iter = 0; iter < param.max_linesearch; iter++) 272 | { 273 | // ft = psi(step) = f(xp + step * drt) - f(xp) - step * test_decr 274 | // gt = psi'(step) = dg - mu * dg_init 275 | // mu = param.ftol 276 | const Scalar ft = fx - fx_init - step * test_decr; 277 | const Scalar gt = dg - param.ftol * dg_init; 278 | 279 | // Update step size and bracketing interval 280 | Scalar new_step; 281 | if (ft > fI_lo) 282 | { 283 | // Case 1: ft > fl 284 | new_step = step_selection(I_lo, I_hi, step, fI_lo, fI_hi, ft, gI_lo, gI_hi, gt); 285 | // Sanity check: if the computed new_step is too small, typically due to 286 | // extremely large value of ft, switch to the middle point 287 | if (new_step <= param.min_step) 288 | new_step = (I_lo + step) / Scalar(2); 289 | 290 | I_hi = step; 291 | fI_hi = ft; 292 | gI_hi = gt; 293 | 294 | // std::cout << "Case 1: new step = " << new_step << std::endl; 295 | } 296 | else if (gt * (I_lo - step) > Scalar(0)) 297 | { 298 | // Case 2: ft <= fl, gt * (al - at) > 0 299 | // 300 | // Page 291 of Moré and Thuente (1994) suggests that 301 | // newat = min(at + delta * (at - 
al), amax), delta in [1.1, 4] 302 | new_step = std::min(step_max, step + delta * (step - I_lo)); 303 | 304 | // We can also consider the following scheme: 305 | // First let step_selection() decide a value, and then project to the range above 306 | // 307 | // new_step = step_selection(I_lo, I_hi, step, fI_lo, fI_hi, ft, gI_lo, gI_hi, gt); 308 | // const Scalar delta2 = Scalar(4) 309 | // const Scalar t1 = step + delta * (step - I_lo); 310 | // const Scalar t2 = step + delta2 * (step - I_lo); 311 | // const Scalar tl = std::min(t1, t2), tu = std::max(t1, t2); 312 | // new_step = std::min(tu, std::max(tl, new_step)); 313 | // new_step = std::min(step_max, new_step); 314 | 315 | I_lo = step; 316 | fI_lo = ft; 317 | gI_lo = gt; 318 | // Move x and grad to x_lo and grad_lo, respectively 319 | x_lo.swap(x); 320 | grad_lo.swap(grad); 321 | fx_lo = fx; 322 | dg_lo = dg; 323 | 324 | // std::cout << "Case 2: new step = " << new_step << std::endl; 325 | } 326 | else 327 | { 328 | // Case 3: ft <= fl, gt * (al - at) <= 0 329 | new_step = step_selection(I_lo, I_hi, step, fI_lo, fI_hi, ft, gI_lo, gI_hi, gt); 330 | 331 | I_hi = I_lo; 332 | fI_hi = fI_lo; 333 | gI_hi = gI_lo; 334 | 335 | I_lo = step; 336 | fI_lo = ft; 337 | gI_lo = gt; 338 | // Move x and grad to x_lo and grad_lo, respectively 339 | x_lo.swap(x); 340 | grad_lo.swap(grad); 341 | fx_lo = fx; 342 | dg_lo = dg; 343 | 344 | // std::cout << "Case 3: new step = " << new_step << std::endl; 345 | } 346 | 347 | // Case 1 and 3 are interpolations, whereas Case 2 is extrapolation 348 | // This means that Case 2 may return new_step = step_max, 349 | // and we need to decide whether to accept this value 350 | // 1. If both step and new_step equal to step_max, it means 351 | // step will have no further change, so we accept it 352 | // 2. 
Otherwise, we need to test the function value and gradient 353 | // on step_max, and decide later 354 | 355 | // In case step, new_step, and step_max are equal, directly return the computed x and fx 356 | if (step == step_max && new_step >= step_max) 357 | { 358 | // std::cout << "** Maximum step size reached\n\n"; 359 | // std::cout << "========================= Leaving line search =========================\n\n"; 360 | 361 | // Move {x, grad}_lo back before returning 362 | x.swap(x_lo); 363 | grad.swap(grad_lo); 364 | return; 365 | } 366 | // Otherwise, recompute x and fx based on new_step 367 | step = new_step; 368 | 369 | if (step < param.min_step) 370 | throw std::runtime_error("the line search step became smaller than the minimum value allowed"); 371 | 372 | if (step > param.max_step) 373 | throw std::runtime_error("the line search step became larger than the maximum value allowed"); 374 | 375 | // Update parameter, function value, and gradient 376 | x.noalias() = xp + step * drt; 377 | fx = f(x, grad); 378 | dg = grad.dot(drt); 379 | 380 | // std::cout << "step = " << step << ", fx = " << fx << ", dg = " << dg << std::endl; 381 | 382 | // Convergence test 383 | if (fx <= fx_init + step * test_decr && abs(dg) <= test_curv) 384 | { 385 | // std::cout << "** Criteria met\n\n"; 386 | // std::cout << "========================= Leaving line search =========================\n\n"; 387 | return; 388 | } 389 | 390 | // Now assume step = step_max, and we need to decide whether to 391 | // exit the line search (see the comments above regarding step_max) 392 | // If we reach here, it means this step size does not pass the convergence 393 | // test, so either the sufficient decrease condition or the curvature 394 | // condition is not met yet 395 | // 396 | // Typically the curvature condition is harder to meet, and it is 397 | // possible that no step size in [0, step_max] satisfies the condition 398 | // 399 | // But we need to make sure that its psi function value is smaller than 400 | // the best one so far. If not, go to the next iteration and find a better one 401 | if (step >= step_max) 402 | { 403 | const Scalar ft_bound = fx - fx_init - step * test_decr; 404 | if (ft_bound <= fI_lo) 405 | { 406 | // std::cout << "** Maximum step size reached\n\n"; 407 | // std::cout << "========================= Leaving line search =========================\n\n"; 408 | return; 409 | } 410 | } 411 | } 412 | 413 | // If we have used up all line search iterations, then the strong Wolfe condition 414 | // is not met. 
We choose not to raise an exception (unless no step satisfying
415 |         // sufficient decrease is found), but to return the best step size so far
416 |         if (iter >= param.max_linesearch)
417 |         {
418 |             // throw std::runtime_error("the line search routine reached the maximum number of iterations");
419 | 
420 |             // First test whether the last step is better than I_lo
421 |             // If yes, return the last step
422 |             const Scalar ft = fx - fx_init - step * test_decr;
423 |             if (ft <= fI_lo)
424 |                 return;
425 | 
426 |             // If not, then the best step size so far is I_lo, but it needs to be positive
427 |             if (I_lo <= Scalar(0))
428 |                 throw std::runtime_error("the line search routine is unable to sufficiently decrease the function value");
429 | 
430 |             // Return everything with _lo
431 |             step = I_lo;
432 |             fx = fx_lo;
433 |             dg = dg_lo;
434 |             // Move {x, grad}_lo back
435 |             x.swap(x_lo);
436 |             grad.swap(grad_lo);
437 |             return;
438 |         }
439 |     }
440 | };
441 | 
442 | }  // namespace LBFGSpp
443 | 
444 | #endif  // LBFGSPP_LINE_SEARCH_MORE_THUENTE_H
445 | 
--------------------------------------------------------------------------------
/include/LBFGSpp/LineSearchNocedalWright.h:
--------------------------------------------------------------------------------
 1 | // Copyright (C) 2016-2025 Yixuan Qiu
 2 | // Copyright (C) 2016-2025 Dirk Toewe
 3 | // Under MIT license
 4 | 
 5 | #ifndef LBFGSPP_LINE_SEARCH_NOCEDAL_WRIGHT_H
 6 | #define LBFGSPP_LINE_SEARCH_NOCEDAL_WRIGHT_H
 7 | 
 8 | #include <Eigen/Core>
 9 | #include <stdexcept>
10 | #include "Param.h"
11 | 
12 | namespace LBFGSpp {
13 | 
14 | ///
15 | /// A line search algorithm for the strong Wolfe condition. Implementation based on:
16 | ///
17 | ///   "Numerical Optimization" 2nd Edition,
18 | ///   Jorge Nocedal and Stephen J. Wright,
19 | ///   Chapter 3. Line Search Methods, page 60.
20 | ///
21 | template <typename Scalar>
22 | class LineSearchNocedalWright
23 | {
24 | private:
25 |     using Vector = Eigen::Matrix<Scalar, Eigen::Dynamic, 1>;
26 | 
27 |     // Use {fx_lo, fx_hi, dg_lo} to make a quadratic interpolation of
28 |     // the function, and the fitted quadratic function is used to
29 |     // estimate the minimum
30 |     static Scalar quad_interp(const Scalar& step_lo, const Scalar& step_hi,
31 |                               const Scalar& fx_lo, const Scalar& fx_hi, const Scalar& dg_lo)
32 |     {
33 |         using std::abs;
34 | 
35 |         // polynomial: p (x) = c0*(x - step)² + c1
36 |         // conditions: p (step_hi) = fx_hi
37 |         //             p (step_lo) = fx_lo
38 |         //             p'(step_lo) = dg_lo
39 | 
40 |         // We allow fx_hi to be Inf, so first compute a candidate for step size,
41 |         // and test whether NaN occurs
42 |         const Scalar fdiff = fx_hi - fx_lo;
43 |         const Scalar sdiff = step_hi - step_lo;
44 |         const Scalar smid = (step_hi + step_lo) / Scalar(2);
45 |         Scalar step_candid = fdiff * step_lo - smid * sdiff * dg_lo;
46 |         step_candid = step_candid / (fdiff - sdiff * dg_lo);
47 | 
48 |         // In some cases the interpolation is not a good choice
49 |         // This includes (a) NaN values; (b) too close to the end points; (c) outside the interval
50 |         // In such cases, a bisection search is used
51 |         const bool candid_nan = !(std::isfinite(step_candid));
52 |         const Scalar end_dist = std::min(abs(step_candid - step_lo), abs(step_candid - step_hi));
53 |         const bool near_end = end_dist < Scalar(0.01) * abs(sdiff);
54 |         const bool bisect = candid_nan ||
55 |             (step_candid <= std::min(step_lo, step_hi)) ||
56 |             (step_candid >= std::max(step_lo, step_hi)) ||
57 |             near_end;
58 |         const Scalar step = bisect ? smid : step_candid;
59 |         return step;
60 |     }
61 | 
62 | public:
63 |     ///
64 |     /// Line search by Nocedal and Wright (2006).
65 |     ///
66 |     /// \param f        A function object such that `f(x, grad)` returns the
67 |     ///                 objective function value at `x`, and overwrites `grad` with
68 |     ///                 the gradient.
69 |     /// \param param    Parameters for the L-BFGS algorithm.
70 |     /// \param xp       The current point.
71 |     /// \param drt      The current moving direction.
72 |     /// \param step_max The upper bound for the step size that makes x feasible.
73 |     ///                 Can be ignored for the L-BFGS solver.
74 |     /// \param step     In: The initial step length.
75 |     ///                 Out: The calculated step length.
76 |     /// \param fx       In: The objective function value at the current point.
77 |     ///                 Out: The function value at the new point.
78 |     /// \param grad     In: The current gradient vector.
79 |     ///                 Out: The gradient at the new point.
80 |     /// \param dg       In: The inner product between drt and grad.
81 |     ///                 Out: The inner product between drt and the new gradient.
82 |     /// \param x        Out: The new point moved to.
83 |     ///
84 |     template <typename Foo>
85 |     static void LineSearch(Foo& f, const LBFGSParam<Scalar>& param,
86 |                            const Vector& xp, const Vector& drt, const Scalar& step_max,
87 |                            Scalar& step, Scalar& fx, Vector& grad, Scalar& dg, Vector& x)
88 |     {
89 |         // Check the value of step
90 |         if (step <= Scalar(0))
91 |             throw std::invalid_argument("'step' must be positive");
92 | 
93 |         if (param.linesearch != LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE)
94 |             throw std::invalid_argument("'param.linesearch' must be 'LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE' for LineSearchNocedalWright");
95 | 
96 |         // To make this implementation more similar to the other line search
97 |         // methods in LBFGSpp, the symbol names from the literature
98 |         // ("Numerical Optimization") have been changed.
99 |         //
100 |         // Literature | LBFGSpp
101 |         // -----------|--------
102 |         // alpha      | step
103 |         // phi        | fx
104 |         // phi'       | dg
105 | 
106 |         // The expansion rate of the step size
107 |         const Scalar expansion = Scalar(2);
108 | 
109 |         // Save the function value at the current x
110 |         const Scalar fx_init = fx;
111 |         // Projection of gradient on the search direction
112 |         const Scalar dg_init = dg;
113 |         // Make sure drt points to a descent direction
114 |         if (dg_init > Scalar(0))
115 |             throw std::logic_error("the moving direction increases the objective function value");
116 | 
117 |         const Scalar test_decr = param.ftol * dg_init,    // Sufficient decrease
118 |                      test_curv = -param.wolfe * dg_init;  // Curvature
119 | 
120 |         // Ends of the line search range (step_lo > step_hi is allowed)
121 |         // We can also define dg_hi, but it will never be used
122 |         Scalar step_hi, fx_hi;
123 |         Scalar step_lo = Scalar(0), fx_lo = fx_init, dg_lo = dg_init;
124 |         // We also need to save x and grad for step=step_lo, since we want to return the best
125 |         // step size along the path when the strong Wolfe condition is not met
126 |         Vector x_lo = xp, grad_lo = grad;
127 | 
128 |         // STEP 1: Bracketing Phase
129 |         // Find a range guaranteed to contain a step satisfying strong Wolfe.
130 |         // The bracketing phase exits if one of the following conditions is satisfied:
131 |         // (1) Current step violates the sufficient decrease condition
132 |         // (2) Current fx >= previous fx
133 |         // (3) Current dg >= 0
134 |         // (4) Strong Wolfe condition is met
135 |         //
136 |         // (4) terminates the whole line search, and (1)-(3) go to the zoom phase
137 |         //
138 |         // See also:
139 |         //   "Numerical Optimization", "Algorithm 3.5 (Line Search Algorithm)".
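        // [Editor's note, not in the original source] To make the two tests
        // below concrete, assume the default parameters defined in Param.h
        // (ftol = 1e-4, wolfe = 0.9). Since dg_init < 0 here, a step size
        // passes the line search when
        //     fx - fx_init <= step * (1e-4 * dg_init)   (sufficient decrease)
        //     |dg| <= 0.9 * |dg_init|                   (curvature)
        // which is exactly what test_decr and test_curv defined above encode.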
140 | int iter = 0; 141 | for (;;) 142 | { 143 | // Evaluate the current step size 144 | x.noalias() = xp + step * drt; 145 | fx = f(x, grad); 146 | dg = grad.dot(drt); 147 | 148 | // Test the sufficient decrease condition 149 | if (fx - fx_init > step * test_decr || (Scalar(0) < step_lo && fx >= fx_lo)) 150 | { 151 | // Case (1) and (2) 152 | step_hi = step; 153 | fx_hi = fx; 154 | // dg_hi = dg; 155 | break; 156 | } 157 | // If reaching here, then the sufficient decrease condition is satisfied 158 | 159 | // Test the curvature condition 160 | if (std::abs(dg) <= test_curv) 161 | return; // Case (4) 162 | 163 | step_hi = step_lo; 164 | fx_hi = fx_lo; 165 | // dg_hi = dg_lo; 166 | step_lo = step; 167 | fx_lo = fx; 168 | dg_lo = dg; 169 | // Move x and grad to x_lo and grad_lo, respectively 170 | x_lo.swap(x); 171 | grad_lo.swap(grad); 172 | 173 | if (dg >= Scalar(0)) 174 | break; // Case (3) 175 | 176 | iter++; 177 | // If we have used up all line search iterations in the bracketing phase, 178 | // it means every new step decreases the objective function. Of course, 179 | // the strong Wolfe condition is not met, but we choose not to raise an 180 | // exception; instead, we return the best step size so far. This means that 181 | // we exit the line search with the most recent step size, which has the 182 | // smallest objective function value during the line search 183 | if (iter >= param.max_linesearch) 184 | { 185 | // throw std::runtime_error("the line search routine reached the maximum number of iterations"); 186 | 187 | // At this point we can guarantee that {step, fx, dg}=={step, fx, dg}_lo 188 | // But we need to move {x, grad}_lo back before returning 189 | x.swap(x_lo); 190 | grad.swap(grad_lo); 191 | return; 192 | } 193 | 194 | // If we still stay in the loop, it means we can expand the current step 195 | step *= expansion; 196 | } 197 | 198 | // STEP 2: Zoom Phase 199 | // Given a range (step_lo,step_hi) that is guaranteed to 200 | // contain a valid strong Wolfe step value, this method 201 | // finds such a value. 202 | // 203 | // If step_lo > 0, then step_lo is, among all step sizes generated so far and 204 | // satisfying the sufficient decrease condition, the one giving the smallest 205 | // objective function value. 206 | // 207 | // See also: 208 | // "Numerical Optimization", "Algorithm 3.6 (Zoom)". 
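        // [Editor's note, not in the original source] Worked derivation of the
        // interpolation step used by quad_interp() in the zoom loop below:
        // fitting p(a) = c0 * (a - s)^2 + c1 to the conditions
        //     p(step_lo) = fx_lo,  p'(step_lo) = dg_lo,  p(step_hi) = fx_hi
        // gives
        //     dg_lo = 2 * c0 * (step_lo - s),
        //     fdiff = fx_hi - fx_lo = 2 * c0 * sdiff * (smid - s).
        // Dividing the two equations eliminates c0, and solving for the vertex s yields
        //     s = (fdiff * step_lo - smid * sdiff * dg_lo) / (fdiff - sdiff * dg_lo),
        // which is the step_candid expression computed in quad_interp().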
209 |         for (;;)
210 |         {
211 |             // Use {fx_lo, fx_hi, dg_lo} to make a quadratic interpolation of
212 |             // the function, and the fitted quadratic function is used to
213 |             // estimate the minimum
214 |             step = quad_interp(step_lo, step_hi, fx_lo, fx_hi, dg_lo);
215 | 
216 |             // Evaluate the current step size
217 |             x.noalias() = xp + step * drt;
218 |             fx = f(x, grad);
219 |             dg = grad.dot(drt);
220 | 
221 |             // Test the sufficient decrease condition
222 |             if (fx - fx_init > step * test_decr || fx >= fx_lo)
223 |             {
224 |                 if (step == step_hi)
225 |                     throw std::runtime_error("the line search routine failed, possibly due to insufficient numeric precision");
226 | 
227 |                 step_hi = step;
228 |                 fx_hi = fx;
229 |                 // dg_hi = dg;
230 |             }
231 |             else
232 |             {
233 |                 // Test the curvature condition
234 |                 if (std::abs(dg) <= test_curv)
235 |                     return;
236 | 
237 |                 if (dg * (step_hi - step_lo) >= Scalar(0))
238 |                 {
239 |                     step_hi = step_lo;
240 |                     fx_hi = fx_lo;
241 |                     // dg_hi = dg_lo;
242 |                 }
243 | 
244 |                 if (step == step_lo)
245 |                     throw std::runtime_error("the line search routine failed, possibly due to insufficient numeric precision");
246 | 
247 |                 // If reaching here, then the current step satisfies the sufficient decrease condition
248 |                 step_lo = step;
249 |                 fx_lo = fx;
250 |                 dg_lo = dg;
251 |                 // Move x and grad to x_lo and grad_lo, respectively
252 |                 x_lo.swap(x);
253 |                 grad_lo.swap(grad);
254 |             }
255 | 
256 |             iter++;
257 |             // If we have used up all line search iterations in the zoom phase,
258 |             // then the strong Wolfe condition is not met. We choose not to raise an
259 |             // exception (unless no step satisfying sufficient decrease is found),
260 |             // but to return the best step size so far, i.e., step_lo
261 |             if (iter >= param.max_linesearch)
262 |             {
263 |                 // throw std::runtime_error("the line search routine reached the maximum number of iterations");
264 |                 if (step_lo <= Scalar(0))
265 |                     throw std::runtime_error("the line search routine failed, unable to sufficiently decrease the function value");
266 | 
267 |                 // Return everything with _lo
268 |                 step = step_lo;
269 |                 fx = fx_lo;
270 |                 dg = dg_lo;
271 |                 // Move {x, grad}_lo back
272 |                 x.swap(x_lo);
273 |                 grad.swap(grad_lo);
274 |                 return;
275 |             }
276 |         }
277 |     }
278 | };
279 | 
280 | }  // namespace LBFGSpp
281 | 
282 | #endif  // LBFGSPP_LINE_SEARCH_NOCEDAL_WRIGHT_H
283 | 
--------------------------------------------------------------------------------
/include/LBFGSpp/Param.h:
--------------------------------------------------------------------------------
 1 | // Copyright (C) 2016-2025 Yixuan Qiu
 2 | // Under MIT license
 3 | 
 4 | #ifndef LBFGSPP_PARAM_H
 5 | #define LBFGSPP_PARAM_H
 6 | 
 7 | #include <Eigen/Core>
 8 | #include <stdexcept>  // std::invalid_argument
 9 | 
10 | namespace LBFGSpp {
11 | 
12 | ///
13 | /// \defgroup Enumerations
14 | ///
15 | /// Enumeration types for line search.
16 | ///
17 | 
18 | ///
19 | /// \ingroup Enumerations
20 | ///
21 | /// The enumeration of line search termination conditions.
22 | ///
23 | enum LINE_SEARCH_TERMINATION_CONDITION
24 | {
25 |     ///
26 |     /// Backtracking method with the Armijo condition.
27 |     /// The backtracking method finds the step length such that it satisfies
28 |     /// the sufficient decrease (Armijo) condition,
29 |     /// \f$f(x + a \cdot d) \le f(x) + \beta' \cdot a \cdot g(x)^T d\f$,
30 |     /// where \f$x\f$ is the current point, \f$d\f$ is the current search direction,
31 |     /// \f$a\f$ is the step length, and \f$\beta'\f$ is the value specified by
32 |     /// \ref LBFGSParam::ftol. \f$f\f$ and \f$g\f$ are the function
33 |     /// and gradient values respectively.
34 |     ///
35 |     LBFGS_LINESEARCH_BACKTRACKING_ARMIJO = 1,
36 | 
37 |     ///
38 |     /// The backtracking method with the default (regular Wolfe) condition.
39 |     /// An alias of `LBFGS_LINESEARCH_BACKTRACKING_WOLFE`.
40 |     ///
41 |     LBFGS_LINESEARCH_BACKTRACKING = 2,
42 | 
43 |     ///
44 |     /// Backtracking method with regular Wolfe condition.
45 |     /// The backtracking method finds the step length such that it satisfies
46 |     /// both the Armijo condition (`LBFGS_LINESEARCH_BACKTRACKING_ARMIJO`)
47 |     /// and the curvature condition,
48 |     /// \f$g(x + a \cdot d)^T d \ge \beta \cdot g(x)^T d\f$, where \f$\beta\f$
49 |     /// is the value specified by \ref LBFGSParam::wolfe.
50 |     ///
51 |     LBFGS_LINESEARCH_BACKTRACKING_WOLFE = 2,
52 | 
53 |     ///
54 |     /// Backtracking method with strong Wolfe condition.
55 |     /// The backtracking method finds the step length such that it satisfies
56 |     /// both the Armijo condition (`LBFGS_LINESEARCH_BACKTRACKING_ARMIJO`)
57 |     /// and the following condition,
58 |     /// \f$\vert g(x + a \cdot d)^T d\vert \le \beta \cdot \vert g(x)^T d\vert\f$,
59 |     /// where \f$\beta\f$ is the value specified by \ref LBFGSParam::wolfe.
60 |     ///
61 |     LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 3
62 | };
63 | 
64 | ///
65 | /// Parameters to control the L-BFGS algorithm.
66 | ///
67 | template <typename Scalar>
68 | class LBFGSParam
69 | {
70 | public:
71 |     ///
72 |     /// The number of corrections to approximate the inverse Hessian matrix.
73 |     /// The L-BFGS routine stores the computation results of previous \ref m
74 |     /// iterations to approximate the inverse Hessian matrix of the current
75 |     /// iteration. This parameter controls the size of the limited memories
76 |     /// (corrections). The default value is \c 6. Values less than \c 3 are
77 |     /// not recommended. Large values will result in excessive computing time.
78 |     ///
79 |     int m;
80 |     ///
81 |     /// Absolute tolerance for convergence test.
82 |     /// This parameter determines the absolute accuracy \f$\epsilon_{abs}\f$
83 |     /// with which the solution is to be found. A minimization terminates when
84 |     /// \f$||g|| < \max\{\epsilon_{abs}, \epsilon_{rel}||x||\}\f$,
85 |     /// where \f$||\cdot||\f$ denotes the Euclidean (L2) norm. The default value is
86 |     /// \c 1e-5.
87 |     ///
88 |     Scalar epsilon;
89 |     ///
90 |     /// Relative tolerance for convergence test.
91 |     /// This parameter determines the relative accuracy \f$\epsilon_{rel}\f$
92 |     /// with which the solution is to be found. A minimization terminates when
93 |     /// \f$||g|| < \max\{\epsilon_{abs}, \epsilon_{rel}||x||\}\f$,
94 |     /// where \f$||\cdot||\f$ denotes the Euclidean (L2) norm. The default value is
95 |     /// \c 1e-5.
96 |     ///
97 |     Scalar epsilon_rel;
98 |     ///
99 |     /// Distance for delta-based convergence test.
100 |     /// This parameter determines the distance \f$d\f$ to compute the
101 |     /// rate of decrease of the objective function,
102 |     /// \f$f_{k-d}(x)-f_k(x)\f$, where \f$k\f$ is the current iteration
103 |     /// step. If the value of this parameter is zero, the delta-based convergence
104 |     /// test will not be performed. The default value is \c 0.
105 |     ///
106 |     int past;
107 |     ///
108 |     /// Delta for convergence test.
109 |     /// The algorithm stops when the following condition is met,
110 |     /// \f$|f_{k-d}(x)-f_k(x)|<\delta\cdot\max(1, |f_k(x)|, |f_{k-d}(x)|)\f$, where \f$f_k(x)\f$ is
111 |     /// the current function value, and \f$f_{k-d}(x)\f$ is the function value
112 |     /// \f$d\f$ iterations ago (specified by the \ref past parameter).
113 |     /// The default value is \c 0.
114 |     ///
115 |     Scalar delta;
116 |     ///
117 |     /// The maximum number of iterations.
118 |     /// The optimization process is terminated when the iteration count
119 |     /// exceeds this parameter. Setting this parameter to zero continues the
120 |     /// optimization process until convergence or an error occurs. The default value
121 |     /// is \c 0.
122 |     ///
123 |     int max_iterations;
124 |     ///
125 |     /// The line search termination condition.
126 |     /// This parameter specifies the line search termination condition that will be used
127 |     /// by the LBFGS routine. The default value is `LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE`.
128 |     ///
129 |     int linesearch;
130 |     ///
131 |     /// The maximum number of trials for the line search.
132 |     /// This parameter controls the number of function and gradient evaluations
133 |     /// per iteration for the line search routine. The default value is \c 20.
134 |     ///
135 |     int max_linesearch;
136 |     ///
137 |     /// The minimum step length allowed in the line search.
138 |     /// The default value is \c 1e-20. Usually this value does not need to be
139 |     /// modified.
140 |     ///
141 |     Scalar min_step;
142 |     ///
143 |     /// The maximum step length allowed in the line search.
144 |     /// The default value is \c 1e+20. Usually this value does not need to be
145 |     /// modified.
146 |     ///
147 |     Scalar max_step;
148 |     ///
149 |     /// A parameter to control the accuracy of the line search routine.
150 |     /// The default value is \c 1e-4. This parameter should be greater
151 |     /// than zero and smaller than \c 0.5.
152 |     ///
153 |     Scalar ftol;
154 |     ///
155 |     /// The coefficient for the Wolfe condition.
156 |     /// This parameter is valid only when the line-search
157 |     /// algorithm is used with the Wolfe condition.
158 |     /// The default value is \c 0.9. This parameter should be greater
159 |     /// than the \ref ftol parameter and smaller than \c 1.0.
160 |     ///
161 |     Scalar wolfe;
162 | 
163 | public:
164 |     ///
165 |     /// Constructor for L-BFGS parameters.
166 |     /// Default values for parameters will be set when the object is created.
167 |     ///
168 |     LBFGSParam()
169 |     {
170 |         // clang-format off
171 |         m              = 6;
172 |         epsilon        = Scalar(1e-5);
173 |         epsilon_rel    = Scalar(1e-5);
174 |         past           = 0;
175 |         delta          = Scalar(0);
176 |         max_iterations = 0;
177 |         linesearch     = LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE;
178 |         max_linesearch = 20;
179 |         min_step       = Scalar(1e-20);
180 |         max_step       = Scalar(1e+20);
181 |         ftol           = Scalar(1e-4);
182 |         wolfe          = Scalar(0.9);
183 |         // clang-format on
184 |     }
185 | 
186 |     ///
187 |     /// Checking the validity of L-BFGS parameters.
188 |     /// An `std::invalid_argument` exception will be thrown if some parameter
189 |     /// is invalid.
190 |     ///
191 |     inline void check_param() const
192 |     {
193 |         if (m <= 0)
194 |             throw std::invalid_argument("'m' must be positive");
195 |         if (epsilon < 0)
196 |             throw std::invalid_argument("'epsilon' must be non-negative");
197 |         if (epsilon_rel < 0)
198 |             throw std::invalid_argument("'epsilon_rel' must be non-negative");
199 |         if (past < 0)
200 |             throw std::invalid_argument("'past' must be non-negative");
201 |         if (delta < 0)
202 |             throw std::invalid_argument("'delta' must be non-negative");
203 |         if (max_iterations < 0)
204 |             throw std::invalid_argument("'max_iterations' must be non-negative");
205 |         if (linesearch < LBFGS_LINESEARCH_BACKTRACKING_ARMIJO ||
206 |             linesearch > LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE)
207 |             throw std::invalid_argument("unsupported line search termination condition");
208 |         if (max_linesearch <= 0)
209 |             throw std::invalid_argument("'max_linesearch' must be positive");
210 |         if (min_step < 0)
211 |             throw std::invalid_argument("'min_step' must be non-negative");
212 |         if (max_step < min_step)
213 |             throw std::invalid_argument("'max_step' must not be less than 'min_step'");
214 |         if (ftol <= 0 || ftol >= 0.5)
215 |             throw std::invalid_argument("'ftol' must satisfy 0 < ftol < 0.5");
216 |         if (wolfe <= ftol || wolfe >= 1)
217 |             throw std::invalid_argument("'wolfe' must satisfy ftol < wolfe < 1");
218 |     }
219 | };
220 | 
221 | ///
222 | /// Parameters to control the L-BFGS-B algorithm.
223 | ///
224 | template <typename Scalar>
225 | class LBFGSBParam
226 | {
227 | public:
228 |     ///
229 |     /// The number of corrections to approximate the inverse Hessian matrix.
230 |     /// The L-BFGS-B routine stores the computation results of previous \ref m
231 |     /// iterations to approximate the inverse Hessian matrix of the current
232 |     /// iteration. This parameter controls the size of the limited memories
233 |     /// (corrections). The default value is \c 6. Values less than \c 3 are
234 |     /// not recommended. Large values will result in excessive computing time.
235 |     ///
236 |     int m;
237 |     ///
238 |     /// Absolute tolerance for convergence test.
239 |     /// This parameter determines the absolute accuracy \f$\epsilon_{abs}\f$
240 |     /// with which the solution is to be found. A minimization terminates when
241 |     /// \f$||Pg||_{\infty} < \max\{\epsilon_{abs}, \epsilon_{rel}||x||\}\f$,
242 |     /// where \f$||x||\f$ denotes the Euclidean (L2) norm of \f$x\f$, and
243 |     /// \f$Pg=P(x-g,l,u)-x\f$ is the projected gradient. The default value is
244 |     /// \c 1e-5.
245 |     ///
246 |     Scalar epsilon;
247 |     ///
248 |     /// Relative tolerance for convergence test.
249 |     /// This parameter determines the relative accuracy \f$\epsilon_{rel}\f$
250 |     /// with which the solution is to be found. A minimization terminates when
251 |     /// \f$||Pg||_{\infty} < \max\{\epsilon_{abs}, \epsilon_{rel}||x||\}\f$,
252 |     /// where \f$||x||\f$ denotes the Euclidean (L2) norm of \f$x\f$, and
253 |     /// \f$Pg=P(x-g,l,u)-x\f$ is the projected gradient. The default value is
254 |     /// \c 1e-5.
255 |     ///
256 |     Scalar epsilon_rel;
257 |     ///
258 |     /// Distance for delta-based convergence test.
259 |     /// This parameter determines the distance \f$d\f$ to compute the
260 |     /// rate of decrease of the objective function,
261 |     /// \f$f_{k-d}(x)-f_k(x)\f$, where \f$k\f$ is the current iteration
262 |     /// step. If the value of this parameter is zero, the delta-based convergence
263 |     /// test will not be performed. The default value is \c 1.
264 |     ///
265 |     int past;
266 |     ///
267 |     /// Delta for convergence test.
268 |     /// The algorithm stops when the following condition is met,
269 |     /// \f$|f_{k-d}(x)-f_k(x)|<\delta\cdot\max(1, |f_k(x)|, |f_{k-d}(x)|)\f$, where \f$f_k(x)\f$ is
270 |     /// the current function value, and \f$f_{k-d}(x)\f$ is the function value
271 |     /// \f$d\f$ iterations ago (specified by the \ref past parameter).
272 |     /// The default value is \c 1e-10.
273 |     ///
274 |     Scalar delta;
275 |     ///
276 |     /// The maximum number of iterations.
277 |     /// The optimization process is terminated when the iteration count
278 |     /// exceeds this parameter. Setting this parameter to zero continues the
279 |     /// optimization process until convergence or an error occurs. The default value
280 |     /// is \c 0.
281 |     ///
282 |     int max_iterations;
283 |     ///
284 |     /// The maximum number of iterations in the subspace minimization.
285 |     /// This parameter controls the number of iterations in the subspace
286 |     /// minimization routine. The default value is \c 10.
287 |     ///
288 |     int max_submin;
289 |     ///
290 |     /// The maximum number of trials for the line search.
291 |     /// This parameter controls the number of function and gradient evaluations
292 |     /// per iteration for the line search routine. The default value is \c 20.
293 |     ///
294 |     int max_linesearch;
295 |     ///
296 |     /// The minimum step length allowed in the line search.
297 |     /// The default value is \c 1e-20. Usually this value does not need to be
298 |     /// modified.
299 |     ///
300 |     Scalar min_step;
301 |     ///
302 |     /// The maximum step length allowed in the line search.
303 |     /// The default value is \c 1e+20. Usually this value does not need to be
304 |     /// modified.
305 |     ///
306 |     Scalar max_step;
307 |     ///
308 |     /// A parameter to control the accuracy of the line search routine.
309 |     /// The default value is \c 1e-4. This parameter should be greater
310 |     /// than zero and smaller than \c 0.5.
311 |     ///
312 |     Scalar ftol;
313 |     ///
314 |     /// The coefficient for the Wolfe condition.
315 |     /// This parameter is valid only when the line-search
316 |     /// algorithm is used with the Wolfe condition.
317 |     /// The default value is \c 0.9. This parameter should be greater
318 |     /// than the \ref ftol parameter and smaller than \c 1.0.
319 |     ///
320 |     Scalar wolfe;
321 | 
322 | public:
323 |     ///
324 |     /// Constructor for L-BFGS-B parameters.
325 |     /// Default values for parameters will be set when the object is created.
326 |     ///
327 |     LBFGSBParam()
328 |     {
329 |         // clang-format off
330 |         m              = 6;
331 |         epsilon        = Scalar(1e-5);
332 |         epsilon_rel    = Scalar(1e-5);
333 |         past           = 1;
334 |         delta          = Scalar(1e-10);
335 |         max_iterations = 0;
336 |         max_submin     = 10;
337 |         max_linesearch = 20;
338 |         min_step       = Scalar(1e-20);
339 |         max_step       = Scalar(1e+20);
340 |         ftol           = Scalar(1e-4);
341 |         wolfe          = Scalar(0.9);
342 |         // clang-format on
343 |     }
344 | 
345 |     ///
346 |     /// Checking the validity of L-BFGS-B parameters.
347 |     /// An `std::invalid_argument` exception will be thrown if some parameter
348 |     /// is invalid.
349 |     ///
350 |     inline void check_param() const
351 |     {
352 |         if (m <= 0)
353 |             throw std::invalid_argument("'m' must be positive");
354 |         if (epsilon < 0)
355 |             throw std::invalid_argument("'epsilon' must be non-negative");
356 |         if (epsilon_rel < 0)
357 |             throw std::invalid_argument("'epsilon_rel' must be non-negative");
358 |         if (past < 0)
359 |             throw std::invalid_argument("'past' must be non-negative");
360 |         if (delta < 0)
361 |             throw std::invalid_argument("'delta' must be non-negative");
362 |         if (max_iterations < 0)
363 |             throw std::invalid_argument("'max_iterations' must be non-negative");
364 |         if (max_submin < 0)
365 |             throw std::invalid_argument("'max_submin' must be non-negative");
366 |         if (max_linesearch <= 0)
367 |             throw std::invalid_argument("'max_linesearch' must be positive");
368 |         if (min_step < 0)
369 |             throw std::invalid_argument("'min_step' must be non-negative");
370 |         if (max_step < min_step)
371 |             throw std::invalid_argument("'max_step' must not be less than 'min_step'");
372 |         if (ftol <= 0 || ftol >= 0.5)
373 |             throw std::invalid_argument("'ftol' must satisfy 0 < ftol < 0.5");
374 |         if (wolfe <= ftol || wolfe >= 1)
375 |             throw std::invalid_argument("'wolfe' must satisfy ftol < wolfe < 1");
376 |     }
377 | };
378 | 
379 | }  // namespace LBFGSpp
380 | 
381 | #endif  // LBFGSPP_PARAM_H
382 | 
--------------------------------------------------------------------------------
/include/LBFGSpp/SubspaceMin.h:
--------------------------------------------------------------------------------
 1 | // Copyright (C) 2020-2025 Yixuan Qiu
 2 | // Under MIT license
 3 | 
 4 | #ifndef LBFGSPP_SUBSPACE_MIN_H
 5 | #define LBFGSPP_SUBSPACE_MIN_H
 6 | 
 7 | #include <stdexcept>
 8 | #include <vector>
 9 | #include <Eigen/Core>
10 | #include "BFGSMat.h"
11 | 
12 | /// \cond
13 | 
14 | namespace LBFGSpp {
15 | 
16 | //
17 | // Subspace minimization procedure of the L-BFGS-B algorithm,
18 | // mainly for internal use.
19 | //
20 | // The target of subspace minimization is to minimize the quadratic function m(x)
21 | // over the free variables, subject to the bound condition.
22 | // Free variables stand for coordinates that are not at the boundary in xcp,
23 | // the generalized Cauchy point.
24 | //
25 | // In the classical implementation of L-BFGS-B [1], the minimization is done by first
26 | // ignoring the box constraints, followed by a line search. Our implementation is
27 | // an exact minimization subject to the bounds, based on the BOXCQP algorithm [2].
28 | //
29 | // Reference:
30 | // [1] R. H. Byrd, P. Lu, and J. Nocedal (1995). A limited memory algorithm for bound constrained optimization.
31 | // [2] C. Voglis and I. E. Lagaris (2004). BOXCQP: An algorithm for bound constrained convex quadratic problems.
32 | //
33 | template <typename Scalar>
34 | class SubspaceMin
35 | {
36 | private:
37 |     using Vector = Eigen::Matrix<Scalar, Eigen::Dynamic, 1>;
38 |     using Matrix = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>;
39 |     using IndexSet = std::vector<int>;
40 | 
41 |     // v[ind]
42 |     static Vector subvec(const Vector& v, const IndexSet& ind)
43 |     {
44 |         const int nsub = ind.size();
45 |         Vector res(nsub);
46 |         for (int i = 0; i < nsub; i++)
47 |             res[i] = v[ind[i]];
48 |         return res;
49 |     }
50 | 
51 |     // v[ind] = rhs
52 |     static void subvec_assign(Vector& v, const IndexSet& ind, const Vector& rhs)
53 |     {
54 |         const int nsub = ind.size();
55 |         for (int i = 0; i < nsub; i++)
56 |             v[ind[i]] = rhs[i];
57 |     }
58 | 
59 |     // Check whether the vector is within the bounds
60 |     static bool in_bounds(const Vector& x, const Vector& lb, const Vector& ub)
61 |     {
62 |         const int n = x.size();
63 |         for (int i = 0; i < n; i++)
64 |         {
65 |             if (x[i] < lb[i] || x[i] > ub[i])
66 |                 return false;
67 |         }
68 |         return true;
69 |     }
70 | 
71 |     // Test convergence of P set
72 |     static bool P_converged(const IndexSet& yP_set, const Vector& vecy, const Vector& vecl, const Vector& vecu)
73 |     {
74 |         const int nP = yP_set.size();
75 |         for (int i = 0; i < nP; i++)
76 |         {
77 |             const int coord = yP_set[i];
78 |             if (vecy[coord] < vecl[coord] || vecy[coord] > vecu[coord])
79 |                 return false;
80 |         }
81 |         return true;
82 |     }
83 | 
84 |     // Test convergence of L set
85 |     static bool L_converged(const IndexSet& yL_set, const Vector& lambda)
86 |     {
87 |         const int nL = yL_set.size();
88 |         for (int i = 0; i < nL; i++)
89 |         {
90 |             const int coord = yL_set[i];
91 |             if (lambda[coord] < Scalar(0))
92 |                 return false;
93 |         }
94 |         return true;
95 |     }
96 | 
97 |     // Test convergence of U set
98 |     static bool U_converged(const IndexSet& yU_set, const Vector& mu)
99 |     {
100 |         const int nU = yU_set.size();
101 |         for (int i = 0; i < nU; i++)
102 |         {
103 |             const int coord = yU_set[i];
104 |             if (mu[coord] < Scalar(0))
105 |                 return false;
106 |         }
107 |         return true;
108 |     }
109 | 
110 | public:
111 |     // bfgs:       An object that represents the BFGS approximation matrix.
112 |     // x0:         Current parameter vector.
113 |     // xcp:        Computed generalized Cauchy point.
114 |     // g:          Gradient at x0.
115 |     // lb:         Lower bounds for x.
116 |     // ub:         Upper bounds for x.
117 |     // Wd:         W'(xcp - x0)
118 |     // newact_set: Coordinates that newly become active during the GCP procedure.
119 |     // fv_set:     Free variable set.
120 |     // maxit:      Maximum number of iterations.
121 |     // drt:        The output direction vector, drt = xsm - x0.
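    // [Editor's note, not in the original source] Sketch of the BOXCQP
    // iteration implemented below. The subproblem is min_y 0.5*y'By + c'y
    // subject to l <= y <= u, whose KKT conditions read
    //     B*y + c - lambda + mu = 0,    lambda >= 0,    mu >= 0,
    //     lambda[i] > 0 only if y[i] = l[i],    mu[i] > 0 only if y[i] = u[i].
    // Each iteration guesses the active sets L (y at the lower bound), U (y at
    // the upper bound), and P (strictly interior), solves the equality part
    // for y[P], lambda[L], and mu[U], and stops once the guess is
    // self-consistent, which is what L_converged(), U_converged(), and
    // P_converged() above check.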
122 |     static void subspace_minimize(
123 |         const BFGSMat<Scalar>& bfgs, const Vector& x0, const Vector& xcp, const Vector& g,
124 |         const Vector& lb, const Vector& ub, const Vector& Wd, const IndexSet& newact_set, const IndexSet& fv_set, int maxit,
125 |         Vector& drt)
126 |     {
127 |         // std::cout << "========================= Entering subspace minimization =========================\n\n";
128 | 
129 |         // d = xcp - x0
130 |         drt.noalias() = xcp - x0;
131 |         // Size of free variables
132 |         const int nfree = fv_set.size();
133 |         // If there is no free variable, simply return drt
134 |         if (nfree < 1)
135 |         {
136 |             // std::cout << "========================= (Early) leaving subspace minimization =========================\n\n";
137 |             return;
138 |         }
139 | 
140 |         // std::cout << "New active set = [ "; for(std::size_t i = 0; i < newact_set.size(); i++) std::cout << newact_set[i] << " "; std::cout << "]\n";
141 |         // std::cout << "Free variable set = [ "; for(std::size_t i = 0; i < fv_set.size(); i++) std::cout << fv_set[i] << " "; std::cout << "]\n\n";
142 | 
143 |         // Extract the rows of W in the free variable set
144 |         Matrix WF = bfgs.Wb(fv_set);
145 |         // Compute F'BAb = -F'WMW'AA'd
146 |         Vector vecc(nfree);
147 |         bfgs.compute_FtBAb(WF, fv_set, newact_set, Wd, drt, vecc);
148 |         // Set the vector c=F'BAb+F'g for linear term, and vectors l and u for the new bounds
149 |         Vector vecl(nfree), vecu(nfree);
150 |         for (int i = 0; i < nfree; i++)
151 |         {
152 |             const int coord = fv_set[i];
153 |             vecl[i] = lb[coord] - x0[coord];
154 |             vecu[i] = ub[coord] - x0[coord];
155 |             vecc[i] += g[coord];
156 |         }
157 |         // Solve y = -inv(B[F, F]) * c
158 |         Vector vecy(nfree);
159 |         bfgs.solve_PtBP(WF, -vecc, vecy);
160 |         // Test feasibility
161 |         // If yes, then the solution has been found
162 |         if (in_bounds(vecy, vecl, vecu))
163 |         {
164 |             subvec_assign(drt, fv_set, vecy);
165 |             return;
166 |         }
167 |         // Otherwise, enter the iterations
168 | 
169 |         // Make a copy of y as a fallback solution
170 |         Vector yfallback = vecy;
171 |         // Dual variables
172 |         Vector lambda = Vector::Zero(nfree), mu = Vector::Zero(nfree);
173 | 
174 |         // Iterations
175 |         IndexSet L_set, U_set, P_set, yL_set, yU_set, yP_set;
176 |         L_set.reserve(nfree / 3);
177 |         yL_set.reserve(nfree / 3);
178 |         U_set.reserve(nfree / 3);
179 |         yU_set.reserve(nfree / 3);
180 |         P_set.reserve(nfree);
181 |         yP_set.reserve(nfree);
182 |         int k;
183 |         for (k = 0; k < maxit; k++)
184 |         {
185 |             // Construct the L, U, and P sets, and then update values
186 |             // Indices in original drt vector
187 |             L_set.clear();
188 |             U_set.clear();
189 |             P_set.clear();
190 |             // Indices in y
191 |             yL_set.clear();
192 |             yU_set.clear();
193 |             yP_set.clear();
194 |             for (int i = 0; i < nfree; i++)
195 |             {
196 |                 const int coord = fv_set[i];
197 |                 const Scalar li = vecl[i], ui = vecu[i];
198 |                 if ((vecy[i] < li) || (vecy[i] == li && lambda[i] >= Scalar(0)))
199 |                 {
200 |                     L_set.push_back(coord);
201 |                     yL_set.push_back(i);
202 |                     vecy[i] = li;
203 |                     mu[i] = Scalar(0);
204 |                 }
205 |                 else if ((vecy[i] > ui) || (vecy[i] == ui && mu[i] >= Scalar(0)))
206 |                 {
207 |                     U_set.push_back(coord);
208 |                     yU_set.push_back(i);
209 |                     vecy[i] = ui;
210 |                     lambda[i] = Scalar(0);
211 |                 }
212 |                 else
213 |                 {
214 |                     P_set.push_back(coord);
215 |                     yP_set.push_back(i);
216 |                     lambda[i] = Scalar(0);
217 |                     mu[i] = Scalar(0);
218 |                 }
219 |             }
220 | 
221 |             /* std::cout << "** Iter " << k << " **\n";
222 |                std::cout << "   L = [ "; for(std::size_t i = 0; i < L_set.size(); i++) std::cout << L_set[i] << " "; std::cout << "]\n";
223 |                std::cout << "   U = [ "; for(std::size_t i = 0; i < U_set.size(); i++) std::cout << U_set[i] << " "; std::cout << "]\n";
224 |                std::cout << "   P = [ "; for(std::size_t i = 0; i < P_set.size(); i++) std::cout << P_set[i] << " "; std::cout << "]\n\n"; */
225 | 
226 |             // Extract the rows of W in the P set
227 |             Matrix WP = bfgs.Wb(P_set);
228 |             // Solve y[P] = -inv(B[P, P]) * (B[P, L] * l[L] + B[P, U] * u[U] + c[P])
229 |             const int nP = P_set.size();
230 |             if (nP > 0)
231 |             {
232 |                 Vector rhs = subvec(vecc, yP_set);
233 |                 Vector lL = subvec(vecl, yL_set);
234 |                 Vector uU = subvec(vecu, yU_set);
235 |                 Vector tmp(nP);
236 |                 bool nonzero = bfgs.apply_PtBQv(WP, L_set, lL, tmp, true);
237 |                 if (nonzero)
238 |                     rhs.noalias() += tmp;
239 |                 nonzero = bfgs.apply_PtBQv(WP, U_set, uU, tmp, true);
240 |                 if (nonzero)
241 |                     rhs.noalias() += tmp;
242 | 
243 |                 bfgs.solve_PtBP(WP, -rhs, tmp);
244 |                 subvec_assign(vecy, yP_set, tmp);
245 |             }
246 | 
247 |             // Solve lambda[L] = B[L, F] * y + c[L]
248 |             const int nL = L_set.size();
249 |             const int nU = U_set.size();
250 |             Vector Fy;
251 |             if (nL > 0 || nU > 0)
252 |                 bfgs.apply_WtPv(fv_set, vecy, Fy);
253 |             if (nL > 0)
254 |             {
255 |                 Vector res;
256 |                 bfgs.apply_PtWMv(L_set, Fy, res, Scalar(-1));
257 |                 res.noalias() += subvec(vecc, yL_set) + bfgs.theta() * subvec(vecy, yL_set);
258 |                 subvec_assign(lambda, yL_set, res);
259 |             }
260 | 
261 |             // Solve mu[U] = -B[U, F] * y - c[U]
262 |             if (nU > 0)
263 |             {
264 |                 Vector negRes;
265 |                 bfgs.apply_PtWMv(U_set, Fy, negRes, Scalar(-1));
266 |                 negRes.noalias() += subvec(vecc, yU_set) + bfgs.theta() * subvec(vecy, yU_set);
267 |                 subvec_assign(mu, yU_set, -negRes);
268 |             }
269 | 
270 |             // Test convergence
271 |             if (L_converged(yL_set, lambda) && U_converged(yU_set, mu) && P_converged(yP_set, vecy, vecl, vecu))
272 |                 break;
273 |         }
274 | 
275 |         // If the iterations do not converge, try the projection
276 |         if (k >= maxit)
277 |         {
278 |             vecy.noalias() = vecy.cwiseMax(vecl).cwiseMin(vecu);
279 |             subvec_assign(drt, fv_set, vecy);
280 |             // Test whether drt is a descent direction
281 |             Scalar dg = drt.dot(g);
282 |             // If yes, return the result
283 |             if (dg <= -std::numeric_limits<Scalar>::epsilon())
284 |                 return;
285 | 
286 |             // If not, fall back to the projected unconstrained solution
287 |             vecy.noalias() = yfallback.cwiseMax(vecl).cwiseMin(vecu);
288 |             subvec_assign(drt, fv_set, vecy);
289 |             dg = drt.dot(g);
290 |             if (dg <= -std::numeric_limits<Scalar>::epsilon())
291 |                 return;
292 | 
293 |             // If still not, fall back to the unconstrained solution
294 |             subvec_assign(drt, fv_set, yfallback);
295 |             return;
296 |         }
297 | 
298 |         // std::cout << "** Minimization finished in " << k + 1 << " iteration(s) **\n\n";
299 |         // std::cout << "========================= Leaving subspace minimization =========================\n\n";
300 | 
301 |         subvec_assign(drt, fv_set, vecy);
302 |     }
303 | };
304 | 
305 | }  // namespace LBFGSpp
306 | 
307 | /// \endcond
308 | 
309 | #endif  // LBFGSPP_SUBSPACE_MIN_H
310 | 
--------------------------------------------------------------------------------
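Editor's addition: the parameter classes in Param.h are self-contained, so their use can be illustrated without the solver headers. The sketch below is not part of the repository; it exercises only the LBFGSParam interface shown above, and the build command and include paths are assumptions about a typical setup (Eigen must be on the include path, since Param.h includes <Eigen/Core>).

// build (assumed layout): g++ -I include -I /path/to/eigen example.cpp
#include <iostream>
#include <stdexcept>
#include <LBFGSpp/Param.h>

int main()
{
    LBFGSpp::LBFGSParam<double> param;   // defaults: m = 6, ftol = 1e-4, wolfe = 0.9, ...
    param.m = 10;                        // keep more correction pairs
    param.epsilon = 1e-6;                // tighter absolute gradient tolerance
    param.max_iterations = 100;          // 0 would mean "run until convergence or error"
    param.linesearch = LBFGSpp::LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE;

    try
    {
        param.check_param();             // throws std::invalid_argument on bad settings
        std::cout << "parameters are valid\n";
    }
    catch (const std::invalid_argument& e)
    {
        std::cout << "invalid parameter: " << e.what() << '\n';
    }
    return 0;
}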