LBFGS++
Loading...
Searching...
No Matches
LineSearchMoreThuente.h
1// Copyright (C) 2020-2023 Yixuan Qiu <yixuan.qiu@cos.name>
2// Under MIT license
3
4#ifndef LBFGSPP_LINE_SEARCH_MORE_THUENTE_H
5#define LBFGSPP_LINE_SEARCH_MORE_THUENTE_H
6
7#include <stdexcept> // std::invalid_argument, std::runtime_error
8#include <Eigen/Core>
9#include "LBFGSpp/Param.h"
10
11namespace LBFGSpp {
12
25template <typename Scalar>
27{
28private:
29 using Vector = Eigen::Matrix<Scalar, Eigen::Dynamic, 1>;
30
31 // Minimizer of a quadratic function q(x) = c0 + c1 * x + c2 * x^2
32 // that interpolates fa, ga, and fb, assuming the minimizer exists
33 // For case I: fb >= fa and ga * (b - a) < 0
34 static Scalar quadratic_minimizer(const Scalar& a, const Scalar& b, const Scalar& fa, const Scalar& ga, const Scalar& fb)
35 {
36 const Scalar ba = b - a;
37 const Scalar w = Scalar(0.5) * ba * ga / (fa - fb + ba * ga);
38 return a + w * ba;
39 }
40
41 // Minimizer of a quadratic function q(x) = c0 + c1 * x + c2 * x^2
42 // that interpolates fa, ga and gb, assuming the minimizer exists
43 // The result actually does not depend on fa
44 // For case II: ga * (b - a) < 0, ga * gb < 0
45 // For case III: ga * (b - a) < 0, ga * ga >= 0, |gb| <= |ga|
46 static Scalar quadratic_minimizer(const Scalar& a, const Scalar& b, const Scalar& ga, const Scalar& gb)
47 {
48 const Scalar w = ga / (ga - gb);
49 return a + w * (b - a);
50 }
51
52 // Local minimizer of a cubic function q(x) = c0 + c1 * x + c2 * x^2 + c3 * x^3
53 // that interpolates fa, ga, fb and gb, assuming a != b
54 // Also sets a flag indicating whether the minimizer exists
55 static Scalar cubic_minimizer(const Scalar& a, const Scalar& b, const Scalar& fa, const Scalar& fb,
56 const Scalar& ga, const Scalar& gb, bool& exists)
57 {
58 using std::abs;
59 using std::sqrt;
60
61 const Scalar apb = a + b;
62 const Scalar ba = b - a;
63 const Scalar ba2 = ba * ba;
64 const Scalar fba = fb - fa;
65 const Scalar gba = gb - ga;
66 // z3 = c3 * (b-a)^3, z2 = c2 * (b-a)^3, z1 = c1 * (b-a)^3
67 const Scalar z3 = (ga + gb) * ba - Scalar(2) * fba;
68 const Scalar z2 = Scalar(0.5) * (gba * ba2 - Scalar(3) * apb * z3);
69 const Scalar z1 = fba * ba2 - apb * z2 - (a * apb + b * b) * z3;
70 // std::cout << "z1 = " << z1 << ", z2 = " << z2 << ", z3 = " << z3 << std::endl;
71
72 // If c3 = z/(b-a)^3 == 0, reduce to quadratic problem
73 const Scalar eps = std::numeric_limits<Scalar>::epsilon();
74 if (abs(z3) < eps * abs(z2) || abs(z3) < eps * abs(z1))
75 {
76 // Minimizer exists if c2 > 0
77 exists = (z2 * ba > Scalar(0));
78 // Return the end point if the minimizer does not exist
79 return exists ? (-Scalar(0.5) * z1 / z2) : b;
80 }
81
82 // Now we can assume z3 > 0
83 // The minimizer is a solution to the equation c1 + 2*c2 * x + 3*c3 * x^2 = 0
84 // roots = -(z2/z3) / 3 (+-) sqrt((z2/z3)^2 - 3 * (z1/z3)) / 3
85 //
86 // Let u = z2/(3z3) and v = z1/z2
87 // The minimizer exists if v/u <= 1
88 const Scalar u = z2 / (Scalar(3) * z3), v = z1 / z2;
89 const Scalar vu = v / u;
90 exists = (vu <= Scalar(1));
91 if (!exists)
92 return b;
93
94 // We need to find a numerically stable way to compute the roots, as z3 may still be small
95 //
96 // If |u| >= |v|, let w = 1 + sqrt(1-v/u), and then
97 // r1 = -u * w, r2 = -v / w, r1 does not need to be the smaller one
98 //
99 // If |u| < |v|, we must have uv <= 0, and then
100 // r = -u (+-) sqrt(delta), where
101 // sqrt(delta) = sqrt(|u|) * sqrt(|v|) * sqrt(1-u/v)
102 Scalar r1 = Scalar(0), r2 = Scalar(0);
103 if (abs(u) >= abs(v))
104 {
105 const Scalar w = Scalar(1) + sqrt(Scalar(1) - vu);
106 r1 = -u * w;
107 r2 = -v / w;
108 }
109 else
110 {
111 const Scalar sqrtd = sqrt(abs(u)) * sqrt(abs(v)) * sqrt(1 - u / v);
112 r1 = -u - sqrtd;
113 r2 = -u + sqrtd;
114 }
115 return (z3 * ba > Scalar(0)) ? ((std::max)(r1, r2)) : ((std::min)(r1, r2));
116 }
117
118 // Select the next step size according to the current step sizes,
119 // function values, and derivatives
120 static Scalar step_selection(
121 const Scalar& al, const Scalar& au, const Scalar& at,
122 const Scalar& fl, const Scalar& fu, const Scalar& ft,
123 const Scalar& gl, const Scalar& gu, const Scalar& gt)
124 {
125 using std::abs;
126
127 if (al == au)
128 return al;
129
130 // If ft = Inf or gt = Inf, we return the middle point of al and at
131 if (!std::isfinite(ft) || !std::isfinite(gt))
132 return (al + at) / Scalar(2);
133
134 // ac: cubic interpolation of fl, ft, gl, gt
135 // aq: quadratic interpolation of fl, gl, ft
136 bool ac_exists;
137 // std::cout << "al = " << al << ", at = " << at << ", fl = " << fl << ", ft = " << ft << ", gl = " << gl << ", gt = " << gt << std::endl;
138 const Scalar ac = cubic_minimizer(al, at, fl, ft, gl, gt, ac_exists);
139 const Scalar aq = quadratic_minimizer(al, at, fl, gl, ft);
140 // std::cout << "ac = " << ac << ", aq = " << aq << std::endl;
141 // Case 1: ft > fl
142 if (ft > fl)
143 {
144 // This should not happen if ft > fl, but just to be safe
145 if (!ac_exists)
146 return aq;
147 // Then use the scheme described in the paper
148 return (abs(ac - al) < abs(aq - al)) ? ac : ((aq + ac) / Scalar(2));
149 }
150
151 // as: quadratic interpolation of gl and gt
152 const Scalar as = quadratic_minimizer(al, at, gl, gt);
153 // Case 2: ft <= fl, gt * gl < 0
154 if (gt * gl < Scalar(0))
155 return (abs(ac - at) >= abs(as - at)) ? ac : as;
156
157 // Case 3: ft <= fl, gt * gl >= 0, |gt| < |gl|
158 const Scalar deltal = Scalar(1.1), deltau = Scalar(0.66);
159 if (abs(gt) < abs(gl))
160 {
161 // We choose either ac or as
162 // The case for ac: 1. It exists, and
163 // 2. ac is farther than at from al, and
164 // 3. ac is closer to at than as
165 // Cases for as: otherwise
166 const Scalar res = (ac_exists &&
167 (ac - at) * (at - al) > Scalar(0) &&
168 abs(ac - at) < abs(as - at)) ?
169 ac :
170 as;
171 // Postprocessing the chosen step
172 return (at > al) ?
173 std::min(at + deltau * (au - at), res) :
174 std::max(at + deltau * (au - at), res);
175 }
176
177 // Simple extrapolation if au, fu, or gu is infinity
178 if ((!std::isfinite(au)) || (!std::isfinite(fu)) || (!std::isfinite(gu)))
179 return at + deltal * (at - al);
180
181 // ae: cubic interpolation of ft, fu, gt, gu
182 bool ae_exists;
183 const Scalar ae = cubic_minimizer(at, au, ft, fu, gt, gu, ae_exists);
184 // Case 4: ft <= fl, gt * gl >= 0, |gt| >= |gl|
185 // The following is not used in the paper, but it seems to be a reasonable safeguard
186 return (at > al) ?
187 std::min(at + deltau * (au - at), ae) :
188 std::max(at + deltau * (au - at), ae);
189 }
190
191public:
213 template <typename Foo, typename SolverParam>
214 static void LineSearch(Foo& f, const SolverParam& param,
215 const Vector& xp, const Vector& drt, const Scalar& step_max,
216 Scalar& step, Scalar& fx, Vector& grad, Scalar& dg, Vector& x)
217 {
218 using std::abs;
219 // std::cout << "========================= Entering line search =========================\n\n";
220
221 // Check the value of step
222 if (step <= Scalar(0))
223 throw std::invalid_argument("'step' must be positive");
224 if (step > step_max)
225 throw std::invalid_argument("'step' exceeds 'step_max'");
226
227 // Save the function value at the current x
228 const Scalar fx_init = fx;
229 // Projection of gradient on the search direction
230 const Scalar dg_init = dg;
231
232 // std::cout << "fx_init = " << fx_init << ", dg_init = " << dg_init << std::endl << std::endl;
233
234 // Make sure d points to a descent direction
235 if (dg_init >= Scalar(0))
236 throw std::logic_error("the moving direction does not decrease the objective function value");
237
238 // Tolerance for convergence test
239 // Sufficient decrease
240 const Scalar test_decr = param.ftol * dg_init;
241 // Curvature
242 const Scalar test_curv = -param.wolfe * dg_init;
243
244 // The bracketing interval
245 Scalar I_lo = Scalar(0), I_hi = std::numeric_limits<Scalar>::infinity();
246 Scalar fI_lo = Scalar(0), fI_hi = std::numeric_limits<Scalar>::infinity();
247 Scalar gI_lo = (Scalar(1) - param.ftol) * dg_init, gI_hi = std::numeric_limits<Scalar>::infinity();
248 // We also need to save x and grad for step=I_lo, since we want to return the best
249 // step size along the path when strong Wolfe condition is not met
250 Vector x_lo = xp, grad_lo = grad;
251 Scalar fx_lo = fx_init, dg_lo = dg_init;
252
253 // Function value and gradient at the current step size
254 x.noalias() = xp + step * drt;
255 fx = f(x, grad);
256 dg = grad.dot(drt);
257
258 // std::cout << "max_step = " << step_max << ", step = " << step << ", fx = " << fx << ", dg = " << dg << std::endl;
259
260 // Convergence test
261 if (fx <= fx_init + step * test_decr && abs(dg) <= test_curv)
262 {
263 // std::cout << "** Criteria met\n\n";
264 // std::cout << "========================= Leaving line search =========================\n\n";
265 return;
266 }
267
268 // Extrapolation factor
269 const Scalar delta = Scalar(1.1);
270 int iter;
271 for (iter = 0; iter < param.max_linesearch; iter++)
272 {
273 // ft = psi(step) = f(xp + step * drt) - f(xp) - step * test_decr
274 // gt = psi'(step) = dg - mu * dg_init
275 // mu = param.ftol
276 const Scalar ft = fx - fx_init - step * test_decr;
277 const Scalar gt = dg - param.ftol * dg_init;
278
279 // Update step size and bracketing interval
280 Scalar new_step;
281 if (ft > fI_lo)
282 {
283 // Case 1: ft > fl
284 new_step = step_selection(I_lo, I_hi, step, fI_lo, fI_hi, ft, gI_lo, gI_hi, gt);
285 // Sanity check: if the computed new_step is too small, typically due to
286 // extremely large value of ft, switch to the middle point
287 if (new_step <= param.min_step)
288 new_step = (I_lo + step) / Scalar(2);
289
290 I_hi = step;
291 fI_hi = ft;
292 gI_hi = gt;
293
294 // std::cout << "Case 1: new step = " << new_step << std::endl;
295 }
296 else if (gt * (I_lo - step) > Scalar(0))
297 {
298 // Case 2: ft <= fl, gt * (al - at) > 0
299 //
300 // Page 291 of Moré and Thuente (1994) suggests that
301 // newat = min(at + delta * (at - al), amax), delta in [1.1, 4]
302 new_step = std::min(step_max, step + delta * (step - I_lo));
303
304 // We can also consider the following scheme:
305 // First let step_selection() decide a value, and then project to the range above
306 //
307 // new_step = step_selection(I_lo, I_hi, step, fI_lo, fI_hi, ft, gI_lo, gI_hi, gt);
308 // const Scalar delta2 = Scalar(4)
309 // const Scalar t1 = step + delta * (step - I_lo);
310 // const Scalar t2 = step + delta2 * (step - I_lo);
311 // const Scalar tl = std::min(t1, t2), tu = std::max(t1, t2);
312 // new_step = std::min(tu, std::max(tl, new_step));
313 // new_step = std::min(step_max, new_step);
314
315 I_lo = step;
316 fI_lo = ft;
317 gI_lo = gt;
318 // Move x and grad to x_lo and grad_lo, respectively
319 x_lo.swap(x);
320 grad_lo.swap(grad);
321 fx_lo = fx;
322 dg_lo = dg;
323
324 // std::cout << "Case 2: new step = " << new_step << std::endl;
325 }
326 else
327 {
328 // Case 3: ft <= fl, gt * (al - at) <= 0
329 new_step = step_selection(I_lo, I_hi, step, fI_lo, fI_hi, ft, gI_lo, gI_hi, gt);
330
331 I_hi = I_lo;
332 fI_hi = fI_lo;
333 gI_hi = gI_lo;
334
335 I_lo = step;
336 fI_lo = ft;
337 gI_lo = gt;
338 // Move x and grad to x_lo and grad_lo, respectively
339 x_lo.swap(x);
340 grad_lo.swap(grad);
341 fx_lo = fx;
342 dg_lo = dg;
343
344 // std::cout << "Case 3: new step = " << new_step << std::endl;
345 }
346
347 // Case 1 and 3 are interpolations, whereas Case 2 is extrapolation
348 // This means that Case 2 may return new_step = step_max,
349 // and we need to decide whether to accept this value
350 // 1. If both step and new_step equal to step_max, it means
351 // step will have no further change, so we accept it
352 // 2. Otherwise, we need to test the function value and gradient
353 // on step_max, and decide later
354
355 // In case step, new_step, and step_max are equal, directly return the computed x and fx
356 if (step == step_max && new_step >= step_max)
357 {
358 // std::cout << "** Maximum step size reached\n\n";
359 // std::cout << "========================= Leaving line search =========================\n\n";
360
361 // Move {x, grad}_lo back before returning
362 x.swap(x_lo);
363 grad.swap(grad_lo);
364 return;
365 }
366 // Otherwise, recompute x and fx based on new_step
367 step = new_step;
368
369 if (step < param.min_step)
370 throw std::runtime_error("the line search step became smaller than the minimum value allowed");
371
372 if (step > param.max_step)
373 throw std::runtime_error("the line search step became larger than the maximum value allowed");
374
375 // Update parameter, function value, and gradient
376 x.noalias() = xp + step * drt;
377 fx = f(x, grad);
378 dg = grad.dot(drt);
379
380 // std::cout << "step = " << step << ", fx = " << fx << ", dg = " << dg << std::endl;
381
382 // Convergence test
383 if (fx <= fx_init + step * test_decr && abs(dg) <= test_curv)
384 {
385 // std::cout << "** Criteria met\n\n";
386 // std::cout << "========================= Leaving line search =========================\n\n";
387 return;
388 }
389
390 // Now assume step = step_max, and we need to decide whether to
391 // exit the line search (see the comments above regarding step_max)
392 // If we reach here, it means this step size does not pass the convergence
393 // test, so either the sufficient decrease condition or the curvature
394 // condition is not met yet
395 //
396 // Typically the curvature condition is harder to meet, and it is
397 // possible that no step size in [0, step_max] satisfies the condition
398 //
399 // But we need to make sure that its psi function value is smaller than
400 // the best one so far. If not, go to the next iteration and find a better one
401 if (step >= step_max)
402 {
403 const Scalar ft_bound = fx - fx_init - step * test_decr;
404 if (ft_bound <= fI_lo)
405 {
406 // std::cout << "** Maximum step size reached\n\n";
407 // std::cout << "========================= Leaving line search =========================\n\n";
408 return;
409 }
410 }
411 }
412
413 // If we have used up all line search iterations, then the strong Wolfe condition
414 // is not met. We choose not to raise an exception (unless no step satisfying
415 // sufficient decrease is found), but to return the best step size so far
416 if (iter >= param.max_linesearch)
417 {
418 // throw std::runtime_error("the line search routine reached the maximum number of iterations");
419
420 // First test whether the last step is better than I_lo
421 // If yes, return the last step
422 const Scalar ft = fx - fx_init - step * test_decr;
423 if (ft <= fI_lo)
424 return;
425
426 // If not, then the best step size so far is I_lo, but it needs to be positive
427 if (I_lo <= Scalar(0))
428 throw std::runtime_error("the line search routine is unable to sufficiently decrease the function value");
429
430 // Return everything with _lo
431 step = I_lo;
432 fx = fx_lo;
433 dg = dg_lo;
434 // Move {x, grad}_lo back
435 x.swap(x_lo);
436 grad.swap(grad_lo);
437 return;
438 }
439 }
440};
441
442} // namespace LBFGSpp
443
444#endif // LBFGSPP_LINE_SEARCH_MORE_THUENTE_H
static void LineSearch(Foo &f, const SolverParam &param, const Vector &xp, const Vector &drt, const Scalar &step_max, Scalar &step, Scalar &fx, Vector &grad, Scalar &dg, Vector &x)