// LBFGS++ 0.4.0
// A header-only C++ library for L-BFGS and L-BFGS-B algorithms.
// File: LineSearchMoreThuente.h
1// Copyright (C) 2020-2026 Yixuan Qiu <yixuan.qiu@cos.name>
2// Under MIT license
3
4#ifndef LBFGSPP_LINE_SEARCH_MORE_THUENTE_H
5#define LBFGSPP_LINE_SEARCH_MORE_THUENTE_H
6
#include <algorithm>  // std::min, std::max
#include <cmath>      // std::abs, std::sqrt, std::isfinite
#include <limits>     // std::numeric_limits
#include <stdexcept>  // std::invalid_argument, std::runtime_error

#include <Eigen/Core>

#include "Param.h"
10
11namespace LBFGSpp {
12
template <typename Scalar>
class LineSearchMoreThuente
{
private:
    using Vector = Eigen::Matrix<Scalar, Eigen::Dynamic, 1>;

    // Minimizer of the quadratic q(x) = c0 + c1 * x + c2 * x^2 that
    // interpolates the values fa = q(a), fb = q(b) and the slope ga = q'(a).
    // The caller guarantees that the minimizer exists.
    // For case I: fb >= fa and ga * (b - a) < 0
    static Scalar quadratic_minimizer(const Scalar& a, const Scalar& b, const Scalar& fa, const Scalar& ga, const Scalar& fb)
    {
        // Writing q(x) in terms of t = (x - a) / (b - a), the stationary
        // point is at t* = 0.5 * len * ga / (fa - fb + len * ga)
        const Scalar len = b - a;
        const Scalar numer = Scalar(0.5) * len * ga;
        const Scalar denom = fa - fb + len * ga;
        return a + (numer / denom) * len;
    }
40
    // Minimizer of a quadratic function q(x) = c0 + c1 * x + c2 * x^2
    // that interpolates the derivatives ga = q'(a) and gb = q'(b),
    // assuming the minimizer exists
    // The result actually does not depend on fa
    // For case II:  ga * (b - a) < 0, ga * gb < 0
    // For case III: ga * (b - a) < 0, ga * gb >= 0, |gb| <= |ga|
    static Scalar quadratic_minimizer(const Scalar& a, const Scalar& b, const Scalar& ga, const Scalar& gb)
    {
        // Linear interpolation of q'(x) between a and b: the root of
        // q'(x) = 0 lies at the fraction w = ga / (ga - gb) of the interval
        const Scalar w = ga / (ga - gb);
        return a + w * (b - a);
    }
51
52 // Local minimizer of a cubic function q(x) = c0 + c1 * x + c2 * x^2 + c3 * x^3
53 // that interpolates fa, ga, fb and gb, assuming a != b
54 // Also sets a flag indicating whether the minimizer exists
55 static Scalar cubic_minimizer(const Scalar& a, const Scalar& b, const Scalar& fa, const Scalar& fb,
56 const Scalar& ga, const Scalar& gb, bool& exists)
57 {
58 using std::abs;
59 using std::sqrt;
60
61 const Scalar apb = a + b;
62 const Scalar ba = b - a;
63 const Scalar ba2 = ba * ba;
64 const Scalar fba = fb - fa;
65 const Scalar gba = gb - ga;
66 // z3 = c3 * (b-a)^3, z2 = c2 * (b-a)^3, z1 = c1 * (b-a)^3
67 const Scalar z3 = (ga + gb) * ba - Scalar(2) * fba;
68 const Scalar z2 = Scalar(0.5) * (gba * ba2 - Scalar(3) * apb * z3);
69 const Scalar z1 = fba * ba2 - apb * z2 - (a * apb + b * b) * z3;
70 // std::cout << "z1 = " << z1 << ", z2 = " << z2 << ", z3 = " << z3 << std::endl;
71
72 // If c3 = z/(b-a)^3 == 0, reduce to quadratic problem
73 const Scalar eps = std::numeric_limits<Scalar>::epsilon();
74 if (abs(z3) < eps * abs(z2) || abs(z3) < eps * abs(z1))
75 {
76 // Minimizer exists if c2 > 0
77 exists = (z2 * ba > Scalar(0));
78 // Return the end point if the minimizer does not exist
79 return exists ? (-Scalar(0.5) * z1 / z2) : b;
80 }
81
82 // Now we can assume z3 > 0
83 // The minimizer is a solution to the equation c1 + 2*c2 * x + 3*c3 * x^2 = 0
84 // roots = -(z2/z3) / 3 (+-) sqrt((z2/z3)^2 - 3 * (z1/z3)) / 3
85 //
86 // Let u = z2/(3z3) and v = z1/z2
87 // The minimizer exists if v/u <= 1
88 const Scalar u = z2 / (Scalar(3) * z3), v = z1 / z2;
89 const Scalar vu = v / u;
90 exists = (vu <= Scalar(1));
91 if (!exists)
92 return b;
93
94 // We need to find a numerically stable way to compute the roots, as z3 may still be small
95 //
96 // If |u| >= |v|, let w = 1 + sqrt(1-v/u), and then
97 // r1 = -u * w, r2 = -v / w, r1 does not need to be the smaller one
98 //
99 // If |u| < |v|, we must have uv <= 0, and then
100 // r = -u (+-) sqrt(delta), where
101 // sqrt(delta) = sqrt(|u|) * sqrt(|v|) * sqrt(1-u/v)
102 Scalar r1 = Scalar(0), r2 = Scalar(0);
103 if (abs(u) >= abs(v))
104 {
105 const Scalar w = Scalar(1) + sqrt(Scalar(1) - vu);
106 r1 = -u * w;
107 r2 = -v / w;
108 }
109 else
110 {
111 const Scalar sqrtd = sqrt(abs(u)) * sqrt(abs(v)) * sqrt(1 - u / v);
112 r1 = -u - sqrtd;
113 r2 = -u + sqrtd;
114 }
115 return (z3 * ba > Scalar(0)) ? ((std::max)(r1, r2)) : ((std::min)(r1, r2));
116 }
117
    // Select the next step size according to the current step sizes,
    // function values, and derivatives
    //
    // [al, au] is the current bracketing interval (au may be smaller than al,
    // or infinite); at is the current trial step. (fl, fu, ft) and (gl, gu, gt)
    // are the function values and derivatives at al, au, and at, respectively.
    // Implements the 4-case trial value selection of Moré & Thuente (1994), Sec. 4
    static Scalar step_selection(
        const Scalar& al, const Scalar& au, const Scalar& at,
        const Scalar& fl, const Scalar& fu, const Scalar& ft,
        const Scalar& gl, const Scalar& gu, const Scalar& gt)
    {
        using std::abs;

        // Degenerate interval: no choice to make
        if (al == au)
            return al;

        // If ft = Inf or gt = Inf, we return the middle point of al and at
        if (!std::isfinite(ft) || !std::isfinite(gt))
            return (al + at) / Scalar(2);

        // ac: cubic interpolation of fl, ft, gl, gt
        // aq: quadratic interpolation of fl, gl, ft
        bool ac_exists;
        // std::cout << "al = " << al << ", at = " << at << ", fl = " << fl << ", ft = " << ft << ", gl = " << gl << ", gt = " << gt << std::endl;
        const Scalar ac = cubic_minimizer(al, at, fl, ft, gl, gt, ac_exists);
        const Scalar aq = quadratic_minimizer(al, at, fl, gl, ft);
        // std::cout << "ac = " << ac << ", aq = " << aq << std::endl;
        // Case 1: ft > fl
        if (ft > fl)
        {
            // This should not happen if ft > fl, but just to be safe
            if (!ac_exists)
                return aq;
            // Then use the scheme described in the paper:
            // take ac if it is closer to al than aq, otherwise their average
            return (abs(ac - al) < abs(aq - al)) ? ac : ((aq + ac) / Scalar(2));
        }

        // as: quadratic interpolation of gl and gt
        const Scalar as = quadratic_minimizer(al, at, gl, gt);
        // Case 2: ft <= fl, gt * gl < 0
        // Take whichever of ac/as is farther from the trial point at
        if (gt * gl < Scalar(0))
            return (abs(ac - at) >= abs(as - at)) ? ac : as;

        // Case 3: ft <= fl, gt * gl >= 0, |gt| < |gl|
        // deltal > 1 is the extrapolation factor, deltau < 1 limits how far
        // the new step may move towards au (safeguard from the paper)
        const Scalar deltal = Scalar(1.1), deltau = Scalar(0.66);
        if (abs(gt) < abs(gl))
        {
            // We choose either ac or as
            // The case for ac: 1. It exists, and
            //                  2. ac is farther than at from al, and
            //                  3. ac is closer to at than as
            // Cases for as: otherwise
            const Scalar res = (ac_exists &&
                                (ac - at) * (at - al) > Scalar(0) &&
                                abs(ac - at) < abs(as - at)) ?
                ac :
                as;
            // Postprocessing the chosen step: clip it so it does not pass
            // beyond at + deltau * (au - at) in the direction of au
            return (at > al) ?
                (std::min)(at + deltau * (au - at), res) :
                (std::max)(at + deltau * (au - at), res);
        }

        // Simple extrapolation if au, fu, or gu is infinity
        if ((!std::isfinite(au)) || (!std::isfinite(fu)) || (!std::isfinite(gu)))
            return at + deltal * (at - al);

        // ae: cubic interpolation of ft, fu, gt, gu
        bool ae_exists;
        const Scalar ae = cubic_minimizer(at, au, ft, fu, gt, gu, ae_exists);
        // Case 4: ft <= fl, gt * gl >= 0, |gt| >= |gl|
        // The following safeguard is not used in the paper, but it seems to be reasonable
        return (at > al) ?
            (std::min)(at + deltau * (au - at), ae) :
            (std::max)(at + deltau * (au - at), ae);
    }
190
public:
    ///
    /// Line search by Moré and Thuente (1994).
    ///
    /// \param f        A function object such that `f(x, grad)` returns the
    ///                 objective function value at `x`, and overwrites `grad`
    ///                 with the gradient.
    /// \param param    Parameters for the solver (`ftol`, `wolfe`, `min_step`,
    ///                 `max_linesearch` are read here).
    /// \param xp       The current point.
    /// \param drt      The current moving direction.
    /// \param step_max The upper bound for the step size.
    /// \param step     In: The initial step size.
    ///                 Out: The calculated step size.
    /// \param fx       In: The objective function value at the current point.
    ///                 Out: The function value at the new point.
    /// \param grad     In: The current gradient vector.
    ///                 Out: The gradient at the new point.
    /// \param dg       In: The inner product between drt and grad.
    ///                 Out: The inner product between drt and the new gradient.
    /// \param x        Out: The new point moved to.
    ///
    /// \throws std::invalid_argument if the initial step violates its bounds.
    /// \throws std::logic_error if drt is not a descent direction.
    ///
    template <typename Foo, typename SolverParam>
    static void LineSearch(Foo& f, const SolverParam& param,
                           const Vector& xp, const Vector& drt, const Scalar& step_max,
                           Scalar& step, Scalar& fx, Vector& grad, Scalar& dg, Vector& x)
    {
        using std::abs;
        // std::cout << "========================= Entering line search =========================\n\n";

        // The strong Wolfe conditions are:
        //     f(x + alpha * d) <= f(x) + alpha * mu * g^T d,
        //     |g(x + alpha * d)^T d| <= eta * |g^T d|,
        // where mu and eta are two constants, and we typically require 0 < mu < eta < 1.
        //
        // Let phi(alpha) = f(x + alpha * d), and then the conditions reduce to
        //     phi(alpha) <= phi(0) + mu * phi'(0) * alpha,
        //     |phi'(alpha)| <= eta * |phi'(0)|.
        //
        // [1] defines a set
        //     T(mu) = {alpha > 0: phi(alpha) <= phi(0) + mu * phi'(0) * alpha,
        //                         |phi'(alpha)| <= mu * |phi'(0)| }.
        // Clearly, if we have found some alpha that belongs to T(mu), and if mu < eta,
        // then this alpha must satisfy the strong Wolfe conditions.
        //
        // In a practical implementation, we also impose bounds on alpha:
        //     alpha in Ia = [alpha_min, alpha_max]
        // The lower bound alpha_min is used to terminate the iteration.
        // The upper bound is typically used for constrained problems (e.g., L-BFGS-B).
        //
        // The overall framework of the algorithm is to generate a sequence of iterates alpha_k
        // and a sequence of intervals I_k such that:
        //     1. alpha_k is in intersect(I_k, Ia).
        //     2. I_k eventually is a subset of Ia.
        //     3. The length of I_k is shrinking.
        // In each iteration, we test whether alpha_k satisfies the strong Wolfe conditions,
        // and will exit the line search when it meets. Other possible exiting conditions are:
        //     1. alpha reaches alpha_max.
        //     2. alpha reaches alpha_min.
        //     3. The maximum number of iterations is attained.
        //
        // To achieve this goal, we need to make sure that we can generate an interval I such that
        // intersect(I, T(mu)) is not empty, and that for the updated interval I+, intersect(I+, T(mu))
        // is also not empty. Theorem 2.1 of [1] gives conditions for this, which first defines
        // an auxiliary function:
        //     psi(alpha) = phi(alpha) - phi(0) - mu * phi'(0) * alpha.
        // Theorem 2.1 of [1] states that for an interval I = [alpha_l, alpha_u]
        // (alpha_u can be smaller than alpha_l), if
        //     psi(alpha_l) <= psi(alpha_u),               (*)
        //     psi(alpha_l) <= 0,                          (*)
        //     psi'(alpha_l) * (alpha_u - alpha_l) < 0,    (*)
        // then there is an alpha* in I such that
        //     psi(alpha*) <= psi(alpha_l),
        //     psi'(alpha*) = 0,
        // and alpha* belongs to T(mu).
        // In this case, alpha* satisfies the strong Wolfe conditions since mu < eta.
        // This theorem motivates us to first locate an interval I that satisfies (*), and then
        // refine I while selecting safeguarded trial values, driving the iterates towards a
        // minimizer of psi(alpha) (or to a point in T(mu)).
        //
        // Note that the sufficient decrease condition is essentially psi(alpha) <= 0, and we have psi(0) = 0.
        // Condition (*) means:
        //     1. alpha_l has smaller psi value.
        //     2. alpha_l satisfies the sufficient decrease condition.
        //     3. alpha_u - alpha_l is a descent direction for psi, so that psi(alpha) < psi(alpha_l) for all
        //        alpha in I sufficiently close to alpha_l.
        //
        // We start with I_0 = [0, Inf] and alpha_0 in Ia. For brevity of notation,
        // let current I_k be I = [I_lo, I_hi], and current alpha_k be step.
        // In [1], the updated interval I+ = [I+_lo, I+_hi] is determined by the following rule:
        //     * Case I: If psi(step) > psi(I_lo), then I+_lo = I_lo, I+_hi = step.
        //     * Case II: If psi(step) <= psi(I_lo) and psi'(step)(I_lo - step) > 0, then I+_lo = step, I+_hi = I_hi.
        //     * Case III: If psi(step) <= psi(I_lo) and psi'(step)(I_lo - step) < 0, then I+_lo = step, I+_hi = I_lo.
        //
        // In Case I and Case III, I+ satisfies (*), and all subsequent intervals also satisfy,
        // so for such cases we have bracketed some point alpha* in T(mu).
        // When Case II persists, we repeat the process, and also enforce a safeguarding rule:
        //     step+ in [min{delta_max * step, alpha_max}, alpha_max],
        // for some delta_max > 1. A concrete implementation that [1] uses is
        //     step+ = min{step + delta * (step - I_lo), alpha_max}
        // for some 1.1 <= delta <= 4. Under this selection of step+, alpha_max will eventually be used as
        // a trial value if the iteration does not go to Case I or Case III. Then we do a test:
        // 1. If psi(alpha_max) <= 0 and psi'(alpha_max) < 0, then we choose to return alpha_max as the final
        //    line search step, although it may not satisfy the Wolfe conditions.
        // 2. Otherwise, we continue the update process, and [1] shows that after a finite number of iterations,
        //    we will obtain an interval that satisfies (*).
        // As a short summary, using this update rule, after a finite number of iterations we will either have
        // an I_k that satisfies (*), or the line search stops at alpha_max (although in this case we may lose
        // a better point that satisfies strong Wolfe conditions).
        //
        // Now assume that we already have an I that satisfies (*), and then the 3-cases update rule
        // continues refining I, in the hope that I will finally be contained in Ia and gradually be shorter.
        // But there is a possibility that alpha_k is decreasing with psi(alpha_k) > 0 or psi'(alpha_k) >= 0,
        // but I_k never becomes a subset of Ia. To avoid such infinite loops, [1] enforces the safeguarding rule
        //     step+ in [alpha_min, max{delta_min * step, alpha_min}]
        // for some delta_min < 1, as long as alpha_min > 0 and
        //     psi(alpha_k) > 0 or psi'(alpha_k) >= 0
        // holds from the beginning. [1] shows that under this setting, after a finite number of iterations,
        // we either:
        //     1. obtain an interval I such that I is a subset of Ia, and I satisfies (*); or
        //     2. we use alpha_min as a trial value, and psi(alpha_min) > 0 or psi'(alpha_min) >= 0.
        // In the latter case, we choose to stop the line search at alpha_min.
        //
        // When we enter the phase that I is contained in Ia and I satisfies (*), we know that all subsequent
        // intervals remain having these properties. Then our task is to shrink the length of the interval
        // so that we can eventually find a point in I that satisfies the strong Wolfe condition. To make this
        // happen, in this phase we impose the following safeguarding rule: if the length of I does not
        // decrease by a factor delta < 1 after two trials, then a bisection step is used for the next iteration.
        // [1] uses delta = 0.66.
        //
        // Now the remaining problem is how to select the next iterate step+ after we obtain the updated
        // interval I+. In [1], it is based on information of (f_lo, f_hi, f_t) and (g_lo, g_hi, g_t),
        // where f_lo = f(I_lo), f_hi = f(I_hi), f_t = f(step), g_lo = f'(I_lo), g_hi = f'(I_hi), and g_t = f'(step).
        // f is initially set to psi, and if for some step we have psi(step) <= 0 and phi'(step) >= 0,
        // we set f to phi in subsequent iterations.
        //
        // First define:
        //     1. ac to be the minimizer of the cubic function that interpolates f_lo, f_t, g_lo, and g_t;
        //     2. aq to be the minimizer of the quadratic function that interpolates f_lo, f_t, and g_lo;
        //     3. as to be the minimizer of the quadratic function that interpolates f_lo, g_lo, and g_t.
        // Then the selection of step+ follows a 4-cases rule:
        //     1. f_t > f_lo:
        //        step+ = { ac,             if |ac - I_lo| < |aq - I_lo|,
        //                { (aq + ac) / 2}, otherwise.
        //     2. f_t <= f_lo and g_t * g_lo < 0:
        //        step+ = { ac, if |ac - step| >= |as - step|,
        //                { as, otherwise.
        //     3. f_t <= f_lo, g_t * g_lo >= 0, and |g_t| <= |g_l|:
        //        If ac exists and is at the correct direction, i.e., (ac - step) * (step - I_lo) > 0,
        //            step+ = { ac, if |ac - step| < |as - step|,
        //                    { as, otherwise.
        //        otherwise,
        //            step+ = as.
        //        Safeguarding:
        //            step+ = { min{step + delta * (I_hi - step), step+}, if step > I_lo,
        //                    { max{step + delta * (I_hi - step), step+}, otherwise,
        //        where delta < 1. [1] takes delta = 0.66.
        //     4. f_t <= f_lo, g_t * g_lo >= 0, and |g_t| > |g_l|:
        //        step+ is the minimizer of the cubic function that interpolates f_hi, f_t, g_hi, and g_t.
        //
        // Implementation note: the step values computed using the method above are just candidates, and
        // we must ensure that the safeguarding rules mentioned earlier are respected:
        //     1. In Case II, we set step+ = min{step + delta * (step - I_lo), alpha_max}.
        //     2. If alpha_min > 0 and psi(alpha_k) > 0 or psi'(alpha_k) >= 0 holds from the beginning, we clamp
        //        step+ to [alpha_min, max{delta_min * step, alpha_min}].
        //     3. When we enter the phase that I is contained in Ia and I satisfies (*), a bisection step is used
        //        if the length of I does not decrease by a factor delta < 1 after two trials.

        // Check the value of step
        const Scalar step_min = param.min_step;
        if (step <= Scalar(0))
            throw std::invalid_argument("'step' must be positive");
        if (step < step_min)
            throw std::invalid_argument("'step' is smaller than 'param.min_step'");
        if (step > step_max)
            throw std::invalid_argument("'step' exceeds 'step_max'");

        // Save the function value at the current x
        const Scalar fx_init = fx;
        // Projection of gradient on the search direction
        const Scalar dg_init = dg;

        // std::cout << "fx_init = " << fx_init << ", dg_init = " << dg_init << std::endl << std::endl;

        // Make sure d points to a descent direction
        if (dg_init >= Scalar(0))
            throw std::logic_error("the moving direction does not decrease the objective function value");

        // Tolerance for convergence test
        // Sufficient decrease
        const Scalar test_decr = param.ftol * dg_init;
        // Curvature
        const Scalar test_curv = -param.wolfe * dg_init;

        // The bracketing interval
        const Scalar Inf = std::numeric_limits<Scalar>::infinity();
        Scalar I_lo = Scalar(0), I_hi = Inf;
        Scalar fI_lo = Scalar(0), fI_hi = Inf;
        Scalar gI_lo = (Scalar(1) - param.ftol) * dg_init, gI_hi = Inf;
        Scalar psiI_lo = fI_lo;
        // We also need to save x and grad for step=I_lo, since we want to return the best
        // step size along the path when strong Wolfe condition is not met
        Vector x_lo = xp, grad_lo = grad;
        Scalar fx_lo = fx_init, dg_lo = dg_init;

        // Status variables
        bool bracketed = false;
        bool f_is_psi = true;
        bool use_step_min_safeguard = (step_min > Scalar(0));
        Scalar I_width = Inf;
        Scalar I_width_prev = Inf;
        int I_shrink_fail_count = 0;

        // Constants
        const Scalar delta_max = Scalar(1.1);
        const Scalar delta_min = Scalar(7) / Scalar(12);
        const Scalar shrink = Scalar(0.66);
        int iter;
        for (iter = 0; iter < param.max_linesearch; iter++)
        {
            // Function value and gradient at the current step size
            x.noalias() = xp + step * drt;
            fx = f(x, grad);
            dg = grad.dot(drt);

            // std::cout << "[min_step, max_step] = [" << step_min << ", " << step_max << "], step = " << step << ", fx = " << fx << ", dg = " << dg << std::endl;

            // phi(step)  = f(xp + step * drt) = fx
            // phi'(step) = g(xp + step * drt)^T d = dg
            // psi(step)  = f(xp + step * drt) - f(xp) - step * test_decr
            // psi'(step) = dg - test_decr
            const Scalar psit = fx - fx_init - step * test_decr;
            const Scalar dpsit = dg - test_decr;

            // std::cout << "psi(step) = " << psit << ", phi'(step) = " << dpsit << std::endl;

            // Convergence test
            if (psit <= Scalar(0) && abs(dg) <= test_curv)
            {
                // std::cout << "** Criteria met\n\n";
                // std::cout << "========================= Leaving line search =========================\n\n";
                return;
            }

            // Test whether step hits the boundaries and satisfies the exit conditions
            if (step <= step_min && (psit > Scalar(0) || dpsit >= Scalar(0)))
            {
                // std::cout << "** Exits at step_min\n\n";
                // std::cout << "========================= Leaving line search =========================\n\n";
                return;
            }
            if (step >= step_max && (psit <= Scalar(0) && dpsit < Scalar(0)))
            {
                // std::cout << "** Exits at step_max\n\n";
                // std::cout << "========================= Leaving line search =========================\n\n";
                return;
            }

            // Check and update the status of f_is_psi
            // f is initially set to psi, and is changed to phi in
            // subsequent iterations if psi(step) <= 0 and phi'(step) >= 0
            //
            // NOTE: empirically we find that using psi is usually better,
            // so for now we do not follow the implementation of [1]
            /*
            if (f_is_psi && (psit <= Scalar(0) && dg >= Scalar(0)))
            {
                f_is_psi = false;
            }
            */
            const Scalar ft = f_is_psi ? psit : fx;
            const Scalar gt = f_is_psi ? dpsit : dg;

            // Check and update the status of use_step_min_safeguard
            // We impose a safeguarding rule that guarantees testing
            // step_min if psi(alpha_k) > 0 or psi'(alpha_k) >= 0
            // holds from the beginning
            if (use_step_min_safeguard && (psit <= Scalar(0) && dpsit < Scalar(0)))
            {
                use_step_min_safeguard = false;
            }

            // Update new step
            Scalar new_step;
            const bool in_case_2 = (psit <= psiI_lo) && (dpsit * (I_lo - step) > Scalar(0));
            if (in_case_2)
            {
                // For Case 2, we apply the safeguarding rule
                // newat = min(at + delta * (at - al), amax), delta in [1.1, 4]
                new_step = (std::min)(step_max, step + delta_max * (step - I_lo));

                // We can also consider the following scheme:
                // First let step_selection() decide a value, and then project to the range above
                //
                // new_step = step_selection(I_lo, I_hi, step, fI_lo, fI_hi, ft, gI_lo, gI_hi, gt);
                // const Scalar delta2 = Scalar(4)
                // const Scalar t1 = step + delta * (step - I_lo);
                // const Scalar t2 = step + delta2 * (step - I_lo);
                // const Scalar tl = std::min(t1, t2), tu = std::max(t1, t2);
                // new_step = std::min(tu, std::max(tl, new_step));
                // new_step = std::min(step_max, new_step);
            }
            else
            {
                // For Case 1 and Case 3, use information of f and g to select new step
                new_step = step_selection(I_lo, I_hi, step, fI_lo, fI_hi, ft, gI_lo, gI_hi, gt);
                // Force new step in [step_min, step_max]
                new_step = (std::max)(new_step, step_min);
                new_step = (std::min)(new_step, step_max);

                // Apply safeguarding rule related to step_min when necessary:
                // step+ in [alpha_min, max{delta_min * step, alpha_min}]
                //
                // If use_step_min_safeguard = true, then new_step cannot be obtained
                // from Case 2, since in Case 2 we have
                // psi(alpha_k) <= 0 and psi'(alpha_k) < 0
                if (use_step_min_safeguard)
                {
                    const Scalar lower = step_min;
                    const Scalar upper = (std::max)(step_min, delta_min * step);
                    new_step = (std::max)(new_step, lower);
                    new_step = (std::min)(new_step, upper);
                }
            }

            // Update bracketing interval
            if (psit > psiI_lo)
            {
                // Case 1: psi(step) > psi(I_lo)
                I_hi = step;
                fI_hi = ft;
                gI_hi = gt;

                // std::cout << "** Case 1" << std::endl;
            }
            else if (in_case_2)
            {
                // Case 2: psi(step) <= psi(I_lo), psi'(step)(I_lo - step) > 0
                I_lo = step;
                fI_lo = ft;
                gI_lo = gt;
                psiI_lo = psit;
                // Move x and grad to x_lo and grad_lo, respectively
                x_lo.swap(x);
                grad_lo.swap(grad);
                fx_lo = fx;
                dg_lo = dg;

                // std::cout << "** Case 2" << std::endl;
            }
            else
            {
                // Case 3: psi(step) <= psi(I_lo), psi'(step)(I_lo - step) <= 0
                I_hi = I_lo;
                fI_hi = fI_lo;
                gI_hi = gI_lo;

                I_lo = step;
                fI_lo = ft;
                gI_lo = gt;
                psiI_lo = psit;
                // Move x and grad to x_lo and grad_lo, respectively
                x_lo.swap(x);
                grad_lo.swap(grad);
                fx_lo = fx;
                dg_lo = dg;

                // std::cout << "** Case 3" << std::endl;
            }

            // Check and update the status of bracketed
            // bracketed is true if we have entered Case 1 or Case 3,
            // and I is contained in [step_min, step_max]
            if ((!bracketed) && (!in_case_2))
            {
                const Scalar I_left = (std::min)(I_lo, I_hi);
                const Scalar I_right = (std::max)(I_lo, I_hi);
                bracketed = (I_left >= step_min && I_right <= step_max);
            }

            // If bracketed, enforce sufficient interval shrink; if not shrinking enough, use bisection
            if (bracketed)
            {
                I_width_prev = I_width;
                I_width = abs(I_hi - I_lo);
                // Test interval shrinkage
                if (I_width_prev < Inf && I_width > shrink * I_width_prev)
                {
                    I_shrink_fail_count += 1;
                }
                else
                {
                    I_shrink_fail_count = 0;
                }
                // If interval fails to shrink enough twice, select new_step using bisection
                if (I_shrink_fail_count >= 2)
                {
                    new_step = (I_lo + I_hi) / Scalar(2);
                    I_shrink_fail_count = 0;
                }
            }

            // Set the new_step
            step = new_step;

            // std::cout << "[I+_lo, I+_hi] = [" << I_lo << ", " << I_hi << "], step+ = " << step << std::endl << std::endl;
        }

        // If we have used up all line search iterations, then the strong Wolfe condition
        // is not met. We choose not to raise an exception, but to return the best
        // step size so far
        if (iter >= param.max_linesearch)
        {
            // FIX: the std::cout debug print below was accidentally left uncommented;
            // it wrote to stdout of every library user, and <iostream> is not even
            // included by this header. Keep it commented out like the other traces.
            // std::cout << "** Maximum step size reached\n\n";
            // std::cout << "========================= Leaving line search =========================\n\n";

            // Return everything with _lo
            step = I_lo;
            fx = fx_lo;
            dg = dg_lo;
            // Move {x, grad}_lo back
            x.swap(x_lo);
            grad.swap(grad_lo);
        }
    }
617};
618
619} // namespace LBFGSpp
620
621#endif // LBFGSPP_LINE_SEARCH_MORE_THUENTE_H
// static void LineSearch(Foo& f, const SolverParam& param, const Vector& xp, const Vector& drt, const Scalar& step_max, Scalar& step, Scalar& fx, Vector& grad, Scalar& dg, Vector& x)