SHOGUN 6.1.3
lbfgs.cpp
/*
 * Limited memory BFGS (L-BFGS).
 *
 * Copyright (c) 1990, Jorge Nocedal
 * Copyright (c) 2007-2010 Naoaki Okazaki
 * All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* $Id$ */

/*
This library is a C port of the FORTRAN implementation of the Limited-memory
Broyden-Fletcher-Goldfarb-Shanno (L-BFGS) method written by Jorge Nocedal.
The original FORTRAN source code is available at:
http://www.ece.northwestern.edu/~nocedal/lbfgs.html

The L-BFGS algorithm is described in:
    - Jorge Nocedal.
      Updating Quasi-Newton Matrices with Limited Storage.
      <i>Mathematics of Computation</i>, Vol. 35, No. 151, pp. 773--782, 1980.
    - Dong C. Liu and Jorge Nocedal.
      On the limited memory BFGS method for large scale optimization.
      <i>Mathematical Programming</i> B, Vol. 45, No. 3, pp. 503-528, 1989.

The line search algorithms used in this implementation are described in:
    - John E. Dennis and Robert B. Schnabel.
      <i>Numerical Methods for Unconstrained Optimization and Nonlinear
      Equations</i>, Englewood Cliffs, 1983.
    - Jorge J. More and David J. Thuente.
      Line search algorithm with guaranteed sufficient decrease.
      <i>ACM Transactions on Mathematical Software (TOMS)</i>, Vol. 20, No. 3,
      pp. 286-307, 1994.

This library also implements the Orthant-Wise Limited-memory Quasi-Newton
(OWL-QN) method presented in:
    - Galen Andrew and Jianfeng Gao.
      Scalable training of L1-regularized log-linear models.
      In <i>Proceedings of the 24th International Conference on Machine
      Learning (ICML 2007)</i>, pp. 33-40, 2007.

I would like to thank the original author, Jorge Nocedal, who has been
distributing the efficient and explanatory implementation under an open
source licence.
*/
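
/*
Editor's usage sketch (not part of the original sources). The public entry
point is lbfgs(); a minimal call supplies only the evaluation callback and
passes NULL for the optional arguments, in which case the built-in defaults
(_defparam below) are used. The quadratic objective here is purely
illustrative:

    static float64_t my_evaluate(void* instance, const float64_t* x,
        float64_t* g, const int n, const float64_t step)
    {
        float64_t fx = 0.;
        for (int i = 0; i < n; ++i) {
            fx += (x[i] - 1.) * (x[i] - 1.);   // f(x) = sum_i (x_i - 1)^2
            g[i] = 2. * (x[i] - 1.);           // gradient of f
        }
        return fx;
    }

    float64_t x[2] = {0., 0.};
    float64_t fx = 0.;
    int32_t status = lbfgs(2, x, &fx, my_evaluate,
        NULL,      // progress callback (optional)
        NULL,      // user instance pointer passed back to the callbacks
        NULL,      // parameters; NULL selects the defaults
        NULL);     // step-adjustment callback (optional)
*/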

#include <algorithm>
#include <cstdio>
#include <cmath>
#include <string.h>
#include <vector>

#include <shogun/lib/SGVector.h>
#include <shogun/lib/common.h>
#include <shogun/lib/memory.h>
#include <shogun/mathematics/Math.h>
#include <shogun/mathematics/linalg/LinalgNamespace.h>
#include <shogun/optimization/lbfgs/lbfgs.h>

namespace shogun
{

#define min2(a, b) ((a) <= (b) ? (a) : (b))
#define max2(a, b) ((a) >= (b) ? (a) : (b))
#define max3(a, b, c) max2(max2((a), (b)), (c));

struct tag_callback_data {
    int32_t n;
    void *instance;
    lbfgs_evaluate_t proc_evaluate;
    lbfgs_progress_t proc_progress;
    lbfgs_adjust_step_t proc_adjust_step;
};
typedef struct tag_callback_data callback_data_t;

struct tag_iteration_data {
    float64_t alpha;
    SGVector<float64_t> s;  /* [n] */
    SGVector<float64_t> y;  /* [n] */
    float64_t ys;           /* vecdot(y, s) */
};
typedef struct tag_iteration_data iteration_data_t;

static const lbfgs_parameter_t _defparam = {
    6, 1e-5, 0, 0.0,
    LBFGS_LINESEARCH_DEFAULT, 40,
    1e-20, 1e20, 1e-4, 0.9, 0.9, 1.0e-16,
    0.0, 0, 1,
};

/* Forward function declarations. */

typedef int32_t (*line_search_proc)(
    int32_t n,
    float64_t *x,
    float64_t *f,
    SGVector<float64_t>& g,
    SGVector<float64_t>& s,
    float64_t *stp,
    const SGVector<float64_t>& xp,
    const SGVector<float64_t>& gp,
    SGVector<float64_t>& wa,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    );

static int32_t line_search_backtracking(
    int32_t n,
    float64_t *x,
    float64_t *f,
    SGVector<float64_t>& g,
    SGVector<float64_t>& s,
    float64_t *stp,
    const SGVector<float64_t>& xp,
    const SGVector<float64_t>& gp,
    SGVector<float64_t>& wa,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    );

static int32_t line_search_backtracking_owlqn(
    int32_t n,
    float64_t *x,
    float64_t *f,
    SGVector<float64_t>& g,
    SGVector<float64_t>& s,
    float64_t *stp,
    const SGVector<float64_t>& xp,
    const SGVector<float64_t>& gp,
    SGVector<float64_t>& wa,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    );

static int32_t line_search_morethuente(
    int32_t n,
    float64_t *x,
    float64_t *f,
    SGVector<float64_t>& g,
    SGVector<float64_t>& s,
    float64_t *stp,
    const SGVector<float64_t>& xp,
    const SGVector<float64_t>& gp,
    SGVector<float64_t>& wa,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    );

static int32_t update_trial_interval(
    float64_t *x,
    float64_t *fx,
    float64_t *dx,
    float64_t *y,
    float64_t *fy,
    float64_t *dy,
    float64_t *t,
    float64_t *ft,
    float64_t *dt,
    const float64_t tmin,
    const float64_t tmax,
    int32_t *brackt
    );

static float64_t owlqn_x1norm(
    const float64_t* x,
    const int32_t start,
    const int32_t n
    );

static void owlqn_pseudo_gradient(
    float64_t* pg,
    const float64_t* x,
    const float64_t* g,
    const int32_t n,
    const float64_t c,
    const int32_t start,
    const int32_t end
    );

static void owlqn_project(
    float64_t* d,
    const float64_t* sign,
    const int32_t start,
    const int32_t end
    );

void lbfgs_parameter_init(lbfgs_parameter_t *param)
{
    sg_memcpy(param, &_defparam, sizeof(*param));
}
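
/*
Editor's sketch of typical parameter customization (field names as used by
the validation code in lbfgs() below). Note that when orthantwise_c != 0,
only the backtracking line search is accepted:

    lbfgs_parameter_t param;
    lbfgs_parameter_init(&param);        // start from _defparam
    param.m = 10;                        // number of correction pairs kept
    param.epsilon = 1e-6;                // tolerance for |g| / max(1, |x|)
    param.orthantwise_c = 1.0;           // L1 weight; enables OWL-QN
    param.linesearch = LBFGS_LINESEARCH_BACKTRACKING;
*/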

int32_t lbfgs(
    int32_t n,
    float64_t *x,
    float64_t *ptr_fx,
    lbfgs_evaluate_t proc_evaluate,
    lbfgs_progress_t proc_progress,
    void *instance,
    lbfgs_parameter_t *_param,
    lbfgs_adjust_step_t proc_adjust_step
    )
{
    int32_t ret;
    int32_t i, j, k, ls, end, bound;
    float64_t step;

    /* Constant parameters and their default values. */
    lbfgs_parameter_t param = (_param != NULL) ? (*_param) : _defparam;
    const int32_t m = param.m;

    std::vector<iteration_data_t> lm;
    std::vector<iteration_data_t>::iterator it;
    SGVector<float64_t> x_wrap(x, n, false);
    float64_t ys, yy;
    float64_t xnorm, gnorm, beta;
    float64_t fx = 0.;
    float64_t rate = 0.;
    line_search_proc linesearch = line_search_morethuente;

    /* Construct a callback data. */
    callback_data_t cd;
    cd.n = n;
    cd.instance = instance;
    cd.proc_evaluate = proc_evaluate;
    cd.proc_progress = proc_progress;
    cd.proc_adjust_step = proc_adjust_step;

    /* Check the input parameters for errors. */
    if (n <= 0) {
        return LBFGSERR_INVALID_N;
    }
    if (param.epsilon < 0.) {
        return LBFGSERR_INVALID_EPSILON;
    }
    if (param.past < 0) {
        return LBFGSERR_INVALID_TESTPERIOD;
    }
    if (param.delta < 0.) {
        return LBFGSERR_INVALID_DELTA;
    }
    if (param.min_step < 0.) {
        return LBFGSERR_INVALID_MINSTEP;
    }
    if (param.max_step < param.min_step) {
        return LBFGSERR_INVALID_MAXSTEP;
    }
    if (param.ftol < 0.) {
        return LBFGSERR_INVALID_FTOL;
    }
    if (param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE ||
        param.linesearch == LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
        if (param.wolfe <= param.ftol || 1. <= param.wolfe) {
            return LBFGSERR_INVALID_WOLFE;
        }
    }
    if (param.gtol < 0.) {
        return LBFGSERR_INVALID_GTOL;
    }
    if (param.xtol < 0.) {
        return LBFGSERR_INVALID_XTOL;
    }
    if (param.max_linesearch <= 0) {
        return LBFGSERR_INVALID_MAXLINESEARCH;
    }
    if (param.orthantwise_c < 0.) {
        return LBFGSERR_INVALID_ORTHANTWISE;
    }
    if (param.orthantwise_start < 0 || n < param.orthantwise_start) {
        return LBFGSERR_INVALID_ORTHANTWISE_START;
    }
    if (param.orthantwise_end < 0) {
        param.orthantwise_end = n;
    }
    if (n < param.orthantwise_end) {
        return LBFGSERR_INVALID_ORTHANTWISE_END;
    }
    if (param.orthantwise_c != 0.) {
        switch (param.linesearch) {
        case LBFGS_LINESEARCH_BACKTRACKING:
            linesearch = line_search_backtracking_owlqn;
            break;
        default:
            /* Only the backtracking method is available. */
            return LBFGSERR_INVALID_LINESEARCH;
        }
    } else {
        switch (param.linesearch) {
        case LBFGS_LINESEARCH_MORETHUENTE:
            linesearch = line_search_morethuente;
            break;
        case LBFGS_LINESEARCH_BACKTRACKING_ARMIJO:
        case LBFGS_LINESEARCH_BACKTRACKING_WOLFE:
        case LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE:
            linesearch = line_search_backtracking;
            break;
        default:
            return LBFGSERR_INVALID_LINESEARCH;
        }
    }

    /* Allocate working space. */
    SGVector<float64_t> xp, g, gp, d, w, pg, pf;
    try
    {
        xp = SGVector<float64_t>(n);
        g = SGVector<float64_t>(n);
        gp = SGVector<float64_t>(n);
        d = SGVector<float64_t>(n);
        w = SGVector<float64_t>(n);

        if (param.orthantwise_c != 0.) {
            /* Allocate working space for OW-LQN. */
            pg = SGVector<float64_t>(n);
        }

        /* Allocate limited memory storage. */
        lm.resize(m);

        /* Initialize the limited memory. */
        for (auto& e: lm)
        {
            e.alpha = 0;
            e.ys = 0;
            e.s = SGVector<float64_t>(n);
            e.y = SGVector<float64_t>(n);
        }

        /* Allocate an array for storing previous values of the objective function. */
        if (0 < param.past)
            pf = SGVector<float64_t>(param.past);
    }
    catch (const ShogunException& e)
    {
        ret = LBFGSERR_OUTOFMEMORY;
        goto lbfgs_exit;
    }

    /* Evaluate the function value and its gradient. */
    fx = cd.proc_evaluate(cd.instance, x, g.vector, cd.n, 0);
    if (0. != param.orthantwise_c) {
        /* Compute the L1 norm of the variables and add it to the objective value. */
        xnorm = owlqn_x1norm(x, param.orthantwise_start, param.orthantwise_end);
        fx += xnorm * param.orthantwise_c;
        owlqn_pseudo_gradient(
            pg.vector, x, g.vector, n,
            param.orthantwise_c, param.orthantwise_start, param.orthantwise_end
            );
    }

    /* Store the initial value of the objective function. */
    if (pf != NULL) {
        pf[0] = fx;
    }

    /*
        Compute the direction;
        we assume the initial hessian matrix H_0 as the identity matrix.
     */
    if (param.orthantwise_c == 0.) {
        sg_memcpy(d.vector, g.vector, n*sizeof(float64_t));
        linalg::scale(d, d, -1.0);
    } else {
        sg_memcpy(d.vector, pg.vector, n*sizeof(float64_t));
        linalg::scale(d, d, -1.0);
    }

    /*
        Make sure that the initial variables are not a minimizer.
     */
    xnorm = CMath::sqrt(linalg::dot(x_wrap, x_wrap));
    if (param.orthantwise_c == 0.) {
        gnorm = CMath::sqrt(linalg::dot(g, g));
    } else {
        gnorm = CMath::sqrt(linalg::dot(pg, pg));
    }
    if (xnorm < 1.0) xnorm = 1.0;
    if (gnorm / xnorm <= param.epsilon) {
        ret = LBFGS_ALREADY_MINIMIZED;
        goto lbfgs_exit;
    }

    /* Compute the initial step:
        step = 1.0 / sqrt(vecdot(d, d, n))
     */
    step = 1.0 / CMath::sqrt(linalg::dot(d, d));

    k = 1;
    end = 0;
    for (;;) {
        /* Store the current position and gradient vectors. */
        sg_memcpy(xp.vector, x, n*sizeof(float64_t));
        sg_memcpy(gp.vector, g.vector, n*sizeof(float64_t));

        /* Search for an optimal step. */
        if (param.orthantwise_c == 0.) {
            ls = linesearch(n, x, &fx, g, d, &step, xp, gp, w, &cd, &param);
        } else {
            ls = linesearch(n, x, &fx, g, d, &step, xp, pg, w, &cd, &param);
            owlqn_pseudo_gradient(
                pg.vector, x, g.vector, n,
                param.orthantwise_c, param.orthantwise_start, param.orthantwise_end
                );
        }
        if (ls < 0) {
            /* Revert to the previous point. */
            sg_memcpy(x, xp.vector, n*sizeof(float64_t));
            sg_memcpy(g.vector, gp.vector, n*sizeof(float64_t));
            ret = ls;

            /* Roll back */
            if (ls==LBFGSERR_INVALID_VALUE)
                fx = cd.proc_evaluate(cd.instance, x, g.vector, cd.n, step);

            goto lbfgs_exit;
        }

        /* Compute x and g norms. */
        xnorm = CMath::sqrt(linalg::dot(x_wrap, x_wrap));
        if (param.orthantwise_c == 0.) {
            gnorm = CMath::sqrt(linalg::dot(g, g));
        } else {
            gnorm = CMath::sqrt(linalg::dot(pg, pg));
        }

        /* Report the progress. */
        if (cd.proc_progress) {
            if ((ret = cd.proc_progress(cd.instance, x, g, fx, xnorm, gnorm, step, cd.n, k, ls))) {
                goto lbfgs_exit;
            }
        }

        /*
            Convergence test.
            The criterion is given by the following formula:
                |g(x)| / \max2(1, |x|) < \epsilon
         */
        if (xnorm < 1.0) xnorm = 1.0;
        if (gnorm / xnorm <= param.epsilon) {
            /* Convergence. */
            ret = LBFGS_SUCCESS;
            break;
        }

        /*
            Test for stopping criterion.
            The criterion is given by the following formula:
                (f(past_x) - f(x)) / f(x) < \delta
         */
        if (pf != NULL) {
            /* We don't test the stopping criterion while k < past. */
            if (param.past <= k) {
                /* Compute the relative improvement from the past. */
                rate = (pf[k % param.past] - fx) / fx;

                /* The stopping criterion. */
                if (rate < param.delta) {
                    ret = LBFGS_STOP;
                    break;
                }
            }

            /* Store the current value of the objective function. */
            pf[k % param.past] = fx;
        }

        if (param.max_iterations != 0 && param.max_iterations < k+1) {
            /* Maximum number of iterations. */
            ret = LBFGSERR_MAXIMUMITERATION;
            break;
        }

        /*
            Update vectors s and y:
                s_{k+1} = x_{k+1} - x_{k} = \step * d_{k}.
                y_{k+1} = g_{k+1} - g_{k}.
         */
        it = std::next(lm.begin(), end);
        linalg::add(x_wrap, xp, it->s, 1.0, -1.0);
        linalg::add(g, gp, it->y, 1.0, -1.0);

        /*
            Compute scalars ys and yy:
                ys = y^t \cdot s = 1 / \rho.
                yy = y^t \cdot y.
            Notice that yy is used for scaling the hessian matrix H_0 (Cholesky factor).
         */
        ys = linalg::dot(it->y, it->s);
        yy = linalg::dot(it->y, it->y);
        it->ys = ys;

        /*
            Recursive formula to compute dir = -(H \cdot g).
            This is described in page 779 of:
                Jorge Nocedal.
                Updating Quasi-Newton Matrices with Limited Storage.
                Mathematics of Computation, Vol. 35, No. 151,
                pp. 773--782, 1980.
         */
        bound = (m <= k) ? m : k;
        ++k;
        end = (end + 1) % m;

        /* Compute the steepest direction. */
        if (param.orthantwise_c == 0.) {
            /* Compute the negative of gradients. */
            sg_memcpy(d.vector, g.vector, n*sizeof(float64_t));
            linalg::scale(d, d, -1.0);
        } else {
            sg_memcpy(d.vector, pg.vector, n*sizeof(float64_t));
            linalg::scale(d, d, -1.0);
        }

        j = end;
        for (i = 0;i < bound;++i) {
            j = (j + m - 1) % m;    /* if (--j == -1) j = m-1; */
            it = std::next(lm.begin(), j);
            /* \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}. */
            it->alpha = linalg::dot(it->s, d) / it->ys;
            /* q_{i} = q_{i+1} - \alpha_{i} y_{i}. */
            linalg::add(d, it->y, d, 1.0, -(it->alpha));
        }

        linalg::scale(d, d, ys/yy);
        for (i = 0;i < bound;++i) {
            it = std::next(lm.begin(), j);
            /* \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}. */
            beta = linalg::dot(it->y, d) / it->ys;
            /* \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}. */
            linalg::add(d, it->s, d, 1.0, it->alpha-beta);
            j = (j + 1) % m;        /* if (++j == m) j = 0; */
        }
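
        /*
            Editor's note: the two loops above are the classic two-loop
            recursion (Nocedal 1980). With q initialized to the negative
            (pseudo-)gradient and rho_j = 1 / (y_j^T s_j):

                for j = newest .. oldest:
                    alpha_j = rho_j * s_j^T q;   q -= alpha_j * y_j
                q *= (y^T s) / (y^T y)           // scaling of H_0
                for j = oldest .. newest:
                    beta = rho_j * y_j^T q;      q += (alpha_j - beta) * s_j

            On exit q (stored in d) equals -H_k g, the quasi-Newton search
            direction, computed in O(m n) time and memory.
        */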

        /*
            Constrain the search direction for orthant-wise updates.
         */
        if (param.orthantwise_c != 0.) {
            for (i = param.orthantwise_start;i < param.orthantwise_end;++i) {
                if (d[i] * pg[i] >= 0) {
                    d[i] = 0;
                }
            }
        }

        /*
            Now the search direction d is ready. We try step = 1 first.
         */
        step = 1.0;
    }

lbfgs_exit:
    /* Return the final value of the objective function. */
    if (ptr_fx != NULL) {
        *ptr_fx = fx;
    }

    /* Free memory blocks used by this function. */
    lm.clear();

    return ret;
}


static int32_t line_search_backtracking(
    int32_t n,
    float64_t *x,
    float64_t *f,
    SGVector<float64_t>& g,
    SGVector<float64_t>& s,
    float64_t *stp,
    const SGVector<float64_t>& xp,
    const SGVector<float64_t>& gp,
    SGVector<float64_t>& wa,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    )
{
    int32_t count = 0;
    float64_t width, dg;
    float64_t finit, dginit = 0., dgtest;
    const float64_t dec = 0.5, inc = 2.1;

    /* Check the input parameters for errors. */
    if (*stp <= 0.) {
        return LBFGSERR_INVALID_PARAMETERS;
    }

    /* Compute the initial gradient in the search direction. */
    dginit = linalg::dot(g, s);

    /* Make sure that s points to a descent direction. */
    if (0 < dginit) {
        return LBFGSERR_INCREASEGRADIENT;
    }

    /* The initial value of the objective function. */
    finit = *f;
    dgtest = param->ftol * dginit;
    const index_t max_iter = 20;

    for (;;) {
        sg_memcpy(x, xp.vector, n*sizeof(float64_t));
        if (cd->proc_adjust_step)
            *stp=cd->proc_adjust_step(cd->instance, x, s.vector, cd->n, *stp);

        for(index_t j=0; j<n; j++)
        {
            if (CMath::is_nan(s[j]) || CMath::is_infinity(s[j]))
                return LBFGSERR_INVALID_VALUE;
        }

        SGVector<float64_t>::add(x, 1, x, *stp, s, n);
        float64_t decay=0.5;
        index_t iter=0;

        while(true)
        {
            /* Evaluate the function and gradient values. */
            *f = cd->proc_evaluate(cd->instance, x, g.vector, cd->n, *stp);
            ++count;
            if (CMath::is_nan(*f) || CMath::is_infinity(*f))
                *stp*=decay;
            else
                break;
            SGVector<float64_t>::add(x, 1, x, -1.0*(*stp), s, n);
            iter++;
            if (iter>max_iter)
                return LBFGSERR_INVALID_VALUE;
        }

        if (*f > finit + *stp * dgtest) {
            width = dec;
        } else {
            /* The sufficient decrease condition (Armijo condition). */
            if (param->linesearch == LBFGS_LINESEARCH_BACKTRACKING_ARMIJO) {
                /* Exit with the Armijo condition. */
                return count;
            }

            /* Check the Wolfe condition. */
            dg = linalg::dot(g, s);
            if (dg < param->wolfe * dginit) {
                width = inc;
            } else {
                if(param->linesearch == LBFGS_LINESEARCH_BACKTRACKING_WOLFE) {
                    /* Exit with the regular Wolfe condition. */
                    return count;
                }

                /* Check the strong Wolfe condition. */
                if(dg > -param->wolfe * dginit) {
                    width = dec;
                } else {
                    /* Exit with the strong Wolfe condition. */
                    return count;
                }
            }
        }

        if (*stp < param->min_step) {
            /* The step is the minimum value. */
            return LBFGSERR_MINIMUMSTEP;
        }
        if (*stp > param->max_step) {
            /* The step is the maximum value. */
            return LBFGSERR_MAXIMUMSTEP;
        }
        if (param->max_linesearch <= count) {
            /* Maximum number of iteration. */
            return LBFGSERR_MAXIMUMLINESEARCH;
        }

        (*stp) *= width;
    }
}
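
/*
Editor's note, for reference: with phi(t) = f(x + t s) and dginit = phi'(0),
the exit tests above implement the standard conditions
    sufficient decrease (Armijo):  phi(t) <= phi(0) + ftol * t * dginit
    curvature (Wolfe):             phi'(t) >= wolfe * dginit
    strong Wolfe:                  |phi'(t)| <= wolfe * |dginit|
The trial step is repeatedly scaled by dec to shrink or inc to grow until
the condition selected by param->linesearch holds.
*/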


static int32_t line_search_backtracking_owlqn(
    int32_t n,
    float64_t *x,
    float64_t *f,
    SGVector<float64_t>& g,
    SGVector<float64_t>& s,
    float64_t *stp,
    const SGVector<float64_t>& xp,
    const SGVector<float64_t>& gp,
    SGVector<float64_t>& wp,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    )
{
    int32_t i, count = 0;
    float64_t width = 0.5, norm = 0.;
    float64_t finit = *f, dgtest;

    /* Check the input parameters for errors. */
    if (*stp <= 0.) {
        return LBFGSERR_INVALID_PARAMETERS;
    }

    /* Choose the orthant for the new point. */
    for (i = 0;i < n;++i) {
        wp[i] = (xp[i] == 0.) ? -gp[i] : xp[i];
    }

    SGVector<float64_t> x_wrap(x, n, false);
    for (;;) {
        /* Update the current point. */
        sg_memcpy(x, xp.vector, n*sizeof(float64_t));
        if (cd->proc_adjust_step)
            *stp=cd->proc_adjust_step(cd->instance, x, s.vector, cd->n, *stp);
        linalg::add(x_wrap, s, x_wrap, 1.0, *stp);

        /* The current point is projected onto the orthant. */
        owlqn_project(x, wp.vector, param->orthantwise_start, param->orthantwise_end);

        /* Evaluate the function and gradient values. */
        *f = cd->proc_evaluate(cd->instance, x, g.vector, cd->n, *stp);

        /* Compute the L1 norm of the variables and add it to the objective value. */
        norm = owlqn_x1norm(x, param->orthantwise_start, param->orthantwise_end);
        *f += norm * param->orthantwise_c;

        ++count;

        dgtest = 0.;
        for (i = 0;i < n;++i) {
            dgtest += (x[i] - xp[i]) * gp[i];
        }

        if (*f <= finit + param->ftol * dgtest) {
            /* The sufficient decrease condition. */
            return count;
        }

        if (*stp < param->min_step) {
            /* The step is the minimum value. */
            return LBFGSERR_MINIMUMSTEP;
        }
        if (*stp > param->max_step) {
            /* The step is the maximum value. */
            return LBFGSERR_MAXIMUMSTEP;
        }
        if (param->max_linesearch <= count) {
            /* Maximum number of iteration. */
            return LBFGSERR_MAXIMUMLINESEARCH;
        }

        (*stp) *= width;
    }
}


static int32_t line_search_morethuente(
    int32_t n,
    float64_t *x,
    float64_t *f,
    SGVector<float64_t>& g,
    SGVector<float64_t>& s,
    float64_t *stp,
    const SGVector<float64_t>& xp,
    const SGVector<float64_t>& gp,
    SGVector<float64_t>& wa,
    callback_data_t *cd,
    const lbfgs_parameter_t *param
    )
{
    int32_t count = 0;
    int32_t brackt, stage1, uinfo = 0;
    float64_t dg;
    float64_t stx, fx, dgx;
    float64_t sty, fy, dgy;
    float64_t fxm, dgxm, fym, dgym, fm, dgm;
    float64_t finit, ftest1, dginit, dgtest;
    float64_t width, prev_width;
    float64_t stmin, stmax;

    /* Check the input parameters for errors. */
    if (*stp <= 0.) {
        return LBFGSERR_INVALID_PARAMETERS;
    }

    /* Compute the initial gradient in the search direction. */
    dginit = linalg::dot(g, s);

    /* Make sure that s points to a descent direction. */
    if (0 < dginit) {
        return LBFGSERR_INCREASEGRADIENT;
    }

    /* Initialize local variables. */
    brackt = 0;
    stage1 = 1;
    finit = *f;
    dgtest = param->ftol * dginit;
    width = param->max_step - param->min_step;
    prev_width = 2.0 * width;

    /*
        The variables stx, fx, dgx contain the values of the step,
        function, and directional derivative at the best step.
        The variables sty, fy, dgy contain the value of the step,
        function, and derivative at the other endpoint of
        the interval of uncertainty.
        The variables stp, f, dg contain the values of the step,
        function, and derivative at the current step.
    */
    stx = sty = 0.;
    fx = fy = finit;
    dgx = dgy = dginit;

    for (;;) {
        /*
            Set the minimum and maximum steps to correspond to the
            present interval of uncertainty.
         */
        if (brackt) {
            stmin = min2(stx, sty);
            stmax = max2(stx, sty);
        } else {
            stmin = stx;
            stmax = *stp + 4.0 * (*stp - stx);
        }

        /* Clip the step in the range of [stpmin, stpmax]. */
        if (*stp < param->min_step) *stp = param->min_step;
        if (param->max_step < *stp) *stp = param->max_step;

        /*
            If an unusual termination is to occur then let
            stp be the lowest point obtained so far.
         */
        if ((brackt && ((*stp <= stmin || stmax <= *stp) || param->max_linesearch <= count + 1 || uinfo != 0)) || (brackt && (stmax - stmin <= param->xtol * stmax))) {
            *stp = stx;
        }

        /*
            Compute the current value of x:
                x <- x + (*stp) * s.
         */
        sg_memcpy(x, xp.vector, n*sizeof(float64_t));
        if (cd->proc_adjust_step)
            *stp=cd->proc_adjust_step(cd->instance, x, s.vector, cd->n, *stp);

        SGVector<float64_t>::add(x, 1, x, *stp, s, n);

        /* Evaluate the function and gradient values. */
        *f = cd->proc_evaluate(cd->instance, x, g.vector, cd->n, *stp);

        dg = linalg::dot(g, s);

        ftest1 = finit + *stp * dgtest;
        ++count;

        /* Test for errors and convergence. */
        if (brackt && ((*stp <= stmin || stmax <= *stp) || uinfo != 0)) {
            /* Rounding errors prevent further progress. */
            return LBFGSERR_ROUNDING_ERROR;
        }
        if (*stp == param->max_step && *f <= ftest1 && dg <= dgtest) {
            /* The step is the maximum value. */
            return LBFGSERR_MAXIMUMSTEP;
        }
        if (*stp == param->min_step && (ftest1 < *f || dgtest <= dg)) {
            /* The step is the minimum value. */
            return LBFGSERR_MINIMUMSTEP;
        }
        if (brackt && (stmax - stmin) <= param->xtol * stmax) {
            /* Relative width of the interval of uncertainty is at most xtol. */
            return LBFGSERR_WIDTHTOOSMALL;
        }
        if (param->max_linesearch <= count) {
            /* Maximum number of iteration. */
            return LBFGSERR_MAXIMUMLINESEARCH;
        }
        if (*f <= ftest1 && fabs(dg) <= param->gtol * (-dginit)) {
            /* The sufficient decrease condition and the directional derivative condition hold. */
            return count;
        }

        /*
            In the first stage we seek a step for which the modified
            function has a nonpositive value and nonnegative derivative.
         */
        if (stage1 && *f <= ftest1 && min2(param->ftol, param->gtol) * dginit <= dg) {
            stage1 = 0;
        }

        /*
            A modified function is used to predict the step only if
            we have not obtained a step for which the modified
            function has a nonpositive function value and nonnegative
            derivative, and if a lower function value has been
            obtained but the decrease is not sufficient.
         */
        if (stage1 && ftest1 < *f && *f <= fx) {
            /* Define the modified function and derivative values. */
            fm = *f - *stp * dgtest;
            fxm = fx - stx * dgtest;
            fym = fy - sty * dgtest;
            dgm = dg - dgtest;
            dgxm = dgx - dgtest;
            dgym = dgy - dgtest;

            /*
                Call update_trial_interval() to update the interval of
                uncertainty and to compute the new step.
             */
            uinfo = update_trial_interval(
                &stx, &fxm, &dgxm,
                &sty, &fym, &dgym,
                stp, &fm, &dgm,
                stmin, stmax, &brackt
                );

            /* Reset the function and gradient values for f. */
            fx = fxm + stx * dgtest;
            fy = fym + sty * dgtest;
            dgx = dgxm + dgtest;
            dgy = dgym + dgtest;
        } else {
            /*
                Call update_trial_interval() to update the interval of
                uncertainty and to compute the new step.
             */
            uinfo = update_trial_interval(
                &stx, &fx, &dgx,
                &sty, &fy, &dgy,
                stp, f, &dg,
                stmin, stmax, &brackt
                );
        }

        /*
            Force a sufficient decrease in the interval of uncertainty.
         */
        if (brackt) {
            if (0.66 * prev_width <= fabs(sty - stx)) {
                *stp = stx + 0.5 * (sty - stx);
            }
            prev_width = width;
            width = fabs(sty - stx);
        }
    }

    return LBFGSERR_LOGICERROR;
}
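
/*
Editor's note on the "modified function" used in stage 1 above (More and
Thuente, 1994): with phi(t) = f(x + t s) the auxiliary function is
    psi(t) = phi(t) - phi(0) - ftol * t * phi'(0),
whose (uniformly shifted) values appear above as fm = *f - *stp * dgtest and
dgm = dg - dgtest. Working on psi first steers the iterate toward a step
that satisfies sufficient decrease before the interval is refined on phi
itself.
*/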


/**
 * Define the local variables for computing minimizers.
 */
#define USES_MINIMIZER \
    float64_t a, d, gamma, theta, p, q, r, s;

/**
 * Find a minimizer of an interpolated cubic function.
 *  @param  cm      The minimizer of the interpolated cubic.
 *  @param  u       The value of one point, u.
 *  @param  fu      The value of f(u).
 *  @param  du      The value of f'(u).
 *  @param  v       The value of another point, v.
 *  @param  fv      The value of f(v).
 *  @param  dv      The value of f'(v).
 */
#define CUBIC_MINIMIZER(cm, u, fu, du, v, fv, dv) \
    d = (v) - (u); \
    theta = ((fu) - (fv)) * 3 / d + (du) + (dv); \
    p = fabs(theta); \
    q = fabs(du); \
    r = fabs(dv); \
    s = max3(p, q, r); \
    /* gamma = s*sqrt((theta/s)**2 - (du/s) * (dv/s)) */ \
    a = theta / s; \
    gamma = s * sqrt(a * a - ((du) / s) * ((dv) / s)); \
    if ((v) < (u)) gamma = -gamma; \
    p = gamma - (du) + theta; \
    q = gamma - (du) + gamma + (dv); \
    r = p / q; \
    (cm) = (u) + r * d;
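
/*
Editor's note: for the cubic Hermite interpolant matching (u, fu, du) and
(v, fv, dv), the interior minimizer reduces to
    theta = 3*(fu - fv)/(v - u) + du + dv,
    gamma = sqrt(theta^2 - du*dv),
    cm    = u + (v - u) * (gamma - du + theta) / (2*gamma - du + dv),
where gamma is negated when v < u so that the correct root is chosen. The
division by s = max(|theta|, |du|, |dv|) above only guards the square root
against overflow; s is multiplied back immediately.
*/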

/**
 * Find a minimizer of an interpolated cubic function.
 *  @param  cm      The minimizer of the interpolated cubic.
 *  @param  u       The value of one point, u.
 *  @param  fu      The value of f(u).
 *  @param  du      The value of f'(u).
 *  @param  v       The value of another point, v.
 *  @param  fv      The value of f(v).
 *  @param  dv      The value of f'(v).
 *  @param  xmin    The minimum value.
 *  @param  xmax    The maximum value.
 */
#define CUBIC_MINIMIZER2(cm, u, fu, du, v, fv, dv, xmin, xmax) \
    d = (v) - (u); \
    theta = ((fu) - (fv)) * 3 / d + (du) + (dv); \
    p = fabs(theta); \
    q = fabs(du); \
    r = fabs(dv); \
    s = max3(p, q, r); \
    /* gamma = s*sqrt((theta/s)**2 - (du/s) * (dv/s)) */ \
    a = theta / s; \
    gamma = s * sqrt(max2(0, a * a - ((du) / s) * ((dv) / s))); \
    if ((u) < (v)) gamma = -gamma; \
    p = gamma - (dv) + theta; \
    q = gamma - (dv) + gamma + (du); \
    r = p / q; \
    if (r < 0. && gamma != 0.) { \
        (cm) = (v) - r * d; \
    } else if (a < 0) { \
        (cm) = (xmax); \
    } else { \
        (cm) = (xmin); \
    }

/**
 * Find a minimizer of an interpolated quadratic function.
 *  @param  qm      The minimizer of the interpolated quadratic.
 *  @param  u       The value of one point, u.
 *  @param  fu      The value of f(u).
 *  @param  du      The value of f'(u).
 *  @param  v       The value of another point, v.
 *  @param  fv      The value of f(v).
 */
#define QUARD_MINIMIZER(qm, u, fu, du, v, fv) \
    a = (v) - (u); \
    (qm) = (u) + (du) / (((fu) - (fv)) / a + (du)) / 2 * a;

/**
 * Find a minimizer of an interpolated quadratic function.
 *  @param  qm      The minimizer of the interpolated quadratic.
 *  @param  u       The value of one point, u.
 *  @param  du      The value of f'(u).
 *  @param  v       The value of another point, v.
 *  @param  dv      The value of f'(v).
 */
#define QUARD_MINIMIZER2(qm, u, du, v, dv) \
    a = (u) - (v); \
    (qm) = (v) + (dv) / ((dv) - (du)) * a;
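
/*
Editor's note: QUARD_MINIMIZER fits a parabola through (u, fu) and (v, fv)
with slope du at u and returns its stationary point. QUARD_MINIMIZER2 is the
secant step, the zero of the linear interpolant of the two derivatives:
    qm = v + dv * (u - v) / (dv - du),
which uses no function values. ("QUARD" preserves the spelling of the
original libLBFGS sources.)
*/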

#define fsigndiff(x, y) (*(x) * (*(y) / fabs(*(y))) < 0.)

/**
 * Update a safeguarded trial value and interval for line search.
 *
 *  The parameter x represents the step with the least function value.
 *  The parameter t represents the current step. This function assumes
 *  that the derivative at x is negative in the direction of the step.
 *  If brackt is set to true, the minimizer has been bracketed in
 *  an interval of uncertainty with endpoints between x and y.
 *
 *  @param  x       The pointer to the value of one endpoint.
 *  @param  fx      The pointer to the value of f(x).
 *  @param  dx      The pointer to the value of f'(x).
 *  @param  y       The pointer to the value of another endpoint.
 *  @param  fy      The pointer to the value of f(y).
 *  @param  dy      The pointer to the value of f'(y).
 *  @param  t       The pointer to the value of the trial value, t.
 *  @param  ft      The pointer to the value of f(t).
 *  @param  dt      The pointer to the value of f'(t).
 *  @param  tmin    The minimum value for the trial value, t.
 *  @param  tmax    The maximum value for the trial value, t.
 *  @param  brackt  The pointer to the predicate if the trial value is
 *                  bracketed.
 *  @retval int32_t Status value. Zero indicates a normal termination.
 *
 *  @see
 *      Jorge J. More and David J. Thuente. Line search algorithm with
 *      guaranteed sufficient decrease. ACM Transactions on Mathematical
 *      Software (TOMS), Vol. 20, No. 3, pp. 286-307, 1994.
 */
static int32_t update_trial_interval(
    float64_t *x,
    float64_t *fx,
    float64_t *dx,
    float64_t *y,
    float64_t *fy,
    float64_t *dy,
    float64_t *t,
    float64_t *ft,
    float64_t *dt,
    const float64_t tmin,
    const float64_t tmax,
    int32_t *brackt
    )
{
    int32_t bound;
    int32_t dsign = fsigndiff(dt, dx);
    float64_t mc;       /* minimizer of an interpolated cubic. */
    float64_t mq;       /* minimizer of an interpolated quadratic. */
    float64_t newt;     /* new trial value. */
    USES_MINIMIZER;     /* for CUBIC_MINIMIZER and QUARD_MINIMIZER. */

    /* Check the input parameters for errors. */
    if (*brackt) {
        if (*t <= min2(*x, *y) || max2(*x, *y) <= *t) {
            /* The trial value t is out of the interval. */
            return LBFGSERR_OUTOFINTERVAL;
        }
        if (0. <= *dx * (*t - *x)) {
            /* The function must decrease from x. */
            return LBFGSERR_INCREASEGRADIENT;
        }
        if (tmax < tmin) {
            /* Incorrect tmin and tmax specified. */
            return LBFGSERR_INCORRECT_TMINMAX;
        }
    }

    /*
        Trial value selection.
     */
    if (*fx < *ft) {
        /*
            Case 1: a higher function value.
            The minimum is brackt. If the cubic minimizer is closer
            to x than the quadratic one, the cubic one is taken, else
            the average of the minimizers is taken.
         */
        *brackt = 1;
        bound = 1;
        CUBIC_MINIMIZER(mc, *x, *fx, *dx, *t, *ft, *dt);
        QUARD_MINIMIZER(mq, *x, *fx, *dx, *t, *ft);
        if (fabs(mc - *x) < fabs(mq - *x)) {
            newt = mc;
        } else {
            newt = mc + 0.5 * (mq - mc);
        }
    } else if (dsign) {
        /*
            Case 2: a lower function value and derivatives of
            opposite sign. The minimum is brackt. If the cubic
            minimizer is closer to x than the quadratic (secant) one,
            the cubic one is taken, else the quadratic one is taken.
         */
        *brackt = 1;
        bound = 0;
        CUBIC_MINIMIZER(mc, *x, *fx, *dx, *t, *ft, *dt);
        QUARD_MINIMIZER2(mq, *x, *dx, *t, *dt);
        if (fabs(mc - *t) > fabs(mq - *t)) {
            newt = mc;
        } else {
            newt = mq;
        }
    } else if (fabs(*dt) < fabs(*dx)) {
        /*
            Case 3: a lower function value, derivatives of the
            same sign, and the magnitude of the derivative decreases.
            The cubic minimizer is only used if the cubic tends to
            infinity in the direction of the minimizer or if the minimum
            of the cubic is beyond t. Otherwise the cubic minimizer is
            defined to be either tmin or tmax. The quadratic (secant)
            minimizer is also computed and if the minimum is brackt
            then the minimizer closest to x is taken, else the one
            farthest away is taken.
         */
        bound = 1;
        CUBIC_MINIMIZER2(mc, *x, *fx, *dx, *t, *ft, *dt, tmin, tmax);
        QUARD_MINIMIZER2(mq, *x, *dx, *t, *dt);
        if (*brackt) {
            if (fabs(*t - mc) < fabs(*t - mq)) {
                newt = mc;
            } else {
                newt = mq;
            }
        } else {
            if (fabs(*t - mc) > fabs(*t - mq)) {
                newt = mc;
            } else {
                newt = mq;
            }
        }
    } else {
        /*
            Case 4: a lower function value, derivatives of the
            same sign, and the magnitude of the derivative does
            not decrease. If the minimum is not brackt, the step
            is either tmin or tmax, else the cubic minimizer is taken.
         */
        bound = 0;
        if (*brackt) {
            CUBIC_MINIMIZER(newt, *t, *ft, *dt, *y, *fy, *dy);
        } else if (*x < *t) {
            newt = tmax;
        } else {
            newt = tmin;
        }
    }

    /*
        Update the interval of uncertainty. This update does not
        depend on the new step or the case analysis above.

        - Case a: if f(x) < f(t),
            x <- x, y <- t.
        - Case b: if f(t) <= f(x) && f'(t)*f'(x) > 0,
            x <- t, y <- y.
        - Case c: if f(t) <= f(x) && f'(t)*f'(x) < 0,
            x <- t, y <- x.
     */
    if (*fx < *ft) {
        /* Case a */
        *y = *t;
        *fy = *ft;
        *dy = *dt;
    } else {
        /* Case c */
        if (dsign) {
            *y = *x;
            *fy = *fx;
            *dy = *dx;
        }
        /* Cases b and c */
        *x = *t;
        *fx = *ft;
        *dx = *dt;
    }

    /* Clip the new trial value in [tmin, tmax]. */
    if (tmax < newt) newt = tmax;
    if (newt < tmin) newt = tmin;

    /*
        Redefine the new trial value if it is close to the upper bound
        of the interval.
     */
    if (*brackt && bound) {
        mq = *x + 0.66 * (*y - *x);
        if (*x < *y) {
            if (mq < newt) newt = mq;
        } else {
            if (newt < mq) newt = mq;
        }
    }

    /* Return the new trial value. */
    *t = newt;
    return 0;
}


static float64_t owlqn_x1norm(
    const float64_t* x,
    const int32_t start,
    const int32_t n
    )
{
    int32_t i;
    float64_t norm = 0.;

    for (i = start;i < n;++i) {
        norm += fabs(x[i]);
    }

    return norm;
}

static void owlqn_pseudo_gradient(
    float64_t* pg,
    const float64_t* x,
    const float64_t* g,
    const int32_t n,
    const float64_t c,
    const int32_t start,
    const int32_t end
    )
{
    int32_t i;

    /* Copy the gradient for the variables outside the regularized range. */
    for (i = 0;i < start;++i) {
        pg[i] = g[i];
    }

    /* Compute the pseudo-gradients. */
    for (i = start;i < end;++i) {
        if (x[i] < 0.) {
            /* Differentiable. */
            pg[i] = g[i] - c;
        } else if (0. < x[i]) {
            /* Differentiable. */
            pg[i] = g[i] + c;
        } else {
            if (g[i] < -c) {
                /* Take the right partial derivative. */
                pg[i] = g[i] + c;
            } else if (c < g[i]) {
                /* Take the left partial derivative. */
                pg[i] = g[i] - c;
            } else {
                pg[i] = 0.;
            }
        }
    }

    for (i = end;i < n;++i) {
        pg[i] = g[i];
    }
}
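
/*
Editor's worked example: at a coordinate with x[i] = 0 the term c*|x[i]| is
not differentiable, so the pseudo-gradient picks the one-sided derivative
that permits descent. With c = 1:
    g[i] = -3.0  ->  pg[i] = g[i] + c = -2.0  (right derivative; move right)
    g[i] = +3.0  ->  pg[i] = g[i] - c = +2.0  (left derivative; move left)
    g[i] = +0.5  ->  pg[i] = 0                (0 is in [-c, c]: stationary)
*/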

static void owlqn_project(
    float64_t* d,
    const float64_t* sign,
    const int32_t start,
    const int32_t end
    )
{
    int32_t i;

    for (i = start;i < end;++i) {
        if (d[i] * sign[i] <= 0) {
            d[i] = 0;
        }
    }
}

}