SHOGUN  6.1.3
MultiLaplaceInferenceMethod.cpp
/*
 * Copyright (c) The Shogun Machine Learning Toolbox
 * Written (w) 2014 Wu Lin
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are those
 * of the authors and should not be interpreted as representing official policies,
 * either expressed or implied, of the Shogun Development Team.
 *
 *
 * Code adapted from
 * https://gist.github.com/yorkerlin/14ace49f2278f3859614
 * Gaussian Process Machine Learning Toolbox
 * http://www.gaussianprocess.org/gpml/code/matlab/doc/
 * and
 * GPstuff - Gaussian process models for Bayesian analysis
 * http://becs.aalto.fi/en/research/bayes/gpstuff/
 *
 * The reference pseudocode is Algorithm 3.3 of the GPML textbook.
 */
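/*
 * Editorial sketch (not in the original sources): following the notation of
 * GPML Algorithm 3.3, the mode finding implemented by update_alpha() below
 * proceeds roughly as
 *
 *   repeat until Psi stops decreasing:
 *     compute pi = softmax(f)                     (see get_dpi_helper)
 *     for each class c:
 *       L_c = cholesky(I + sqrt(pi_c)*K*sqrt(pi_c))
 *       E_c = sqrt(pi_c)*L_c^{-T}*L_c^{-1}*sqrt(pi_c)
 *     M = cholesky(sum_c E_c)
 *     b = (D - Pi*Pi')*f + y - pi                 (D = diag(pi))
 *     c = E*K*b
 *     dalpha = b - c + E*R*M^{-T}*(M^{-1}*(R'*c)) - alpha
 *     line search: alpha += x*dalpha, f = K*alpha + m
 *   with Psi(alpha) = alpha'*(f - m)/2 - log p(y|f)
 */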
#include <shogun/machine/gp/MultiLaplaceInferenceMethod.h>
#include <shogun/machine/gp/SoftMaxLikelihood.h>
#include <shogun/mathematics/Math.h>
#include <shogun/labels/MulticlassLabels.h>

#ifdef USE_GPL_SHOGUN
#include <shogun/lib/external/brent.h>
#endif //USE_GPL_SHOGUN
using namespace shogun;
using namespace Eigen;

namespace shogun
{

#ifndef DOXYGEN_SHOULD_SKIP_THIS
#ifdef USE_GPL_SHOGUN

/** Wrapper class used by Brent's minimizer to evaluate the objective
 * along the Newton search direction */
class CMultiPsiLine : public func_base
{
public:
    float64_t log_scale;
    MatrixXd K;
    VectorXd dalpha;
    VectorXd start_alpha;
    Map<VectorXd>* alpha;
    SGVector<float64_t>* dlp;
    SGVector<float64_t>* f;
    SGVector<float64_t>* m;
    CLikelihoodModel* lik;
    CLabels* lab;

    virtual double operator() (double x)
    {
        const index_t C=((CMulticlassLabels*)lab)->get_num_classes();
        const index_t n=((CMulticlassLabels*)lab)->get_num_labels();
        Map<VectorXd> eigen_f(f->vector, f->vlen);
        Map<VectorXd> eigen_m(m->vector, m->vlen);

        // compute alpha=alpha+x*dalpha and f=K*alpha+m
        (*alpha)=start_alpha+x*dalpha;

        float64_t result=0;
        for(index_t bl=0; bl<C; bl++)
        {
            eigen_f.block(bl*n,0,n,1)=K*alpha->block(bl*n,0,n,1)*CMath::exp(log_scale*2.0);
            result+=alpha->block(bl*n,0,n,1).dot(eigen_f.block(bl*n,0,n,1))/2.0;
            eigen_f.block(bl*n,0,n,1)+=eigen_m;
        }

        // get first derivative of the log likelihood wrt f
        (*dlp)=lik->get_log_probability_derivative_f(lab, (*f), 1);

        result-=SGVector<float64_t>::sum(lik->get_log_probability_f(lab, *f));

        return result;
    }
};
#endif //USE_GPL_SHOGUN

#endif /* DOXYGEN_SHOULD_SKIP_THIS */
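/*
 * Editorial note: CMultiPsiLine above evaluates the line-search objective
 *
 *   Psi(x) = alpha(x)'*(f(x) - m)/2 - log p(y|f(x)),
 *
 * where alpha(x) = start_alpha + x*dalpha and f(x) = K*alpha(x) + m. Brent's
 * method (local_min) minimizes it over the step size x in update_alpha().
 */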
CMultiLaplaceInferenceMethod::CMultiLaplaceInferenceMethod() : CLaplaceInference()
{
    init();
}

CMultiLaplaceInferenceMethod::CMultiLaplaceInferenceMethod(CKernel* kern,
    CFeatures* feat, CMeanFunction* m, CLabels* lab, CLikelihoodModel* mod)
    : CLaplaceInference(kern, feat, m, lab, mod)
{
    init();
}

void CMultiLaplaceInferenceMethod::init()
{
    m_iter=20;
    m_tolerance=1e-6;
    m_opt_tolerance=1e-10;
    m_opt_max=10;

    m_nlz=0;
    SG_ADD(&m_nlz, "nlz", "negative log marginal likelihood", MS_NOT_AVAILABLE);
    SG_ADD(&m_U, "U", "the matrix used to compute gradient wrt hyperparameters", MS_NOT_AVAILABLE);

    SG_ADD(&m_tolerance, "tolerance", "amount of tolerance for Newton's iterations", MS_NOT_AVAILABLE);
    SG_ADD(&m_iter, "iter", "max Newton's iterations", MS_NOT_AVAILABLE);
    SG_ADD(&m_opt_tolerance, "opt_tolerance", "amount of tolerance for Brent's minimization method", MS_NOT_AVAILABLE);
    SG_ADD(&m_opt_max, "opt_max", "max iterations for Brent's minimization method", MS_NOT_AVAILABLE);
}
CMultiLaplaceInferenceMethod::~CMultiLaplaceInferenceMethod()
{
}

void CMultiLaplaceInferenceMethod::check_members() const
{
    CInference::check_members();

    REQUIRE(m_labels->get_label_type()==LT_MULTICLASS,
        "Labels must be of type CMulticlassLabels\n");
    REQUIRE(m_model->supports_multiclass(),
        "Likelihood model should support multi-classification\n");
}

SGVector<float64_t> CMultiLaplaceInferenceMethod::get_diagonal_vector()
{
    if (parameter_hash_changed())
        update();

    get_dpi_helper();

    return SGVector<float64_t>(m_W);
}

float64_t CMultiLaplaceInferenceMethod::get_negative_log_marginal_likelihood()
{
    if (parameter_hash_changed())
        update();

    return m_nlz;
}
SGVector<float64_t> CMultiLaplaceInferenceMethod::get_derivative_wrt_likelihood_model(
    const TParameter* param)
{
    //the SoftMax likelihood does not have this kind of derivative
    SG_ERROR("Not Implemented!\n");
    return SGVector<float64_t> ();
}

CMultiLaplaceInferenceMethod* CMultiLaplaceInferenceMethod::obtain_from_generic(
    CInference* inference)
{
    if (inference==NULL)
        return NULL;

    if (inference->get_inference_type()!=INF_LAPLACE_MULTIPLE)
        SG_SERROR("Provided inference is not of type CMultiLaplaceInferenceMethod!\n")

    SG_REF(inference);
    return (CMultiLaplaceInferenceMethod*)inference;
}
void CMultiLaplaceInferenceMethod::update_approx_cov()
{
    //Sigma=K-K*(E-E*R(M*M')^{-1}*R'*E)*K
    const index_t C=((CMulticlassLabels*)m_labels)->get_num_classes();
    const index_t n=m_labels->get_num_labels();
    Map<MatrixXd> eigen_M(m_L.matrix, m_L.num_rows, m_L.num_cols);
    Map<MatrixXd> eigen_E(m_E.matrix, m_E.num_rows, m_E.num_cols);
    Map<MatrixXd> eigen_K(m_ktrtr.matrix, m_ktrtr.num_rows, m_ktrtr.num_cols);

    m_Sigma=SGMatrix<float64_t>(C*n, C*n);
    Map<MatrixXd> eigen_Sigma(m_Sigma.matrix, m_Sigma.num_rows, m_Sigma.num_cols);
    eigen_Sigma.fill(0);

    MatrixXd eigen_U(C*n,n);
    for(index_t bl=0; bl<C; bl++)
    {
        eigen_U.block(bl*n,0,n,n)=eigen_K*CMath::exp(m_log_scale*2.0)*eigen_E.block(0,bl*n,n,n);
        eigen_Sigma.block(bl*n,bl*n,n,n)=(MatrixXd::Identity(n,n)-eigen_U.block(bl*n,0,n,n))*(eigen_K*CMath::exp(m_log_scale*2.0));
    }
    MatrixXd eigen_V=eigen_M.triangularView<Upper>().adjoint().solve(eigen_U.transpose());
    eigen_Sigma+=eigen_V.transpose()*eigen_V;
}
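/*
 * Editorial note: the loop above fills the block-diagonal part
 * K - K*E_c*K of the posterior covariance, and the final rank correction
 * V'*V with V = M^{-T}*U' adds back K*E*R*(M*M')^{-1}*R'*E*K, where
 * R' = [I_n ... I_n] stacks identity blocks over the classes. Together this
 * matches the expression Sigma=K-K*(E-E*R(M*M')^{-1}*R'*E)*K quoted at the
 * top of the function.
 */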
void CMultiLaplaceInferenceMethod::update_chol()
{
}

void CMultiLaplaceInferenceMethod::get_dpi_helper()
{
    const index_t C=((CMulticlassLabels*)m_labels)->get_num_classes();
    const index_t n=m_labels->get_num_labels();
    Map<VectorXd> eigen_dpi(m_W.vector, m_W.vlen);
    Map<MatrixXd> eigen_dpi_matrix(eigen_dpi.data(),n,C);

    Map<VectorXd> eigen_mu(m_mu, m_mu.vlen);
    Map<MatrixXd> eigen_mu_matrix(eigen_mu.data(),n,C);
    // with the log_sum_exp trick
    VectorXd max_coeff=eigen_mu_matrix.rowwise().maxCoeff();
    eigen_dpi_matrix=eigen_mu_matrix.array().colwise()-max_coeff.array();
    VectorXd log_sum_exp=((eigen_dpi_matrix.array().exp()).rowwise().sum()).array().log();
    eigen_dpi_matrix=(eigen_dpi_matrix.array().colwise()-log_sum_exp.array()).exp();

    // without the log_sum_exp trick
    //eigen_dpi_matrix=eigen_mu_matrix.array().exp();
    //VectorXd tmp_for_dpi=eigen_dpi_matrix.rowwise().sum();
    //eigen_dpi_matrix=eigen_dpi_matrix.array().colwise()/tmp_for_dpi.array();
}
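/*
 * Editorial note: get_dpi_helper() evaluates the softmax probabilities
 *
 *   pi_{i,c} = exp(f_{i,c}) / sum_k exp(f_{i,k})
 *
 * in a numerically stable way: subtracting the row-wise maximum before
 * exponentiating leaves the ratio unchanged but keeps the arguments of
 * exp() non-positive, so no overflow can occur. The commented-out lines
 * show the naive variant without the log_sum_exp trick.
 */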
void CMultiLaplaceInferenceMethod::update_alpha()
{
    float64_t Psi_Old=CMath::INFTY;
    float64_t Psi_New;
    float64_t Psi_Def;
    const index_t C=((CMulticlassLabels*)m_labels)->get_num_classes();
    const index_t n=m_labels->get_num_labels();

    // get mean vector and create eigen representation of it
    SGVector<float64_t> mean=m_mean->get_mean_vector(m_features);
    Map<VectorXd> eigen_mean_bl(mean.vector, mean.vlen);
    VectorXd eigen_mean=eigen_mean_bl.replicate(C,1);

    // create eigen representation of kernel matrix
    Map<MatrixXd> eigen_ktrtr(m_ktrtr.matrix, m_ktrtr.num_rows, m_ktrtr.num_cols);

    // create shogun and eigen representation of function vector
    m_mu=SGVector<float64_t>(mean.vlen*C);
    Map<VectorXd> eigen_mu(m_mu, m_mu.vlen);

    // f = mean as default value
    eigen_mu=eigen_mean;

    Psi_Def=-SGVector<float64_t>::sum(m_model->get_log_probability_f(
        m_labels, m_mu));

    if (m_alpha.vlen!=C*n)
    {
        // set alpha to a zero vector
        m_alpha=SGVector<float64_t>(C*n);
        m_alpha.zero();
        Psi_New=Psi_Def;
        m_E=SGMatrix<float64_t>(n,C*n);
        m_L=SGMatrix<float64_t>(n,n);
        m_W=SGVector<float64_t>(C*n);
    }
    else
    {
        Map<VectorXd> alpha(m_alpha.vector, m_alpha.vlen);
        for(index_t bl=0; bl<C; bl++)
            eigen_mu.block(bl*n,0,n,1)=eigen_ktrtr*CMath::exp(m_log_scale*2.0)*alpha.block(bl*n,0,n,1);

        //alpha'*(f-m)/2.0
        Psi_New=alpha.dot(eigen_mu)/2.0;
        // compute f = K * alpha + m
        eigen_mu+=eigen_mean;

        Psi_New-=SGVector<float64_t>::sum(m_model->get_log_probability_f(m_labels, m_mu));

        // if the default is better, then use it
        if (Psi_Def < Psi_New)
        {
            m_alpha.zero();
            eigen_mu=eigen_mean;
            Psi_New=Psi_Def;
        }
    }

    Map<VectorXd> eigen_alpha(m_alpha.vector, m_alpha.vlen);
    Map<VectorXd> eigen_W(m_W.vector, m_W.vlen);
    Map<MatrixXd> eigen_L(m_L.matrix, m_L.num_rows, m_L.num_cols);
    Map<MatrixXd> eigen_E(m_E.matrix, m_E.num_rows, m_E.num_cols);

    // get first derivative of log probability function
    m_dlp=m_model->get_log_probability_derivative_f(m_labels, m_mu, 1);

    index_t iter=0;
    Map<MatrixXd>& eigen_M=eigen_L;

    while (Psi_Old-Psi_New>m_tolerance && iter<m_iter)
    {
        Map<VectorXd> eigen_dlp(m_dlp.vector, m_dlp.vlen);

        get_dpi_helper();
        Map<VectorXd> eigen_dpi(m_W.vector, m_W.vlen);

        Psi_Old=Psi_New;
        iter++;

        m_nlz=0;

        for(index_t bl=0; bl<C; bl++)
        {
            VectorXd eigen_sD=eigen_dpi.block(bl*n,0,n,1).cwiseSqrt();
            LLT<MatrixXd> chol_tmp((eigen_sD*eigen_sD.transpose()).cwiseProduct(eigen_ktrtr*CMath::exp(m_log_scale*2.0))+
                MatrixXd::Identity(m_ktrtr.num_rows, m_ktrtr.num_cols));
            MatrixXd eigen_L_tmp=chol_tmp.matrixU();
            MatrixXd eigen_E_bl=eigen_L_tmp.triangularView<Upper>().adjoint().solve(MatrixXd(eigen_sD.asDiagonal()));
            eigen_E_bl=eigen_E_bl.transpose()*eigen_E_bl;
            eigen_E.block(0,bl*n,n,n)=eigen_E_bl;
            if (bl==0)
                eigen_M=eigen_E_bl;
            else
                eigen_M+=eigen_E_bl;
            m_nlz+=eigen_L_tmp.diagonal().array().log().sum();
        }

        LLT<MatrixXd> chol_tmp(eigen_M);
        eigen_M=chol_tmp.matrixU();
        m_nlz+=eigen_M.diagonal().array().log().sum();

        VectorXd eigen_b=eigen_dlp;
        Map<VectorXd>& tmp1=eigen_dlp;
        tmp1=eigen_dpi.array()*(eigen_mu-eigen_mean).array();
        Map<MatrixXd> m_tmp(tmp1.data(),n,C);
        VectorXd tmp2=m_tmp.array().rowwise().sum();

        for(index_t bl=0; bl<C; bl++)
            eigen_b.block(bl*n,0,n,1)+=eigen_dpi.block(bl*n,0,n,1).cwiseProduct(eigen_mu.block(bl*n,0,n,1)-eigen_mean_bl-tmp2);

        Map<VectorXd>& eigen_c=eigen_W;
        for(index_t bl=0; bl<C; bl++)
            eigen_c.block(bl*n,0,n,1)=eigen_E.block(0,bl*n,n,n)*(eigen_ktrtr*CMath::exp(m_log_scale*2.0)*eigen_b.block(bl*n,0,n,1));

        Map<MatrixXd> c_tmp(eigen_c.data(),n,C);

        VectorXd tmp3=c_tmp.array().rowwise().sum();
        VectorXd tmp4=eigen_M.triangularView<Upper>().adjoint().solve(tmp3);

        VectorXd& eigen_dalpha=eigen_b;
        eigen_dalpha+=eigen_E.transpose()*(eigen_M.triangularView<Upper>().solve(tmp4))-eigen_c-eigen_alpha;

#ifdef USE_GPL_SHOGUN
        // perform Brent's optimization along the Newton direction
        CMultiPsiLine func;

        func.log_scale=m_log_scale;
        func.K=eigen_ktrtr;
        func.dalpha=eigen_dalpha;
        func.start_alpha=eigen_alpha;
        func.alpha=&eigen_alpha;
        func.dlp=&m_dlp;
        func.f=&m_mu;
        func.m=&mean;
        func.lik=m_model;
        func.lab=m_labels;

        float64_t x;
        Psi_New=local_min(0, m_opt_max, m_opt_tolerance, func, x);
#else
        SG_GPL_ONLY
#endif //USE_GPL_SHOGUN
        m_nlz+=Psi_New;
    }

    if (Psi_Old-Psi_New>m_tolerance && iter>=m_iter)
    {
        SG_WARNING("Max iterations (%d) reached, but convergence level (%f) is not yet below tolerance (%f)\n", m_iter, Psi_Old-Psi_New, m_tolerance);
    }
}
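/*
 * Editorial reading of the loop above: per Newton step the negative log
 * marginal likelihood is accumulated as
 *
 *   m_nlz = Psi(alpha) + 1/2*sum_c log|B_c| + 1/2*log|sum_c E_c|,
 *
 * with B_c = I + sqrt(pi_c)*K*sqrt(pi_c); each half log-determinant is the
 * sum of the log-diagonal of the corresponding upper Cholesky factor.
 */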
void CMultiLaplaceInferenceMethod::update_deriv()
{
    const index_t C=((CMulticlassLabels*)m_labels)->get_num_classes();
    const index_t n=m_labels->get_num_labels();
    m_U=SGMatrix<float64_t>(n, n*C);
    Map<MatrixXd> eigen_U(m_U.matrix, m_U.num_rows, m_U.num_cols);
    Map<MatrixXd> eigen_M(m_L.matrix, m_L.num_rows, m_L.num_cols);
    Map<MatrixXd> eigen_E(m_E.matrix, m_E.num_rows, m_E.num_cols);
    eigen_U=eigen_M.triangularView<Upper>().adjoint().solve(eigen_E);
}

float64_t CMultiLaplaceInferenceMethod::get_derivative_helper(SGMatrix<float64_t> dK)
{
    Map<MatrixXd> eigen_dK(dK.matrix, dK.num_rows, dK.num_cols);
    const index_t C=((CMulticlassLabels*)m_labels)->get_num_classes();
    const index_t n=m_labels->get_num_labels();
    Map<MatrixXd> eigen_U(m_U.matrix, m_U.num_rows, m_U.num_cols);
    Map<MatrixXd> eigen_E(m_E.matrix, m_E.num_rows, m_E.num_cols);
    Map<VectorXd> eigen_alpha(m_alpha.vector, m_alpha.vlen);
    float64_t result=0;
    //currently only the explicit term is computed
    for(index_t bl=0; bl<C; bl++)
    {
        result+=((eigen_E.block(0,bl*n,n,n)-eigen_U.block(0,bl*n,n,n).transpose()*eigen_U.block(0,bl*n,n,n)).array()
            *eigen_dK.array()).sum();
        result-=(eigen_dK*eigen_alpha.block(bl*n,0,n,1)).dot(eigen_alpha.block(bl*n,0,n,1));
    }

    return result/2.0;
}
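/*
 * Editorial note: for a hyperparameter theta, the explicit gradient term
 * computed by get_derivative_helper() is
 *
 *   d(-log Z)/dtheta = 1/2*sum_c tr[(E_c - U_c'*U_c)*dK/dtheta]
 *                      - 1/2*sum_c alpha_c'*(dK/dtheta)*alpha_c,
 *
 * where U = M^{-T}*E (see update_deriv above), so E_c - U_c'*U_c equals
 * E_c - E_c*(M*M')^{-1}*E_c. The implicit term, arising because the mode
 * itself depends on theta, is not computed here, as the comment in the
 * function notes.
 */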
SGVector<float64_t> CMultiLaplaceInferenceMethod::get_derivative_wrt_inference_method(
    const TParameter* param)
{
    REQUIRE(!strcmp(param->m_name, "log_scale"), "Can't compute derivative of "
        "the negative log marginal likelihood wrt %s.%s parameter\n",
        get_name(), param->m_name)

    SGVector<float64_t> result(1);

    // compute the derivative of K wrt scale
    result[0]=get_derivative_helper(m_ktrtr);
    result[0]*=CMath::exp(m_log_scale*2.0)*2.0;

    return result;
}

SGVector<float64_t> CMultiLaplaceInferenceMethod::get_derivative_wrt_kernel(
    const TParameter* param)
{
    REQUIRE(param, "Param not set\n");
    SGVector<float64_t> result;
    int64_t len=const_cast<TParameter *>(param)->m_datatype.get_num_elements();
    result=SGVector<float64_t>(len);

    for (index_t i=0; i<result.vlen; i++)
    {
        SGMatrix<float64_t> dK;

        if (result.vlen==1)
            dK=m_kernel->get_parameter_gradient(param);
        else
            dK=m_kernel->get_parameter_gradient(param, i);

        result[i]=get_derivative_helper(dK);
        result[i]*=CMath::exp(m_log_scale*2.0);
    }

    return result;
}
SGVector<float64_t> CMultiLaplaceInferenceMethod::get_derivative_wrt_mean(
    const TParameter* param)
{
    // create eigen representation of alpha
    Map<VectorXd> eigen_alpha(m_alpha.vector, m_alpha.vlen);
    const index_t C=((CMulticlassLabels*)m_labels)->get_num_classes();
    const index_t n=m_labels->get_num_labels();

    REQUIRE(param, "Param not set\n");
    SGVector<float64_t> result;
    int64_t len=const_cast<TParameter *>(param)->m_datatype.get_num_elements();
    result=SGVector<float64_t>(len);

    for (index_t i=0; i<result.vlen; i++)
    {
        SGVector<float64_t> dmu;

        if (result.vlen==1)
            dmu=m_mean->get_parameter_derivative(m_features, param);
        else
            dmu=m_mean->get_parameter_derivative(m_features, param, i);

        Map<VectorXd> eigen_dmu(dmu.vector, dmu.vlen);

        result[i]=0;
        //currently only the explicit term is computed
        for(index_t bl=0; bl<C; bl++)
            result[i]-=eigen_alpha.block(bl*n,0,n,1).dot(eigen_dmu);
    }

    return result;
}

SGVector<float64_t> CMultiLaplaceInferenceMethod::get_posterior_mean()
{
    compute_gradient();

    SGVector<float64_t> res(m_mu.vlen);
    Map<VectorXd> eigen_res(res.vector, res.vlen);
    const index_t C=((CMulticlassLabels*)m_labels)->get_num_classes();

    SGVector<float64_t> mean=m_mean->get_mean_vector(m_features);
    Map<VectorXd> eigen_mean_bl(mean.vector, mean.vlen);
    VectorXd eigen_mean=eigen_mean_bl.replicate(C,1);

    Map<VectorXd> eigen_mu(m_mu, m_mu.vlen);
    eigen_res=eigen_mu-eigen_mean;

    return res;
}

}
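/*
 * Usage sketch (editorial; assumes the standard Shogun 6.x classes
 * CDenseFeatures, CMulticlassLabels, CGaussianKernel, CZeroMean and
 * CSoftMaxLikelihood, which ship with the toolbox):
 *
 *   #include <shogun/features/DenseFeatures.h>
 *   #include <shogun/kernel/GaussianKernel.h>
 *   #include <shogun/machine/gp/ZeroMean.h>
 *
 *   using namespace shogun;
 *
 *   void example(SGMatrix<float64_t> X, SGVector<float64_t> y)
 *   {
 *       CDenseFeatures<float64_t>* feats=new CDenseFeatures<float64_t>(X);
 *       CMulticlassLabels* labels=new CMulticlassLabels(y);
 *       CGaussianKernel* kernel=new CGaussianKernel(10, 2.0);
 *       CZeroMean* mean=new CZeroMean();
 *       CSoftMaxLikelihood* lik=new CSoftMaxLikelihood();
 *
 *       CMultiLaplaceInferenceMethod* inf=new CMultiLaplaceInferenceMethod(
 *           kernel, feats, mean, labels, lik);
 *       SG_REF(inf);
 *
 *       // negative log marginal likelihood under the Laplace approximation
 *       float64_t nlz=inf->get_negative_log_marginal_likelihood();
 *       (void)nlz;
 *
 *       SG_UNREF(inf);
 *   }
 */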