NegativeGaussianProcessEvidence.h
//===========================================================================
/*!
 *
 * \brief Evidence for model selection of a regularization network/Gaussian process.
 *
 * \author C. Igel, T. Glasmachers, O. Krause
 * \date 2007-2012
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <https://shark-ml.github.io/Shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================

#ifndef SHARK_OBJECTIVEFUNCTIONS_NEGATIVEGAUSSIANPROCESSEVIDENCE_H
#define SHARK_OBJECTIVEFUNCTIONS_NEGATIVEGAUSSIANPROCESSEVIDENCE_H

#include <shark/ObjectiveFunctions/AbstractObjectiveFunction.h>
#include <shark/Models/Kernels/KernelHelpers.h>

#include <shark/LinAlg/Base.h>
namespace shark {


///
/// \brief Evidence for model selection of a regularization network/Gaussian process.
///
/// Let \f$M\f$ denote the (kernel Gram) covariance matrix and
/// \f$t\f$ the corresponding label vector. For the evidence we have:
/// \f[ E = 1/2 \cdot [ -\log(\det(M)) - t^T M^{-1} t - N \log(2 \pi) ] \f]
///
/// The evidence is also known as the marginal (log-)likelihood. For
/// details, please see:
///
/// C.E. Rasmussen & C.K.I. Williams, Gaussian
/// Processes for Machine Learning, section 5.4, MIT Press, 2006
///
/// C.M. Bishop, Pattern Recognition and Machine Learning, section
/// 6.4.3, Springer, 2006
///
/// The regularization parameter can be encoded in different ways.
/// The exponential encoding is the proper choice for unconstrained optimization.
/// Be careful not to mix up different encodings between trainer and evidence.
/// \ingroup kerneloptimization
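///
/// \par Example
/// A minimal usage sketch. The kernel choice, its bandwidth, and the data set
/// below are assumptions for illustration only; any AbstractKernelFunction and
/// any regression data set can be used:
/// \code
/// // data: a LabeledData<RealVector, RealVector> regression data set (assumed given)
/// GaussianRbfKernel<> kernel(0.5); // hypothetical kernel and bandwidth
/// NegativeGaussianProcessEvidence<> evidence(data, &kernel, true); // exponential encoding
///
/// // one entry per kernel parameter plus one for the regularization parameter
/// RealVector params(evidence.numberOfVariables(), 0.0);
/// double negLogEvidence = evidence.eval(params);
/// \endcode
/// Minimizing this objective with a gradient-based optimizer (e.g. Rprop or CG)
/// then performs evidence-based model selection.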
template<class InputType = RealVector, class OutputType = RealVector, class LabelType = RealVector>
class NegativeGaussianProcessEvidence : public SingleObjectiveFunction
{
public:
	typedef LabeledData<InputType, LabelType> DatasetType;
	typedef AbstractKernelFunction<InputType> KernelType;

	/// \param dataset: training data for the Gaussian process
	/// \param kernel: pointer to external kernel function
	/// \param unconstrained: exponential encoding of the regularization parameter for unconstrained optimization
	NegativeGaussianProcessEvidence(
		DatasetType const& dataset,
		KernelType* kernel,
		bool unconstrained = false
	): m_dataset(dataset)
	, mep_kernel(kernel)
	, m_unconstrained(unconstrained)
	{
		m_features |= HAS_FIRST_DERIVATIVE;
		setThreshold(0.);
	}

	/// \brief From INameable: return the class name.
	std::string name() const
	{ return "NegativeGaussianProcessEvidence"; }

	std::size_t numberOfVariables() const{
		return 1 + mep_kernel->numberOfParameters();
	}

	/// Let \f$M\f$ denote the (kernel Gram) covariance matrix and
	/// \f$t\f$ the label vector. For the evidence we have: \f[ E = 1/2 \cdot [ -\log(\det(M)) - t^T M^{-1} t - N \log(2 \pi) ] \f]
	double eval(const RealVector& parameters) const {
		std::size_t N = m_dataset.numberOfElements();
		std::size_t kp = mep_kernel->numberOfParameters();
		// check whether the argument has the right dimensionality
		SHARK_ASSERT(1+kp == parameters.size());

		// keep track of how often the objective function is called
		m_evaluationCounter++;

		//set parameters
		double betaInv = parameters.back();
		if(m_unconstrained)
			betaInv = std::exp(betaInv); // for unconstrained optimization
		mep_kernel->setParameterVector(subrange(parameters,0,kp));

		//generate kernel matrix and label vector
		RealMatrix M = calculateRegularizedKernelMatrix(*mep_kernel,m_dataset.inputs(),betaInv);
		RealMatrix t = createBatch<RealVector>(m_dataset.labels().elements());

		blas::cholesky_decomposition<RealMatrix> cholesky(M);

		//compute the determinant of M using the cholesky factorization M = AA^T:
		//ln det(M) = 2 trace(ln A)
		double logDet = 2 * trace(log(cholesky.lower_factor()));

		//we need to compute t^T M^-1 t
		//= t^T (AA^T)^-1 t = t^T A^-T A^-1 t = ||A^-1 t||^2,
		//so we first solve the triangular system Az = t
		//and then compute ||z||^2
		RealMatrix z = solve(cholesky.lower_factor(),t,blas::lower(), blas::left());

		// equation (6.69) on page 311 in the book C.M. Bishop, Pattern Recognition and Machine Learning, Springer, 2006
		// e = 1/2 [ -log(det(M)) - t^T M^{-1} t - N log(2 pi) ]
		double e = 0.5 * (-logDet - norm_sqr(to_vector(z)) - N * std::log(2.0 * M_PI));

		// return the *negative* evidence
		return -e;
	}

	/// Let \f$M\f$ denote the regularized (kernel Gram) covariance matrix.
	/// For the evidence we have:
	/// \f[ E = 1/2 \cdot [ -\log(\det(M)) - t^T M^{-1} t - N \log(2 \pi) ] \f]
	/// For a kernel parameter \f$p\f$ and \f$C = \beta^{-1}\f$ we get the derivatives:
	/// \f[ dE/dC = 1/2 \cdot [ -tr(M^{-1}) + (M^{-1} t)^2 ] \f]
	/// \f[ dE/dp = 1/2 \cdot [ -tr(M^{-1} dM/dp) + t^T (M^{-1} dM/dp M^{-1}) t ] \f]
	double evalDerivative(const RealVector& parameters, FirstOrderDerivative& derivative) const {
		std::size_t N = m_dataset.numberOfElements();
		std::size_t kp = mep_kernel->numberOfParameters();

		// check whether the argument has the right dimensionality
		SHARK_ASSERT(1 + kp == parameters.size());
		derivative.resize(1 + kp);

		// keep track of how often the objective function is called
		m_evaluationCounter++;

		//set parameters
		double betaInv = parameters.back();
		if(m_unconstrained)
			betaInv = std::exp(betaInv); // for unconstrained optimization
		mep_kernel->setParameterVector(subrange(parameters,0,kp));

		//generate kernel matrix and label vector
		RealMatrix M = calculateRegularizedKernelMatrix(*mep_kernel,m_dataset.inputs(),betaInv);
		RealMatrix t = createBatch<RealVector>(m_dataset.labels().elements());

		//compute the cholesky decomposition of M
		blas::cholesky_decomposition<RealMatrix> cholesky(M);
		//we don't need M anymore, so free its memory
		M = RealMatrix();

		// compute the derivative w.r.t. the kernel parameters
		//up to the factor 1/2 of E (applied below), the derivative is defined as:
		//dE/da = -tr(IM dM/da) + t^T IM dM/da IM t
		// where IM is the inverse of M, tr is the trace and a are the kernel parameters
		//by substituting z = IM t we can expand the operations to:
		//dE/da = -(sum_i sum_j IM_ij * dM_ji/da) + (sum_i sum_j dM_ij/da * z_i * z_j)
		//      = sum_i sum_j (-IM_ij + z_i * z_j) * dM_ij/da
		// with W = -IM + zz^T we get
		// dE/da = sum_i sum_j W_ij * dM_ij/da
		//this can be calculated as a blockwise derivative.

		//compute the inverse matrix from the cholesky decomposition
		RealMatrix W = blas::identity_matrix<double>(N);
		cholesky.solve(W,blas::left());

		//calculate z = Wt = M^-1 t
		RealMatrix z = prod(W,t);

		// W is already initialized as the inverse of M, so we only need
		// to change the sign and add zz^T to obtain W fully
		W *= -1;
		noalias(W) += prod(z,trans(z));

		//now calculate the derivative
		RealVector kernelGradient = 0.5*calculateKernelMatrixParameterDerivative(*mep_kernel,m_dataset.inputs(),W);

		// compute the derivative w.r.t. the regularization parameter
		//we have: dE/dC = 1/2 * [ -tr(M^{-1}) + (M^{-1} t)^2 ]
		// which can also be written as 1/2 tr(W)
		double betaInvDerivative = 0.5 * trace(W);
		if(m_unconstrained)
			betaInvDerivative *= betaInv;

		//merge both derivatives and, since we return the negative evidence, multiply by -1
		noalias(derivative) = -(kernelGradient | betaInvDerivative);

		// truncate the gradient vector
		for(std::size_t i=0; i<derivative.size(); i++)
			if(std::abs(derivative(i)) < m_derivativeThresholds(i)) derivative(i) = 0;

		// compute the evidence
		//compute the determinant of M (see eval() for why this works)
		double logDetM = 2 * trace(log(cholesky.lower_factor()));
		double e = 0.5 * (-logDetM - inner_prod(to_vector(t), to_vector(z)) - N * std::log(2.0 * M_PI));
		return -e;
	}

	/// set threshold value for truncating partial derivatives
	void setThreshold(double d) {
		m_derivativeThresholds = RealVector(mep_kernel->numberOfParameters() + 1, d); // plus one for the regularization parameter
	}

	/// set threshold values for truncating partial derivatives
	void setThresholds(RealVector &c) {
		SHARK_ASSERT(m_derivativeThresholds.size() == c.size());
		m_derivativeThresholds = c;
	}
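
	// A brief sketch of setting per-parameter thresholds (the values and the
	// evidence/kernel objects are assumptions for illustration): the vector needs
	// one entry per kernel parameter plus a final entry for the regularization
	// parameter, matching the layout of the gradient.
	//
	//     RealVector thresholds(kernel.numberOfParameters() + 1, 1e-8);
	//     thresholds.back() = 0.0; // never truncate the regularization derivative
	//     evidence.setThresholds(thresholds);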


private:
	/// the training data set
	DatasetType m_dataset;

	/// thresholds for setting derivatives to zero
	RealVector m_derivativeThresholds;

	/// pointer to external kernel function
	KernelType* mep_kernel;

	/// Indicates whether log() of the regularization parameter is
	/// considered. This is useful for unconstrained
	/// optimization. The default value is false.
	bool m_unconstrained;
};


}
#endif