KernelTargetAlignment.h
/*!
 *
 *
 * \brief Kernel Target Alignment - a measure of alignment of a kernel Gram matrix with labels.
 * \file
 *
 *
 * \author T. Glasmachers, O. Krause
 * \date 2010-2012
 *
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <https://shark-ml.github.io/Shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
#ifndef SHARK_OBJECTIVEFUNCTIONS_KERNELTARGETALIGNMENT_H
#define SHARK_OBJECTIVEFUNCTIONS_KERNELTARGETALIGNMENT_H

#include <shark/ObjectiveFunctions/AbstractObjectiveFunction.h>
#include <shark/Data/Dataset.h>
#include <shark/Models/Kernels/AbstractKernelFunction.h>


namespace shark{


/// \defgroup kerneloptimization Kernel Optimization
/// \ingroup objfunctions
/// \ingroup kernels
/// \brief All kinds of objective functions to optimize kernel functions.
///

/// \brief Kernel Target Alignment - a measure of alignment of a kernel Gram matrix with labels.
///
/// The Kernel Target Alignment (KTA) was originally proposed in the paper:<br/>
/// On Kernel-Target Alignment. N. Cristianini, J. Shawe-Taylor,
/// A. Elisseeff, J. Kandola. Innovations in Machine Learning, 2006.<br/>
/// Here we provide a version with centering of the features as proposed
/// in the paper:<br/>
/// Two-Stage Learning Kernel Algorithms. C. Cortes, M. Mohri,
/// A. Rostamizadeh. ICML 2010.<br/>
///
/// The kernel target alignment is defined as
/// \f[ \hat A = \frac{\langle K, y y^T \rangle}{\sqrt{\langle K, K \rangle \cdot \langle y y^T, y y^T \rangle}} \f]
/// where K is the kernel Gram matrix of the data and y is the vector of
/// +1/-1 valued labels. The outer product \f$y y^T\f$ is the ideal Gram
/// matrix of a kernel that maps the two classes each to a single point,
/// thus minimizing within-class distance for fixed inter-class distance.
/// The inner products denote the Frobenius product of matrices:
/// http://en.wikipedia.org/wiki/Matrix_multiplication#Frobenius_product
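/// that is, \f$ \langle A, B \rangle = \sum_{i,j} A_{i,j} B_{i,j} = \mathrm{tr}(A^T B) \f$ for real matrices of equal size.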
///
/// In kernel-based learning, the kernel Gram matrix \f$K\f$ is of the form
/// \f[ K_{i,j} = k(x_i, x_j) = \langle \phi(x_i), \phi(x_j) \rangle \f]
/// for a Mercer kernel function k and inputs \f$x_i, x_j\f$. In this
/// version of the KTA we use centered feature vectors. Let
/// \f[ \psi(x_i) = \phi(x_i) - \frac{1}{\ell} \sum_{j=1}^{\ell} \phi(x_j) \f]
/// denote the centered feature vectors; then the centered Gram matrix \f$K^c\f$ is given by
/// \f[ K^c_{i,j} = \langle \psi(x_i), \psi(x_j) \rangle = K_{i,j} - \frac{1}{\ell} \sum_{n=1}^\ell \left( K_{i,n} + K_{j,n} \right) + \frac{1}{\ell^2} \sum_{m,n=1}^\ell K_{n,m} \f]
/// The alignment measure computed by this class is the exact same formula
/// for \f$ \hat A \f$, but with \f$K^c\f$ plugged in in place of \f$K\f$.
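///
/// As an illustration (a reference sketch only, not part of this class), the centered
/// alignment can be computed naively from a precomputed Gram matrix K and +1/-1 labels y.
/// The class below computes the negative of this value blockwise, up to a normalization
/// constant that does not depend on the kernel, without ever storing the full matrix:
/// \code
/// double naiveCenteredKTA(RealMatrix const& K, RealVector const& y){
///     std::size_t ell = y.size();
///     RealVector k = sum(as_columns(K)) / double(ell); // row/column means of the symmetric K
///     double mk = sum(k) / double(ell);                // overall mean of K
///     RealMatrix Kc(ell, ell);                         // explicitly centered Gram matrix K^c
///     for(std::size_t i = 0; i != ell; ++i)
///         for(std::size_t j = 0; j != ell; ++j)
///             Kc(i,j) = K(i,j) - k(i) - k(j) + mk;
///     RealMatrix Y = outer_prod(y, y);                 // ideal Gram matrix y y^T
///     return frobenius_prod(Kc, Y)
///         / std::sqrt(frobenius_prod(Kc, Kc) * frobenius_prod(Y, Y));
/// }
/// \endcode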
///
/// KTA measures the Frobenius inner product between a kernel Gram matrix
/// and this ideal matrix. The interpretation is that KTA measures how
/// well a given kernel fits a classification problem. The actual measure
/// is invariant under kernel rescaling.
/// In Shark, objective functions are minimized by convention. Therefore
/// the negative alignment \f$- \hat A\f$ is implemented. The measure is
/// extended to multi-class problems by using prototype vectors instead
/// of scalar labels.
///
/// The following properties of KTA are important from a model selection
/// point of view: it is relatively fast and easy to compute, it is
/// differentiable w.r.t. the kernel function, and it is independent of
/// the actual classifier.
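///
/// \par Example
/// A minimal usage sketch. The kernel type, the optimizer, and the number of
/// iterations are placeholders; any differentiable kernel and any gradient-based
/// single-objective optimizer from Shark could be used instead (class names such as
/// IRpropPlus and the exact setup calls may differ between Shark versions, and the
/// corresponding kernel and optimizer headers must be included):
/// \code
/// GaussianRbfKernel<> kernel(0.5);                        // kernel with a tunable bandwidth parameter
/// ClassificationDataset data;                             // assumed to hold labeled training data
/// KernelTargetAlignment<RealVector> kta(data, &kernel);   // negative alignment as an objective function
///
/// IRpropPlus optimizer;
/// optimizer.init(kta);
/// for(std::size_t i = 0; i != 100; ++i)
///     optimizer.step(kta);                                // minimizing -A maximizes the alignment
/// kernel.setParameterVector(optimizer.solution().point);  // keep the best kernel parameters found
/// \endcode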
/// \ingroup kerneloptimization
template<class InputType = RealVector, class LabelType = unsigned int>
class KernelTargetAlignment : public AbstractObjectiveFunction< RealVector, double >
{
private:
    typedef typename Batch<LabelType>::type BatchLabelType;
public:
    /// \brief Construction of the Kernel Target Alignment (KTA) from a kernel object.
    KernelTargetAlignment(
        LabeledData<InputType, LabelType> const& dataset,
        AbstractKernelFunction<InputType>* kernel,
        bool centering = true
    ){
        SHARK_RUNTIME_CHECK(kernel != NULL, "[KernelTargetAlignment] kernel must not be NULL");

        mep_kernel = kernel;

        m_features |= HAS_VALUE;
        m_features |= CAN_PROPOSE_STARTING_POINT;

        if(mep_kernel -> hasFirstParameterDerivative())
            m_features |= HAS_FIRST_DERIVATIVE;

        m_data = dataset;
        m_elements = dataset.numberOfElements();
        m_centering = centering;

        setupY(dataset.labels(), centering);
    }

    /// \brief From INameable: return the class name.
    std::string name() const
    { return "KernelTargetAlignment"; }

    /// Return the current kernel parameters as a starting point for an optimization run.
    SearchPointType proposeStartingPoint() const {
        return mep_kernel -> parameterVector();
    }


    std::size_t numberOfVariables()const{
        return mep_kernel->numberOfParameters();
    }

    /// \brief Evaluate the (centered, negative) Kernel Target Alignment (KTA).
    ///
    /// See the class description for more details on this computation.
    double eval(RealVector const& input) const{
        mep_kernel->setParameterVector(input);

        return -evaluateKernelMatrix().error;
    }

    /// \brief Compute the derivative of the KTA as a function of the kernel parameters.
    ///
    /// Here \f$u\f$ denotes the all-ones vector, \f$k\f$ the vector of row means of \f$K\f$,
    /// \f$y\f$ the vector of row means of \f$Y\f$, and \f$mk\f$, \f$my\f$ the overall means
    /// of \f$K\f$ and \f$Y\f$. It holds:
    /// \f[ \langle K^c, K^c \rangle = \langle K,K \rangle -2 \ell \langle k,k \rangle + mk^2 \ell^2 \\
    /// (\langle K^c, K^c \rangle )' = 2 \langle K,K' \rangle -4\ell \langle k, \frac{1}{\ell} \sum_j K'_{ij} \rangle +2 \ell^2 mk \sum_{ij} \frac{1}{\ell^2} K'_{ij} \\
    /// = 2 \langle K,K' \rangle -4 \langle k, \sum_j K'_{ij} \rangle + 2 mk \sum_{ij} K'_{ij} \\
    /// = 2 \langle K,K' \rangle -4 \langle k u^T, K' \rangle + 2 mk \langle u u^T, K' \rangle \\
    /// = 2\langle K -2 k u^T + mk u u^T, K' \rangle \\
    /// \langle Y, K^c \rangle = \langle Y, K \rangle - 2 \ell \langle y, k \rangle + \ell^2 my \, mk \\
    /// (\langle Y , K^c \rangle )' = \langle Y -2 y u^T + my u u^T, K' \rangle \f]
    /// The derivative is then computed from these values in a second sweep over the data.
    /// We get
    /// \f[ \hat A' = \frac{1}{\langle K^c,K^c \rangle ^{3/2}} \left( \langle K^c,K^c \rangle (\langle Y,K^c \rangle )' - \frac{1}{2}\langle Y, K^c \rangle (\langle K^c , K^c \rangle )' \right) \\
    /// = \frac{1}{\langle K^c,K^c \rangle ^{3/2}} \langle \langle K^c,K^c \rangle (Y -2 y u^T + my u u^T)- \langle Y, K^c \rangle (K -2 k u^T+ mk u u^T),K' \rangle \\
    /// = \frac{1}{\langle K^c,K^c \rangle ^{3/2}} \langle W,K' \rangle \f]
    /// Reordering results in
    /// \f[ W = \langle K^c,K^c \rangle Y - \langle Y, K^c \rangle K \\
    /// - 2 (\langle K^c,K^c \rangle y - \langle Y, K^c \rangle k) u^T \\
    /// + (\langle K^c,K^c \rangle my - \langle Y, K^c \rangle mk) u u^T \f]
    /// where \f$ K' \f$ is the derivative of \f$K\f$ with respect to the kernel parameters.
    ResultType evalDerivative( const SearchPointType & input, FirstOrderDerivative & derivative ) const {
        mep_kernel->setParameterVector(input);
        // The derivative is calculated in two sweeps over the data. First the required statistics
        // \langle K^c,K^c \rangle, mk and k are evaluated exactly as in eval().

        KernelMatrixResults results = evaluateKernelMatrix();

        std::size_t parameters = mep_kernel->numberOfParameters();
        derivative.resize(parameters);
        derivative.clear();
        SHARK_PARALLEL_FOR(int i = 0; i < (int)m_data.numberOfBatches(); ++i){
            std::size_t startX = 0;
            for(int j = 0; j != i; ++j){
                startX += batchSize(m_data.batch(j));
            }
            RealVector threadDerivative(parameters,0.0);
            RealVector blockDerivative;
            boost::shared_ptr<State> state = mep_kernel->createState();
            RealMatrix blockK;//block of the kernel matrix
            RealMatrix blockW;//block of the weight matrix W
            std::size_t startY = 0;
            for(int j = 0; j <= i; ++j){
                mep_kernel->eval(m_data.batch(i).input,m_data.batch(j).input,blockK,*state);
                mep_kernel->weightedParameterDerivative(
                    m_data.batch(i).input,m_data.batch(j).input,
                    generateDerivativeWeightBlock(i,j,startX,startY,blockK,results),//takes symmetry into account
                    *state,
                    blockDerivative
                );
                noalias(threadDerivative) += blockDerivative;
                startY += batchSize(m_data.batch(j));
            }
            SHARK_CRITICAL_REGION{
                noalias(derivative) += threadDerivative;
            }
        }
        derivative *= -1;
        derivative /= m_elements;
        return -results.error;
    }

private:
    AbstractKernelFunction<InputType>* mep_kernel; ///< kernel function
    LabeledData<InputType,LabelType> m_data;       ///< data points
    RealVector m_columnMeanY;                      ///< mean label vector
    double m_meanY;                                ///< mean label element
    std::size_t m_numberOfClasses;                 ///< number of classes
    std::size_t m_elements;                        ///< number of data points
    bool m_centering;                              ///< whether the centered version of the KTA is computed

    struct KernelMatrixResults{
        RealVector k;    ///< row means of the kernel matrix K
        double KcKc;     ///< \f$ \langle K^c, K^c \rangle \f$
        double YcKc;     ///< \f$ \langle Y, K^c \rangle \f$
        double error;    ///< the (positive) alignment value
        double meanK;    ///< overall mean of K
    };

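    // The label statistics below are computed from the implied label matrix Y with
    // Y_{ij} = 1 for equal labels and -1/(m-1) otherwise (m = number of classes);
    // its column means and overall mean follow from the class counts alone.
    // Worked example (for illustration): a binary problem with ell = 10 points and
    // class counts {6,4} gives a column mean of (6*1 + 4*(-1))/10 = 0.2 for a
    // class-0 point, -0.2 for a class-1 point, and an overall mean of
    // (6*6 + 4*4 - 2*6*4)/100 = 0.04, exactly the values computed by setupY below.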
    void setupY(Data<unsigned int>const& labels, bool centering){
        //preprocess Y: calculate the column means and the overall mean.
        //The most efficient way to do this is via the class counts.
        std::vector<std::size_t> classCount = classSizes(labels);
        m_numberOfClasses = classCount.size();
        RealVector classMean(m_numberOfClasses);
        double dm1 = m_numberOfClasses-1.0;
        m_meanY = 0;
        for(std::size_t i = 0; i != m_numberOfClasses; ++i){
            classMean(i) = classCount[i]-(m_elements-classCount[i])/dm1;
            m_meanY += classCount[i] * classMean(i);
        }
        classMean /= m_elements;
        m_meanY /= sqr(double(m_elements));
        m_columnMeanY.resize(m_elements);
        for(std::size_t i = 0; i != m_elements; ++i){
            m_columnMeanY(i) = classMean(labels.element(i));
        }
        if(!centering){
            m_meanY = 0;
            m_columnMeanY.clear();
        }
    }

    void setupY(Data<RealVector>const& labels, bool centering){
        RealVector meanLabel = mean(labels);
        m_columnMeanY.resize(m_elements);
        for(std::size_t i = 0; i != m_elements; ++i){
            m_columnMeanY(i) = inner_prod(labels.element(i),meanLabel);
        }
        m_meanY = inner_prod(meanLabel,meanLabel);
        if(!centering){
            m_meanY = 0;
            m_columnMeanY.clear();
        }
    }
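    // Computes a block of the implied label matrix Y. For a binary problem
    // (two classes) the entries below are +1 for equal and -1 for unequal labels,
    // i.e. exactly the entries of y y^T for +1/-1 encoded labels; for m > 2 classes
    // unequal labels contribute -1/(m-1) instead.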
    void computeBlockY(UIntVector const& labelsi,UIntVector const& labelsj, RealMatrix& blockY)const{
        std::size_t blockSize1 = labelsi.size();
        std::size_t blockSize2 = labelsj.size();
        double dm1 = m_numberOfClasses-1.0;
        for(std::size_t k = 0; k != blockSize1; ++k){
            for(std::size_t l = 0; l != blockSize2; ++l){
                if( labelsi(k) == labelsj(l))
                    blockY(k,l) = 1;
                else
                    blockY(k,l) = -1.0/dm1;
            }
        }
    }

    void computeBlockY(RealMatrix const& labelsi,RealMatrix const& labelsj, RealMatrix& blockY)const{
        noalias(blockY) = labelsi % trans(labelsj);
    }

    /// Compute the contribution of a sub-block of K to \f$ \langle Y, K \rangle \f$.
    double updateYK(UIntVector const& labelsi,UIntVector const& labelsj, RealMatrix const& block)const{
        std::size_t blockSize1 = labelsi.size();
        std::size_t blockSize2 = labelsj.size();
        //todo optimize the i=j case
        double result = 0;
        double dm1 = m_numberOfClasses-1.0;
        for(std::size_t k = 0; k != blockSize1; ++k){
            for(std::size_t l = 0; l != blockSize2; ++l){
                if(labelsi(k) == labelsj(l))
                    result += block(k,l);
                else
                    result -= block(k,l)/dm1;
            }
        }
        return result;
    }

    /// Compute the contribution of a sub-block of K to \f$ \langle Y, K \rangle \f$.
    double updateYK(RealMatrix const& labelsi,RealMatrix const& labelsj, RealMatrix const& block)const{
        RealMatrix Y(labelsi.size1(), labelsj.size1());
        computeBlockY(labelsi,labelsj,Y);
        return sum(Y * block);
    }

    /// Compute a sub-block of the matrix
    /// \f[ W = \langle K^c, K^c \rangle Y - \langle Y, K^c \rangle K -2 (\langle K^c, K^c \rangle y - \langle Y, K^c \rangle k) u^T + (\langle K^c, K^c \rangle my - \langle Y, K^c \rangle mk) u u^T \f]
    RealMatrix generateDerivativeWeightBlock(
        std::size_t i, std::size_t j,
        std::size_t start1, std::size_t start2,
        RealMatrix const& blockK,
        KernelMatrixResults const& matrixStatistics
    )const{
        std::size_t blockSize1 = batchSize(m_data.batch(i));
        std::size_t blockSize2 = batchSize(m_data.batch(j));
        //double n = m_elements;
        double KcKc = matrixStatistics.KcKc;
        double YcKc = matrixStatistics.YcKc;
        double meanK = matrixStatistics.meanK;
        RealMatrix blockW(blockSize1,blockSize2);

        //first calculate \langle Kc,Kc \rangle Y.
        computeBlockY(m_data.batch(i).label,m_data.batch(j).label,blockW);
        blockW *= KcKc;
        //- \langle Y,K^c \rangle K
        blockW -= YcKc*blockK;
        // -2(\langle Kc,Kc \rangle y -\langle Y, K^c \rangle k) u^T
        // implemented in symmetrized form as: -(\langle K^c,K^c \rangle y -\langle Y, K^c \rangle k) u^T - u (\langle K^c,K^c \rangle y -\langle Y, K^c \rangle k)^T
        // Both forms give the same value of \langle W, K' \rangle because K' is symmetric.
        blockW -= repeat(subrange(KcKc*m_columnMeanY - YcKc*matrixStatistics.k,start2,start2+blockSize2),blockSize1);
        blockW -= trans(repeat(subrange(KcKc*m_columnMeanY - YcKc*matrixStatistics.k,start1,start1+blockSize1),blockSize2));
        // + (\langle Kc,Kc \rangle my - \langle Y, Kc \rangle mk) u u^T
        blockW += KcKc*m_meanY-YcKc*meanK;
        blockW /= KcKc*std::sqrt(KcKc);
        //symmetry
        if(i != j)
            blockW *= 2.0;
        return blockW;
    }

    /// \brief Evaluate the centered kernel Gram matrix.
    ///
    /// The computation proceeds as follows. By means of a
    /// number of identities it holds that
    /// \f[ \langle K^c, K^c \rangle = \langle K,K \rangle -2n\langle k,k \rangle +mk^2n^2 \\
    /// \langle K^c, Y \rangle = \langle K, Y \rangle - 2 n \langle k, y \rangle + n^2 mk \, my \f]
    /// where k is the vector of row means of K, y the vector of row means of Y,
    /// mk, my the overall means of K and Y, and n the number of elements.
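    /// These identities can be verified by writing the centered matrix entrywise as
    /// \f[ K^c_{i,j} = K_{i,j} - k_i - k_j + mk \f]
    /// and expanding the Frobenius products, using the symmetry of K and Y.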
    KernelMatrixResults evaluateKernelMatrix()const{
        //it holds
        // \langle K^c,K^c \rangle = \langle K,K \rangle -2n\langle k,k \rangle +mk^2n^2
        // \langle K^c,Y \rangle = \langle K, Y \rangle - 2 n \langle k, y \rangle + n^2 mk my
        // where k is the vector of row means of K, y the vector of row means of Y,
        // mk, my the overall means of K and Y, and n the number of elements

        double KK = 0; //stores \langle K,K \rangle
        double YK = 0; //stores \langle Y,K \rangle
        RealVector k(m_elements,0.0);//stores the row/column means of K
        SHARK_PARALLEL_FOR(int i = 0; i < (int)m_data.numberOfBatches(); ++i){
            std::size_t startRow = 0;
            for(int j = 0; j != i; ++j){
                startRow += batchSize(m_data.batch(j));
            }
            std::size_t rowSize = batchSize(m_data.batch(i));
            double threadKK = 0;
            double threadYK = 0;
            RealVector threadk(m_elements,0.0);
            std::size_t startColumn = 0; //starting column of the current block
            for(int j = 0; j <= i; ++j){
                std::size_t columnSize = batchSize(m_data.batch(j));
                RealMatrix blockK = (*mep_kernel)(m_data.batch(i).input,m_data.batch(j).input);
                if(i == j){
                    threadKK += frobenius_prod(blockK,blockK);
                    subrange(threadk,startColumn,startColumn+columnSize)+=sum(as_columns(blockK));//update sum_rows(K)
                    threadYK += updateYK(m_data.batch(i).label,m_data.batch(j).label,blockK);
                }
                else{//use symmetry of K
                    threadKK += 2.0 * frobenius_prod(blockK,blockK);
                    subrange(threadk,startColumn,startColumn+columnSize)+=sum(as_columns(blockK));
                    subrange(threadk,startRow,startRow+rowSize)+=sum(as_rows(blockK));//symmetry: block(j,i)
                    threadYK += 2.0 * updateYK(m_data.batch(i).label,m_data.batch(j).label,blockK);
                }
                startColumn += columnSize;
            }
            SHARK_CRITICAL_REGION{
                KK += threadKK;
                YK += threadYK;
                noalias(k) += threadk;
            }
        }
        //calculate the error
        double n = (double)m_elements;
        k /= n;//means
        double meanK = sum(k)/n;
        if(!m_centering){
            k.clear();
            meanK = 0;
        }
        double n2 = sqr(n);
        double YcKc = YK-2.0*n*inner_prod(k,m_columnMeanY)+n2*m_meanY*meanK;
        double KcKc = KK - 2.0*n*inner_prod(k,k)+n2*sqr(meanK);

        KernelMatrixResults results;
        results.k = k;
        results.YcKc = YcKc;
        results.KcKc = KcKc;
        results.meanK = meanK;
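        // Note: the alignment is normalized by n rather than by sqrt(<Y^c,Y^c>).
        // For binary +1/-1 labels without centering the two coincide; in general they
        // differ only by a constant factor independent of the kernel parameters,
        // so the location of the optima is unaffected.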
        results.error = YcKc/std::sqrt(KcKc)/n;
        return results;
    }
};


}
#endif