ArdKernel.h
//===========================================================================
/*!
 *
 *
 * \brief Gaussian automatic relevance detection (ARD) kernel
 *
 *
 *
 * \author T.Glasmachers, O. Krause, M. Tuma
 * \date 2010-2012
 *
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <https://shark-ml.github.io/Shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================

#ifndef SHARK_MODELS_KERNELS_GAUSSIAN_ARD_KERNEL_H
#define SHARK_MODELS_KERNELS_GAUSSIAN_ARD_KERNEL_H


#include <shark/Models/Kernels/AbstractKernelFunction.h>
namespace shark {


/// \brief Automatic relevance detection kernel for unconstrained parameter optimization
///
/// The automatic relevance detection (ARD) kernel is a general Gaussian kernel with
/// diagonal covariance matrix:
/// \f$ k(x, z) = \exp(-\sum_i \gamma_i (x_i - z_i)^2) \f$.
/// The ARD kernel holds one real-valued weight \f$ \gamma_i \f$ per input dimension.
/// The exposed parameters \f$ p_i \f$ are encoded as \f$ p_i = \log(\gamma_i) \f$, allowing for
/// unconstrained optimization: the parameter vector is transformed by exp before the weights
/// are used in the actual computations.
///
/// Note that, like other models and kernels designed for unconstrained optimization, the
/// argument to the constructor is the true weight \f$ \gamma \f$, while the getter and setter
/// for the parameter vector operate on the log-encoded values, not on the true weights.
///
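/// A minimal usage sketch of the log-encoding (illustration only, using only members declared below):
/// \code
/// ARDKernelUnconstrained<RealVector> kernel(3, 0.5);  // 3 input dimensions, all gammas initialized to 0.5
/// RealVector p = kernel.parameterVector();            // p(i) == std::log(0.5), the unconstrained encoding
/// p(0) = 0.0;                                         // update in the unconstrained parameter space
/// kernel.setParameterVector(p);                       // gammaVector()(0) is now exp(0.0) == 1.0
/// \endcode
///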
/// \ingroup kernels
template<class InputType=RealVector>
class ARDKernelUnconstrained : public AbstractKernelFunction<InputType>
{
private:
    typedef AbstractKernelFunction<InputType> base_type;

    struct InternalState: public State{
        RealMatrix kxy;

        void resize(std::size_t sizeX1,std::size_t sizeX2){
            kxy.resize(sizeX1,sizeX2);
        }
    };
public:
    typedef typename base_type::BatchInputType BatchInputType;
    typedef typename base_type::ConstInputReference ConstInputReference;
    typedef typename base_type::ConstBatchInputReference ConstBatchInputReference;

    /// Constructor
    /// \param dim input dimension
    /// \param gamma_init initial value of gamma for all dimensions (the true weight, not the log-encoded parameter)
    ARDKernelUnconstrained(unsigned int dim, double gamma_init = 1.0){
        SHARK_RUNTIME_CHECK( gamma_init > 0, "[ARDKernelUnconstrained::ARDKernelUnconstrained] Expected positive weight.");

        //init abstract model's informational flags
        this->m_features |= base_type::HAS_FIRST_PARAMETER_DERIVATIVE;
        this->m_features |= base_type::HAS_FIRST_INPUT_DERIVATIVE;
        this->m_features |= base_type::IS_NORMALIZED;

        //initialize self
        m_inputDimensions = dim;
        m_gammas.resize(m_inputDimensions);
        for ( unsigned int i=0; i<m_inputDimensions; i++ ){
            m_gammas(i) = gamma_init;
        }
    }

    /// \brief From INameable: return the class name.
    std::string name() const
    { return "ARDKernelUnconstrained"; }

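    /// \brief Returns the parameter vector in log-encoding, i.e. \f$ p_i = \log(\gamma_i) \f$.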
    RealVector parameterVector() const{
        return log(m_gammas);
    }
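    /// \brief Sets the parameter vector from log-encoding, i.e. \f$ \gamma_i = \exp(p_i) \f$.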
    void setParameterVector(RealVector const& newParameters){
        SIZE_CHECK(newParameters.size() == m_inputDimensions);
        noalias(m_gammas) = exp(newParameters);
    }
    std::size_t numberOfParameters() const{
        return m_inputDimensions;
    }

    /// \brief creates the internal state of the kernel
    boost::shared_ptr<State> createState()const{
        return boost::shared_ptr<State>(new InternalState());
    }

    /// convenience methods for setting/getting the actual gamma values
    RealVector gammaVector() const{
        return m_gammas;
    }
    void setGammaVector( RealVector const& newGammas ) {
#ifndef NDEBUG
        SIZE_CHECK( newGammas.size() == m_inputDimensions );
        for ( unsigned int i=0; i<m_inputDimensions; i++ ) {
            RANGE_CHECK( newGammas(i) > 0 );
        }
#endif
        m_gammas = newGammas;
    }

    /// \brief evaluates \f$ k(x,z) \f$
    ///
    /// ARD kernel evaluation
    /// \f[ k(x, z) = \exp(-\sum_i \gamma_i (x_i - z_i)^2) \f]
    double eval(ConstInputReference x1, ConstInputReference x2) const{
        SIZE_CHECK(x1.size() == x2.size());
        SIZE_CHECK(x1.size() == m_inputDimensions);
        double dmnorm2 = diagonalMahalanobisDistanceSqr(x1, x2, m_gammas);
        return std::exp(-dmnorm2);
    }

    /// \brief evaluates \f$ k(x,z) \f$ for a whole batch
    ///
    /// ARD kernel evaluation
    /// \f[ k(x, z) = \exp(-\sum_i \gamma_i (x_i - z_i)^2) \f]
    void eval(ConstBatchInputReference batchX1, ConstBatchInputReference batchX2, RealMatrix& result) const{
        SIZE_CHECK(batchX1.size2() == batchX2.size2());
        SIZE_CHECK(batchX1.size2() == m_inputDimensions);

        std::size_t sizeX1 = batchX1.size1();
        std::size_t sizeX2 = batchX2.size1();

        ensure_size(result,sizeX1,sizeX2);
        //todo: implement fast version of diagonalMahalanobisDistanceSqr for matrices
        for(std::size_t i = 0; i != sizeX1; ++i){
            for(std::size_t j = 0; j != sizeX2; ++j){
                double dmnorm2 = diagonalMahalanobisDistanceSqr(row(batchX1,i), row(batchX2,j), m_gammas);
                result(i,j)=std::exp(-dmnorm2);
            }
        }
    }

    /// \brief evaluates \f$ k(x,z) \f$ for a whole batch and caches the kernel values in the state object
    ///
    /// ARD kernel evaluation
    /// \f[ k(x, z) = \exp(-\sum_i \gamma_i (x_i - z_i)^2) \f]
    void eval(ConstBatchInputReference batchX1, ConstBatchInputReference batchX2, RealMatrix& result, State& state) const{
        SIZE_CHECK(batchX1.size2() == batchX2.size2());
        SIZE_CHECK(batchX1.size2() == m_inputDimensions);

        std::size_t sizeX1 = batchX1.size1();
        std::size_t sizeX2 = batchX2.size1();

        InternalState& s = state.toState<InternalState>();
        s.resize(sizeX1,sizeX2);

        ensure_size(result,sizeX1,sizeX2);
        //todo: implement fast version of diagonalMahalanobisDistanceSqr for matrices
        for(std::size_t i = 0; i != sizeX1; ++i){
            for(std::size_t j = 0; j != sizeX2; ++j){
                double dmnorm2 = diagonalMahalanobisDistanceSqr(row(batchX1,i), row(batchX2,j), m_gammas);
                result(i,j) = std::exp(-dmnorm2);
                s.kxy(i,j) = result(i,j);
            }
        }
    }

    /// \brief evaluates \f$ \frac{\partial k(x,z)}{\partial p_i} \f$ weighted over a whole batch
    ///
    /// Since the ARD kernel is parametrized for unconstrained optimization, we return the
    /// derivative w.r.t. the encoded parameters \f$ p_i = \log(\gamma_i) \f$ rather than w.r.t. \f$ \gamma_i \f$:
    ///
    /// \f[ \frac{\partial k(x,z)}{\partial p_i} = -\gamma_i (x_i - z_i)^2 \cdot k(x,z) \f]
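    ///
    /// (Derivation sketch, added for clarity: with \f$ \gamma_i = \exp(p_i) \f$ the chain rule gives
    /// \f$ \frac{\partial k}{\partial p_i} = \frac{\partial k}{\partial \gamma_i} \cdot \gamma_i
    /// = -(x_i - z_i)^2 \, k(x,z) \cdot \gamma_i \f$, which is what the loop below accumulates.)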
    void weightedParameterDerivative(
        ConstBatchInputReference batchX1,
        ConstBatchInputReference batchX2,
        RealMatrix const& coefficients,
        State const& state,
        RealVector& gradient
    ) const{
        SIZE_CHECK(batchX1.size2() == batchX2.size2());
        SIZE_CHECK(batchX1.size2() == m_inputDimensions);

        std::size_t sizeX1 = batchX1.size1();
        std::size_t sizeX2 = batchX2.size1();

        ensure_size(gradient, m_inputDimensions );
        gradient.clear();
        InternalState const& s = state.toState<InternalState>();

        for(std::size_t i = 0; i != sizeX1; ++i){
            for(std::size_t j = 0; j != sizeX2; ++j){
                double coeff = coefficients(i,j) * s.kxy(i,j);
                gradient -= coeff * m_gammas * sqr(row(batchX1,i)-row(batchX2,j));
            }
        }
    }

    /// \brief evaluates \f$ \frac{\partial k(x,z)}{\partial x} \f$ weighted over a whole batch
    ///
    /// First derivative of the ARD kernel w.r.t. the first input pattern, computed component-wise as
    /// \f[ \frac{\partial k(x,z)}{\partial x_i} = -2 \gamma_i \left( x_i - z_i \right) \cdot k(x,z) \f]
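    ///
    /// (Note, added for clarity: the loop below accumulates \f$ \gamma_i (x_i - z_i) \f$ weighted by the
    /// coefficients and the cached kernel values, and the common factor \f$ -2 \f$ is applied once at the end.)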
    void weightedInputDerivative(
        ConstBatchInputReference batchX1,
        ConstBatchInputReference batchX2,
        RealMatrix const& coefficientsX2,
        State const& state,
        BatchInputType& gradient
    ) const{
        SIZE_CHECK(batchX1.size2() == batchX2.size2());
        SIZE_CHECK(batchX1.size2() == m_inputDimensions);

        std::size_t sizeX1 = batchX1.size1();
        std::size_t sizeX2 = batchX2.size1();

        InternalState const& s = state.toState<InternalState>();
        ensure_size(gradient, sizeX1, m_inputDimensions );
        gradient.clear();

        for(std::size_t i = 0; i != sizeX1; ++i){
            for(std::size_t j = 0; j != sizeX2; ++j){
                double coeff = coefficientsX2(i,j) * s.kxy(i,j);
                row(gradient,i) += coeff * m_gammas * (row(batchX1,i)-row(batchX2,j));
            }
        }
        gradient *= -2.0;
    }

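    /// \brief Reads the gamma vector and the input dimension from an archive.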
    void read(InArchive& ar){
        ar >> m_gammas;
        ar >> m_inputDimensions;
    }

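    /// \brief Writes the gamma vector and the input dimension to an archive.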
    void write(OutArchive& ar) const{
        ar << m_gammas;
        ar << m_inputDimensions;
    }

protected:
    RealVector m_gammas; ///< kernel bandwidth parameters, one for each input dimension
    std::size_t m_inputDimensions; ///< number of input dimensions and hence number of bandwidth parameters
};

typedef ARDKernelUnconstrained<RealVector> DenseARDKernel;
typedef ARDKernelUnconstrained<CompressedRealVector> CompressedARDKernel;

}
#endif