KernelSGDTrainer.h
//===========================================================================
/*!
 *
 * \brief Generic stochastic gradient descent training for kernel-based models.
 *
 * \author T. Glasmachers
 * \date 2013
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <https://shark-ml.github.io/Shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================

#ifndef SHARK_ALGORITHMS_KERNELSGDTRAINER_H
#define SHARK_ALGORITHMS_KERNELSGDTRAINER_H

#include <shark/Algorithms/Trainers/AbstractTrainer.h>
#include <shark/Core/IParameterizable.h>
#include <shark/Models/Kernels/KernelExpansion.h>
#include <shark/Models/Kernels/KernelHelpers.h>
#include <shark/LinAlg/PartlyPrecomputedMatrix.h>

namespace shark
{

///
/// \brief Generic stochastic gradient descent training for kernel-based models.
///
/// Given a differentiable loss function L(f, y) for classification
/// this trainer solves the regularized risk minimization problem
/// \f[
///     \min \frac{1}{2} \sum_j \|w_j\|^2 + C \sum_i L(y_i, f(x_i)),
/// \f]
/// where i runs over training data, j over classes, and C > 0 is the
/// regularization parameter.
///
/// \par
/// This implementation is an adaptation of the PEGASOS algorithm, see the paper
/// <i>Shalev-Shwartz et al. "Pegasos: Primal estimated sub-gradient solver for SVM." Mathematical Programming 127.1 (2011): 3-30.</i><br/><br/>
/// However, the (non-essential) projection step is dropped, and the
/// algorithm is applied to a kernelized model. The resulting
/// optimization scheme amounts to plain standard stochastic gradient
/// descent (SGD) with update steps of the form
/// \f[
///     w_j \leftarrow (1 - 1/t) w_j - \frac{C}{t} \frac{\partial L(y_i, f(x_i))}{\partial w_j}
/// \f]
/// for a random index i. The only notable trick borrowed from that paper
/// is the representation of the weight vectors in the form
/// \f[
///     w_j = s \cdot \sum_i \alpha_{i,j} k(x_i, \cdot)
/// \f]
/// with a scalar factor s (called alphaScale in the code). This enables
/// scaling with the factor (1 - 1/t) in constant time.
///
/// \par
/// NOTE: Being an SGD-based solver, this algorithm is relatively fast for
/// differentiable loss functions such as the logistic loss (class CrossEntropy).
/// It suffers from significantly slower convergence for non-differentiable
/// losses, e.g., the hinge loss for SVM training.
/// \ingroup supervised_trainer
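///
/// \par
/// A minimal usage sketch. The kernel choice, its bandwidth, the loss, and the
/// dataset below are illustrative assumptions; exact class names may differ
/// between Shark versions.
/// \code
/// ClassificationDataset data;                        // labeled training data, assumed to be loaded elsewhere
/// GaussianRbfKernel<RealVector> kernel(0.5);         // kernel bandwidth chosen only for illustration
/// CrossEntropy loss;                                 // differentiable (logistic) loss, as recommended above
/// KernelClassifier<RealVector> classifier;
/// KernelSGDTrainer<RealVector> trainer(&kernel, &loss, 1.0, true);  // C = 1.0, train with offset/bias
/// trainer.train(classifier, data);
/// \endcode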
template <class InputType, class CacheType = float>
class KernelSGDTrainer : public AbstractTrainer< KernelClassifier<InputType> >, public IParameterizable<>
{
public:
    typedef AbstractKernelFunction<InputType> KernelType;
    typedef KernelClassifier<InputType> ClassifierType;
    typedef KernelExpansion<InputType> ModelType;
    typedef AbstractLoss<unsigned int, RealVector> LossType;
    typedef CacheType QpFloatType;

    typedef KernelMatrix<InputType, QpFloatType> KernelMatrixType;
    typedef PartlyPrecomputedMatrix<KernelMatrixType> PartlyPrecomputedMatrixType;

    /// \brief Constructor
    ///
    /// \param kernel kernel function to use for training and prediction
    /// \param loss (sub-)differentiable loss function
    /// \param C regularization parameter; this is always the 'true' value of C, even when unconstrained is set
    /// \param offset whether to train with an offset/bias parameter or not
    /// \param unconstrained when a C-value is given via setParameterVector, should it be passed through the exp function before it is used in the solver?
    /// \param cacheSize size of the kernel cache
    KernelSGDTrainer(KernelType* kernel, const LossType* loss, double C, bool offset, bool unconstrained = false, size_t cacheSize = 0x4000000)
    : m_kernel(kernel)
    , m_loss(loss)
    , m_C(C)
    , m_offset(offset)
    , m_unconstrained(unconstrained)
    , m_epochs(0)
    , m_cacheSize(cacheSize)
    { }

    /// return the current cache size
    double cacheSize() const
    {
        return m_cacheSize;
    }

    /// set the cache size
    void setCacheSize(std::size_t size)
    {
        m_cacheSize = size;
    }

    /// \brief From INameable: return the class name.
    std::string name() const
    { return "KernelSGDTrainer"; }

    /// \brief Train the classifier on the given labeled dataset.
    void train(ClassifierType& classifier, LabeledData<InputType, unsigned int> const& dataset)
    {
        std::size_t ell = dataset.numberOfElements();
        std::size_t classes = numberOfClasses(dataset);
        ModelType& model = classifier.decisionFunction();

        model.setStructure(m_kernel, dataset.inputs(), m_offset, classes);

        RealMatrix& alpha = model.alpha();

        // pre-compute the kernel matrix (may change in the future)
        // and create a linear array of labels
        KernelMatrixType km(*(this->m_kernel), dataset.inputs());
        PartlyPrecomputedMatrixType K(&km, m_cacheSize);
        UIntVector y = createBatch(dataset.labels().elements());
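        // Pegasos-style regularization constant corresponding to C and the dataset size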
        const double lambda = 0.5 / (ell * m_C);

        double alphaScale = 1.0;
        std::size_t iterations;
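        // default: max(10, C) epochs, i.e., max(10 * ell, ceil(C * ell)) single-sample iterations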
        if(m_epochs == 0) iterations = std::max(10 * ell, std::size_t(std::ceil(m_C * ell)));
        else iterations = m_epochs * ell;

        // preinitialize everything to prevent costly memory allocations in the loop
        RealVector f_b(classes, 0.0);
        RealVector derivative(classes, 0.0);

        // SGD loop
        blas::vector<QpFloatType> kernelRow(ell, 0);
        for(std::size_t iter = 0; iter < iterations; iter++)
        {
            // active variable
            std::size_t b = random::discrete(random::globalRng, std::size_t(0), ell - 1);

            // learning rate
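            // (Pegasos schedule eta_t = 1 / (lambda * t), with the step counter t offset by ell)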
            const double eta = 1.0 / (lambda * (iter + ell));

            // compute prediction
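            // f(x_b) = alphaScale * alpha^T k_b (+ offset), using the cached kernel row k_b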
            K.row(b, kernelRow);
            noalias(f_b) = alphaScale * prod(trans(alpha), kernelRow);
            if(m_offset) noalias(f_b) += model.offset();

            // stochastic gradient descent (SGD) step
            derivative.clear();
            m_loss->evalDerivative(y[b], f_b, derivative);

            // alphaScale *= (1.0 - eta * lambda);
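            // closed form of the running product prod_{k=0..iter} (1 - eta_k * lambda)
            // = prod_{k=0..iter} (k + ell - 1) / (k + ell), which telescopes to (ell - 1) / (ell + iter)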
            alphaScale = (ell - 1.0) / (ell + iter); // numerically more stable

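            // update the coefficients of the active point; dividing by alphaScale compensates for
            // the global scaling factor s, so the effective change of w_j is -eta * derivative_j * k(x_b, .)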
            noalias(row(alpha, b)) -= (eta / alphaScale) * derivative;
            if(m_offset) noalias(model.offset()) -= eta * derivative;
        }

        alpha *= alphaScale;

        // model.sparsify();
    }

    /// Return the number of training epochs.
    /// A value of 0 indicates that the default of max(10, C) should be used.
    std::size_t epochs() const
    { return m_epochs; }

    /// Set the number of training epochs.
    /// A value of 0 indicates that the default of max(10, C) should be used.
    void setEpochs(std::size_t value)
    { m_epochs = value; }

    /// get the kernel function
    KernelType* kernel()
    { return m_kernel; }
    /// get the kernel function
    const KernelType* kernel() const
    { return m_kernel; }
    /// set the kernel function
    void setKernel(KernelType* kernel)
    { m_kernel = kernel; }

    /// check whether the parameter C is represented as log(C) in the
    /// parameter vector, i.e., in a form suitable for unconstrained
    /// optimization
    bool isUnconstrained() const
    { return m_unconstrained; }

    /// return the value of the regularization parameter
    double C() const
    { return m_C; }

    /// set the value of the regularization parameter (must be positive)
    void setC(double value)
    {
        RANGE_CHECK(value > 0.0);
        m_C = value;
    }

    /// check whether the model to be trained should include an offset term
    bool trainOffset() const
    { return m_offset; }

    /// \brief Returns the vector of hyper-parameters.
    RealVector parameterVector() const{
        double parC = m_unconstrained ? std::log(m_C) : m_C;
        return m_kernel->parameterVector() | parC;
    }

    /// \brief Sets the vector of hyper-parameters.
    void setParameterVector(RealVector const& newParameters){
        size_t kp = m_kernel->numberOfParameters();
        SHARK_ASSERT(newParameters.size() == kp + 1);
        m_kernel->setParameterVector(subrange(newParameters, 0, kp));
        m_C = newParameters.back();
        if(m_unconstrained) m_C = exp(m_C);
    }

    /// \brief Returns the number of hyper-parameters.
    size_t numberOfParameters() const{
        return m_kernel->numberOfParameters() + 1;
    }

protected:
    KernelType* m_kernel;        ///< pointer to kernel function
    const LossType* m_loss;      ///< pointer to loss function
    double m_C;                  ///< regularization parameter
    bool m_offset;               ///< should the resulting model have an offset term?
    bool m_unconstrained;        ///< should C be stored as log(C) as a parameter?
    std::size_t m_epochs;        ///< number of training epochs (sweeps over the data), or 0 for default = max(10, C)
    std::size_t m_cacheSize;     ///< size of the cache to use
};


}
#endif