// KernelBudgetedSGDTrainer.h
//===========================================================================
/*!
 *
 *
 * \brief Budgeted stochastic gradient descent training for kernel-based models.
 *
 * \par This is an implementation of the BSGD algorithm, developed by
 * Wang, Crammer and Vucetic: Breaking the curse of kernelization:
 * Budgeted stochastic gradient descent for large-scale SVM training, JMLR 2012.
 * Basically this is pegasos, so something similar to a perceptron. The main
 * difference is that we do restrict the sparsity of the weight vector to a (currently
 * predefined) value. Therefore, whenever this sparsity is reached, we have to
 * decide how to add a new vector to the model, without destroying this
 * sparsity. Several methods have been proposed for this, Wang et al. main
 * insight is that merging two budget vectors (i.e. two vectors in the model).
 * If the first one is searched by norm of its alpha coefficient, the second one
 * can be found by some optimization problem, yielding a roughly optimal pair.
 * This pair can be merged and by doing so the budget has now space for a
 * new vector. Such strategies are called budget maintenance strategies.
 *
 * \par This implementation owes much to the 'reference' implementation
 * in the BudgetedSVM software.
 *
 *
 * \author T. Glasmachers, Aydin Demircioglu
 * \date 2014
 *
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <https://shark-ml.github.io/Shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================


#ifndef SHARK_ALGORITHMS_KERNELBUDGETEDSGDTRAINER_H
#define SHARK_ALGORITHMS_KERNELBUDGETEDSGDTRAINER_H

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <limits>

// NOTE(review): the extraction elided the project includes here (original lines
// 58-65); these are the Shark headers the class below visibly depends on.
#include <shark/Algorithms/Trainers/AbstractTrainer.h>
#include <shark/Algorithms/Trainers/Budgeted/AbstractBudgetMaintenanceStrategy.h>
#include <shark/Core/IParameterizable.h>
#include <shark/Data/Dataset.h>
#include <shark/Models/Classifier.h>
#include <shark/Models/Kernels/KernelExpansion.h>
#include <shark/ObjectiveFunctions/Loss/AbstractLoss.h>

68namespace shark
69{
70
71
72///
73/// \brief Budgeted stochastic gradient descent training for kernel-based models.
74///
75/// \par This is an implementation of the BSGD algorithm, developed by
76/// Wang, Crammer and Vucetic: Breaking the curse of kernelization:
77/// Budgeted stochastic gradient descent for large-scale SVM training, JMLR 2012.
78/// Basically this is pegasos, so something similar to a perceptron. The main
79/// difference is that we do restrict the sparsity of the weight vector to a (currently
80/// predefined) value. Therefore, whenever this sparsity is reached, we have to
81/// decide how to add a new vector to the model, without destroying this
82/// sparsity. Several methods have been proposed for this, Wang et al. main
83/// insight is that merging two budget vectors (i.e. two vectors in the model).
84/// If the first one is searched by norm of its alpha coefficient, the second one
85/// can be found by some optimization problem, yielding a roughly optimal pair.
86/// This pair can be merged and by doing so the budget has now space for a
87/// new vector. Such strategies are called budget maintenance strategies.
88///
89/// \par This implementation owes much to the 'reference' implementation
90/// in the BudgetedSVM software.
91///
92/// \par For the documentation of the basic SGD algorithm, please refer to
93/// KernelSGDTrainer.h. Note that we did not take over the special alpha scaling
94/// from that class. Therefore this class is perhaps numerically not as robust as SGD.
95///
96template <class InputType, class CacheType = float>
97class KernelBudgetedSGDTrainer : public AbstractTrainer< KernelClassifier<InputType> >, public IParameterizable<>
98{
99public:
106 typedef CacheType QpFloatType;
107 typedef typename LabeledData<InputType, unsigned int>::element_type ElementType;
108
111
112
113
114 /// preinitialization methods
115 enum preInitializationMethod {NONE, RANDOM}; // TODO: add KMEANS
116
117
118
119 /// \brief Constructor
120 /// Note that there is no cache size involved, as merging vectors will always create new ones,
121 /// which makes caching roughly obsolete.
122 ///
123 /// \param[in] kernel kernel function to use for training and prediction
124 /// \param[in] loss (sub-)differentiable loss function
125 /// \param[in] C regularization parameter - always the 'true' value of C, even when unconstrained is set
126 /// \param[in] offset whether to train with offset/bias parameter or not
127 /// \param[in] unconstrained when a C-value is given via setParameter, should it be piped through the exp-function before using it in the solver?
128 /// \param[in] budgetSize size of the budget/model that the final solution will have. Note that it might be smaller though.
129 /// \param[in] budgetMaintenanceStrategy object that contains the logic for maintaining the budget size.
130 /// \param[in] epochs number of epochs the SGD solver should run. if zero is given, the size will be the max of 10*datasetsize or C*datasetsize
131 /// \param[in] preInitializationMethod the method to preinitialize the budget.
132 /// \param[in] minMargin the margin every vector has to obey. Usually this is 1.
133 ///
136 const LossType* loss,
137 double C,
138 bool offset,
139 bool unconstrained = false,
140 size_t budgetSize = 500,
142 size_t epochs = 1,
144 double minMargin = 1.0f
145 ): m_kernel(kernel)
146 , m_loss(loss)
147 , m_C(C)
148 , m_offset(offset)
149 , m_unconstrained(unconstrained)
155 {
156
157 // check that the maintenance strategy is not null.
158 SHARK_RUNTIME_CHECK(m_budgetMaintenanceStrategy, "Budget maintenance strategy must not be NULL!");
159 SHARK_RUNTIME_CHECK(m_kernel, "Kernel must not be NULL!");
160 SHARK_RUNTIME_CHECK(m_loss, "Loss must not be NULL!");
161 }
162
163
164 /// get budget size
165 /// \return budget size
166 ///
167 size_t budgetSize() const
168 {
169 return m_budgetSize;
170 }
171
172
173 /// set budget size
174 /// \param[in] budgetSize size of budget.
175 ///
176 void setBudgetSize(std::size_t budgetSize)
177 {
179 }
180
181
182 /// return pointer to the budget maintenance strategy
183 /// \return pointer to the budget maintenance strategy.
184 ///
189
190
191 /// set budget maintenance strategy
192 /// \param[in] budgetMaintenanceStrategy set strategy to given object.
193 ///
198
199
200 /// return min margin
201 /// \return current min margin
202 ///
203 double minMargin() const
204 {
205 return m_minMargin;
206 }
207
208
209 /// set min margin
210 /// \param[in] minMargin new min margin.
211 ///
213 {
215 }
216
217
218 /// \brief From INameable: return the class name.
219 std::string name() const
220 {
221 return "KernelBudgetedSGDTrainer";
222 }
223
224
225 /// Train routine.
226 /// \param[in] classifier classifier object for the final solution.
227 /// \param[in] dataset dataset to work with.
228 ///
230 {
231
232 std::size_t ell = dataset.numberOfElements();
233 std::size_t classes = numberOfClasses(dataset);
234
235 // is the budget size larger than reasonable?
236 if(m_budgetSize > ell)
237 {
238 // in this case we just set the budgetSize to the given dataset size, so basically
239 // there is an infinite budget.
240 m_budgetSize = ell;
241 }
242
243 // we always need one budget vector more than the user specified,
244 // as we first have to add any new vector to the budget before applying
245 // the maintenance strategy. an alternative would be to keep the budget size
246 // correct and test explicitely for the new support vector, but that would
247 // create even more hassle on the other side. or one could use a vector of
248 // budget vectors instead, but loose the nice framework of kernel expansions.
249 // so the last budget vector must always have zero alpha coefficients in
250 // the final model. (we do not check for that but roughly assume that in
251 // the strategies, e.g. by putting the new vector to the last position in the
252 // merge strategy).
254
255 // easy access
256 UIntVector y = createBatch(dataset.labels().elements());
257
258 // create a preinitialized budget.
259 // this is used to initialize the kernelexpansion, we will work with.
260 LabeledData<InputType, unsigned int> preinitializedBudgetVectors(m_budgetSize, dataset.element(0));
261
262 // preinit the vectors first
263 // we still preinit even for no preinit, as we need the vectors in the
264 // constructor of the kernelexpansion. the alphas will be set to zero for none.
266 {
267 for(size_t j = 0; j < m_budgetSize; j++)
268 {
269 // choose a random vector
270 std::size_t b = random::discrete(random::globalRng, std::size_t(0), ell - 1);
271
272 // copy over the vector
273 preinitializedBudgetVectors.element(j) = dataset.element(b);
274 }
275 }
276
277 /*
278 // TODO: kmeans initialization
279 if (m_preInitializationMethod == KMEANS) {
280 // the negative examples individually. the number of clusters should
281 // then follow the ratio of the classes. then we can set the alphas easily.
282 // TODO: do this multiclass
283 // TODO: maybe Kmedoid makes more sense because of the alphas.
284 // TODO: allow for different maxiters
285 Centroids centroids;
286 size_t maxIterations = 50;
287 kMeans (dataset.inputs(), m_budgetSize, centroids, maxIterations);
288
289 // copy over to our budget
290 Data<RealVector> const& c = centroids.centroids();
291
292 for (size_t j = 0; j < m_budgetSize; j++) {
293 preinitializedBudgetVectors.inputs().element (j) = c.element (j);
294 preinitializedBudgetVectors.labels().element (j) = 1; //FIXME
295 }
296 }
297 */
298
299 // budget is a kernel expansion in its own right
300 ModelType &budgetModel = classifier.decisionFunction();
301 RealMatrix &budgetAlpha = budgetModel.alpha();
302 budgetModel.setStructure(m_kernel, preinitializedBudgetVectors.inputs(), m_offset, classes);
303
304
305 // variables
306 const double lambda = 1.0 / (ell * m_C);
307 std::size_t iterations;
308
309
310 // set epoch number
311 if(m_epochs == 0)
312 iterations = std::max(10 * ell, std::size_t (std::ceil(m_C * ell)));
313 else
314 iterations = m_epochs * ell;
315
316
317 // set the initial alphas (we do this here, after the array has been initialized by setStructure)
319 {
320 for(size_t j = 0; j < m_budgetSize; j++)
321 {
322 size_t c = preinitializedBudgetVectors.labels().element(j);
323 budgetAlpha(j, c) = 1 / (1 + lambda);
324 budgetAlpha(j, (c + 1) % classes) = -1 / (1 + lambda);
325 }
326 }
327
328
329 // whatever strategy we did use-- the last budget vector needs
330 // to be zeroed out, either it was zero anyway (none preinit)
331 // or it is the extra budget vector we need for technical reasons
332 row(budgetAlpha, m_budgetSize - 1) *= 0;
333
334
335 // preinitialize everything to prevent costly memory allocations in the loop
336 RealVector predictions(classes, 0.0);
337 RealVector derivative(classes, 0.0);
338
339
340 // SGD loop
341 std::size_t b = 0;
342
343 for(std::size_t iter = 0; iter < iterations; iter++)
344 {
345 // active variable
346 b = random::discrete(random::globalRng, std::size_t(0), ell - 1);
347
348 // for smaller datasets instead of choosing randomly a sample
349 // permuting the dataset can be a valid strategy. We do not implement
350 // that here.
351
352 // compute prediction within the budgeted model
353 // this will compute the predictions for all classes in one step
354 budgetModel.eval(dataset.inputs().element(b), predictions);
355
356 // now we follow the crammer-singer model as written
357 // in paper (p. 11 top), we compute the scores of the true
358 // class and the runner-up class. for the latter we remove
359 // our true prediction temporarily and redo the argmax.
360
361 RealVector predictionsCopy = predictions;
362 unsigned int trueClass = y[b];
363 double scoreOfTrueClass = predictions[trueClass];
364 predictions[trueClass] = -std::numeric_limits<double>::infinity();
365 unsigned int runnerupClass = (unsigned int)arg_max(predictions);
366 double scoreOfRunnerupClass = predictions[runnerupClass];
367
368 SHARK_ASSERT(trueClass != runnerupClass);
369
370 // scale alphas
371 budgetModel.alpha() *= ((long double)(1.0 - 1.0 / (iter + 1.0)));
372
373 // check if there is a margin violation
374 if(scoreOfTrueClass - scoreOfRunnerupClass < m_minMargin)
375 {
376 // TODO: check if the current vector is already part of our budget
377
378 // as we do not use the predictions anymore, we use them to push the new alpha values
379 // to the budgeted model
380 predictions.clear();
381
382 // set the alpha values (see p 11, beta_t^{(i)} formula in wang, crammer, vucetic)
383 // alpha of true class
384 predictions[trueClass] = 1.0 / ((long double)(iter + 1.0) * lambda);
385
386 // alpha of runnerup class
387 predictions[runnerupClass] = -1.0 / ((long double)(iter + 1.0) * lambda);
388
389 m_budgetMaintenanceStrategy->addToModel(budgetModel, predictions, dataset.element(b));
390 }
391 }
392
393 // finally we need to get rid of zero supportvectors.
394 budgetModel.sparsify();
395
396 }
397
398 /// Return the number of training epochs.
399 /// A value of 0 indicates that the default of max(10, C) should be used.
400 std::size_t epochs() const
401 {
402 return m_epochs;
403 }
404
405 /// Set the number of training epochs.
406 /// A value of 0 indicates that the default of max(10, C) should be used.
407 void setEpochs(std::size_t value)
408 {
409 m_epochs = value;
410 }
411
412 /// get the kernel function
414 {
415 return m_kernel;
416 }
417 /// get the kernel function
418 const KernelType *kernel() const
419 {
420 return m_kernel;
421 }
422 /// set the kernel function
424 {
426 }
427
428 /// check whether the parameter C is represented as log(C), thus,
429 /// in a form suitable for unconstrained optimization, in the
430 /// parameter vector
431 bool isUnconstrained() const
432 {
433 return m_unconstrained;
434 }
435
436 /// return the value of the regularization parameter
437 double C() const
438 {
439 return m_C;
440 }
441
442 /// set the value of the regularization parameter (must be positive)
443 void setC(double value)
444 {
445 RANGE_CHECK(value > 0.0);
446 m_C = value;
447 }
448
449 /// check whether the model to be trained should include an offset term
450 bool trainOffset() const
451 {
452 return m_offset;
453 }
454
455 ///\brief Returns the vector of hyper-parameters.
456 RealVector parameterVector() const {
458 return m_kernel->parameterVector() | log(m_C);
459 else
460 return m_kernel->parameterVector() | m_C;
461 }
462
463 ///\brief Sets the vector of hyper-parameters.
464 void setParameterVector(RealVector const &newParameters)
465 {
466 size_t kp = m_kernel->numberOfParameters();
467 SHARK_ASSERT(newParameters.size() == kp + 1);
468 m_kernel->setParameterVector(subrange(newParameters,0,kp));
469 m_C = newParameters.back();
470 if(m_unconstrained) m_C = exp(m_C);
471 }
472
473 ///\brief Returns the number of hyper-parameters.
474 size_t numberOfParameters() const
475 {
476 return m_kernel->numberOfParameters() + 1;
477 }
478
479protected:
480 KernelType *m_kernel; ///< pointer to kernel function
481 const LossType *m_loss; ///< pointer to loss function
482 double m_C; ///< regularization parameter
483 bool m_offset; ///< should the resulting model have an offset term?
484 bool m_unconstrained; ///< should C be stored as log(C) as a parameter?
485
486 // budget size
487 std::size_t m_budgetSize;
488
489 // budget maintenance strategy
491
492 std::size_t m_epochs; ///< number of training epochs (sweeps over the data), or 0 for default = max(10, C)
493
494 // method to preinitialize budget
496
497 // needed margin below which we update the model, also called beta sometimes
499};
500
501}
502#endif