Pegasos.h
//===========================================================================
/*!
 *
 *
 * \brief Pegasos solvers for linear SVMs
 *
 *
 *
 * \author T. Glasmachers
 * \date 2012
 *
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <https://shark-ml.github.io/Shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================


#ifndef SHARK_ALGORITHMS_PEGASOS_H
#define SHARK_ALGORITHMS_PEGASOS_H


#include <shark/LinAlg/Base.h>
#include <shark/Data/Dataset.h>
#include <shark/Core/Random.h>
#include <cmath>
#include <iostream>


namespace shark {

///
/// \brief Pegasos solver for linear (binary) support vector machines.
///
template <class VectorType>
class Pegasos
{
public:
	/// \brief Solve the primal SVM problem.
	///
	/// In addition to "standard" Pegasos, this solver checks a
	/// meaningful stopping criterion.
	///
	/// The function returns the number of model predictions made
	/// during training (this is comparable to the number of SMO iterations).
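	///
	/// A minimal usage sketch (illustrative only, not part of the original
	/// documentation; the training set is assumed to be loaded elsewhere,
	/// e.g. via importCSV):
	/// \code
	/// LabeledData<RealVector, unsigned int> training;   // assumed: binary labels 0 and 1
	/// RealVector w(inputDimension(training));            // weight vector; solve() zeroes it internally
	/// double C = 1.0;                                     // regularization parameter
	/// std::size_t predictions = Pegasos<RealVector>::solve(training, C, w);
	/// // w now holds the rescaled approximate minimizer of the primal problem
	/// \endcode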
	template <class WeightType>
	static std::size_t solve(
		LabeledData<VectorType, unsigned int> const& data,  ///< training data
		double C,                                            ///< SVM regularization parameter
		WeightType& w,                                       ///< weight vector
		std::size_t batchsize = 1,                           ///< number of samples in each gradient estimate
		double varepsilon = 0.001)                           ///< solution accuracy (factor by which the primal gradient should be reduced)
	{
		std::size_t ell = data.numberOfElements();
		double lambda = 1.0 / (ell * C);
		SHARK_ASSERT(batchsize > 0);

		double initialPrimal = 1.0;
		double normbound2 = initialPrimal / lambda;   // upper bound for |sigma * w|^2
		double norm_w2 = 0.0;                         // squared norm of w
		double sigma = 1.0;                           // scaling factor for w
		VectorType gradient(w.size());                // gradient (to be computed in each iteration)
		w = RealZeroVector(w.size());                 // clear does not work on matrix rows (ublas sucks!)

		// pegasos main loop
		std::size_t start = 10;
		std::size_t checkinterval = (2 * ell) / batchsize;
		std::size_t nextcheck = start + ell / batchsize;
		std::size_t predictions = 0;
		for (std::size_t t=start; ; t++)
		{
			// check the stopping criterion: \|gradient\| < epsilon ?
			if (t >= nextcheck)
			{
				// compute the gradient
				gradient = (lambda * sigma * (double)ell) * w;
				for (std::size_t i=0; i<ell; i++)
				{
					VectorType const& x = data(i).input;
					unsigned int y = data(i).label;
					double f = sigma * inner_prod(w, x);
					lg(x, y, f, gradient);
				}
				predictions += ell;

				// compute the norm of the gradient
				double n2 = inner_prod(gradient, gradient);
				double n = std::sqrt(n2) / (double)ell;

				// check the stopping criterion
				if (n < varepsilon)
				{
					// std::cout << "target accuracy reached." << std::endl;
					// std::cout << "accuracy: " << n << std::endl;
					// std::cout << "predictions: " << predictions << std::endl;
					break;
				}

				nextcheck = t + checkinterval;
			}

			// compute the gradient
			gradient.clear();
			bool nonzero = false;   // set below if any sample yields a non-zero loss gradient
			for (unsigned int i=0; i<batchsize; i++)
			{
				// select the active variable (sample with replacement)
				std::size_t active = random::discrete(0, ell-1);
				VectorType const& x = data(active).input;
				unsigned int y = data(active).label;
				SHARK_ASSERT(y < 2);

				// compute the prediction
				double f = sigma * inner_prod(w, x);
				predictions++;

				// compute the loss gradient
				if (lg(x, y, f, gradient)) nonzero = true;
			}

			// update
			sigma *= (1.0 - 1.0 / (double)t);
			if (nonzero)
			{
				double eta = 1.0 / (sigma * lambda * t * batchsize);
				gradient *= eta;
				norm_w2 += inner_prod(gradient, gradient) - 2.0 * inner_prod(w, gradient);
				noalias(w) -= gradient;

				// project to the ball
				double n2 = sigma * sigma * norm_w2;
				if (n2 > normbound2) sigma *= std::sqrt(normbound2 / n2);
			}
		}

		// rescale the solution
		w *= sigma;
		return predictions;
	}

protected:
	// Gradient of the hinge loss for a single example;
	// returns true iff the gradient is non-zero.
	static bool lg(
		VectorType const& x,
		unsigned int y,
		double f,
		VectorType& gradient)
	{
		if (y == 0)
		{
			if (f > -1.0)
			{
				gradient += x;
				return true;
			}
		}
		else if (y == 1)
		{
			if (f < 1.0)
			{
				gradient -= x;
				return true;
			}
		}
		return false;
	}
};


///
/// \brief Pegasos solver for linear multi-class support vector machines.
///
template <class VectorType>
class McPegasos
{
public:
	/// \brief Multi-class margin type.
	enum eMarginType
	{
		emRelative,
		emAbsolute,
	};

	/// \brief Multi-class loss function type.
	enum eLossType
	{
		elNaiveHinge,
		elDiscriminativeMax,
		elDiscriminativeSum,
		elTotalMax,
		elTotalSum,
	};

	/// \brief Solve the primal multi-class SVM problem.
	///
	/// In addition to "standard" Pegasos, this solver checks a
	/// meaningful stopping criterion.
	///
	/// The function returns the number of model predictions made
	/// during training (this is comparable to the number of SMO iterations).
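	///
	/// A minimal usage sketch (illustrative only, not part of the original
	/// documentation; the training set is assumed to be loaded elsewhere):
	/// \code
	/// LabeledData<RealVector, unsigned int> training;   // assumed: labels 0, ..., classes-1
	/// std::size_t classes = numberOfClasses(training);
	/// std::vector<RealVector> w(classes, RealVector(inputDimension(training)));
	/// std::size_t predictions = McPegasos<RealVector>::solve(
	///     training,
	///     McPegasos<RealVector>::emRelative,           // relative margin and ...
	///     McPegasos<RealVector>::elDiscriminativeSum,  // ... sum loss: the "WW case" in the dispatch below
	///     false,                                       // do not enforce the sum-to-zero constraint
	///     1.0,                                         // regularization parameter C
	///     w);
	/// \endcode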
	template <class WeightType>
	static std::size_t solve(
		LabeledData<VectorType, unsigned int> const& data,  ///< training data
		eMarginType margintype,                              ///< margin function type
		eLossType losstype,                                  ///< loss function type
		bool sumToZero,                                      ///< enforce the sum-to-zero constraint?
		double C,                                            ///< SVM regularization parameter
		std::vector<WeightType>& w,                          ///< class-wise weight vectors
		std::size_t batchsize = 1,                           ///< number of samples in each gradient estimate
		double varepsilon = 0.001)                           ///< solution accuracy (factor by which the primal gradient should be reduced)
	{
		SHARK_ASSERT(batchsize > 0);
		std::size_t ell = data.numberOfElements();
		unsigned int classes = w.size();
		SHARK_ASSERT(classes >= 2);
		double lambda = 1.0 / (ell * C);

		double initialPrimal = -1.0;
		LossGradientFunction lg = NULL;
		if (margintype == emRelative)
		{
			if (losstype == elDiscriminativeMax || losstype == elTotalMax)
			{
				// CS case
				initialPrimal = 1.0;
				lg = lossGradientRDM;
			}
			else if (losstype == elDiscriminativeSum || losstype == elTotalSum)
			{
				// WW case
				initialPrimal = classes - 1.0;
				lg = lossGradientRDS;
			}
		}
		else if (margintype == emAbsolute)
		{
			if (losstype == elNaiveHinge)
			{
				// MMR case
				initialPrimal = 1.0;
				lg = lossGradientANH;
			}
			else if (losstype == elDiscriminativeMax)
			{
				// ADM case
				initialPrimal = 1.0;
				lg = lossGradientADM;
			}
			else if (losstype == elDiscriminativeSum)
			{
				// LLW case
				initialPrimal = classes - 1.0;
				lg = lossGradientADS;
			}
			else if (losstype == elTotalMax)
			{
				// ATM case
				initialPrimal = 1.0;
				lg = lossGradientATM;
			}
			else if (losstype == elTotalSum)
			{
				// ATS/OVA case
				initialPrimal = classes;
				lg = lossGradientATS;
			}
		}
		SHARK_RUNTIME_CHECK(initialPrimal > 0 && lg, "The combination of margin and loss is not implemented");

		double normbound2 = initialPrimal / lambda;   // upper bound for |sigma * w|^2
		double norm_w2 = 0.0;                         // squared norm of w
		double sigma = 1.0;                           // scaling factor for w
		double target = initialPrimal * varepsilon;   // target gradient norm
		std::vector<VectorType> gradient(classes);    // gradient (to be computed in each iteration)
		RealVector f(classes);                        // machine prediction (computed for each example)
		for (unsigned int c=0; c<classes; c++)
		{
			gradient[c].resize(w[c].size());
			w[c] = RealZeroVector(w[c].size());
		}

		// pegasos main loop
		std::size_t start = 10;
		std::size_t checkinterval = (2 * ell) / batchsize;
		std::size_t nextcheck = start + ell / batchsize;
		std::size_t predictions = 0;
		for (std::size_t t=start; ; t++)
		{
			// check the stopping criterion: \|gradient\| < epsilon ?
			if (t >= nextcheck)
			{
				// compute the gradient
				for (unsigned int c=0; c<classes; c++) gradient[c] = (lambda * sigma * (double)ell) * w[c];
				for (std::size_t i=0; i<ell; i++)
				{
					VectorType const& x = data(i).input;
					unsigned int y = data(i).label;
					for (unsigned int c=0; c<classes; c++) f(c) = sigma * inner_prod(w[c], x);
					lg(x, y, f, gradient, sumToZero);
				}
				predictions += ell;

				// compute the norm of the gradient
				double n2 = 0.0;
				for (unsigned int c=0; c<classes; c++) n2 += inner_prod(gradient[c], gradient[c]);
				double n = std::sqrt(n2) / (double)ell;

				// check the stopping criterion
				if (n < target)
				{
					// std::cout << "target accuracy reached." << std::endl;
					// std::cout << "accuracy: " << n << std::endl;
					// std::cout << "predictions: " << predictions << std::endl;
					break;
				}

				nextcheck = t + checkinterval;
			}

			// compute the gradient
			for (unsigned int c=0; c<classes; c++) gradient[c].clear();
			bool nonzero = false;   // set below if any sample yields a non-zero loss gradient
			for (unsigned int i=0; i<batchsize; i++)
			{
				// select the active variable (sample with replacement)
				std::size_t active = random::discrete(0, ell-1);
				VectorType const& x = data(active).input;
				unsigned int y = data(active).label;
				SHARK_ASSERT(y < classes);

				// compute the prediction
				for (unsigned int c=0; c<classes; c++) f(c) = sigma * inner_prod(w[c], x);
				predictions++;

				// compute the loss gradient
				if (lg(x, y, f, gradient, sumToZero)) nonzero = true;
			}

			// update
			sigma *= (1.0 - 1.0 / (double)t);
			if (nonzero)
			{
				double eta = 1.0 / (sigma * lambda * t * batchsize);
				for (unsigned int c=0; c<classes; c++)
				{
					gradient[c] *= eta;
					norm_w2 += inner_prod(gradient[c], gradient[c]) - 2.0 * inner_prod(w[c], gradient[c]);
					noalias(w[c]) -= gradient[c];
				}

				// project to the ball
				double n2 = sigma * sigma * norm_w2;
				if (n2 > normbound2) sigma *= std::sqrt(normbound2 / n2);
			}
		}

		// rescale the solution
		for (unsigned int c=0; c<classes; c++) w[c] *= sigma;
		return predictions;
	}

protected:
	// Function type for the computation of the gradient
	// of the loss. A return value of true indicates that
	// the gradient is non-zero.
	typedef bool(*LossGradientFunction)(VectorType const&, unsigned int, RealVector const&, std::vector<VectorType>&, bool);

	// absolute margin, naive hinge loss
	static bool lossGradientANH(
		VectorType const& x,
		unsigned int y,
		RealVector const& f,
		std::vector<VectorType>& gradient,
		bool sumToZero)
	{
		if (f(y) < 1.0)
		{
			gradient[y] -= x;
			if (sumToZero)
			{
				VectorType xx = (1.0 / (f.size() - 1.0)) * x;
				for (std::size_t c=0; c<f.size(); c++) if (c != y) gradient[c] += xx;
			}
			return true;
		}
		else return false;
	}

	// relative margin, max loss
	static bool lossGradientRDM(
		VectorType const& x,
		unsigned int y,
		RealVector const& f,
		std::vector<VectorType>& gradient,
		bool sumToZero)
	{
		unsigned int argmax = 0;
		double max = -1e100;
		for (std::size_t c=0; c<f.size(); c++)
		{
			if (c != y && f(c) > max)
			{
				max = f(c);
				argmax = c;
			}
		}
		if (f(y) < 1.0 + max)
		{
			gradient[y] -= x;
			gradient[argmax] += x;
			return true;
		}
		else return false;
	}

	// relative margin, sum loss
	static bool lossGradientRDS(
		VectorType const& x,
		unsigned int y,
		RealVector const& f,
		std::vector<VectorType>& gradient,
		bool sumToZero)
	{
		bool nonzero = false;
		for (std::size_t c=0; c<f.size(); c++)
		{
			if (c != y && f(y) < 1.0 + f(c))
			{
				gradient[y] -= x;
				gradient[c] += x;
				nonzero = true;
			}
		}
		return nonzero;
	}

	// absolute margin, discriminative sum loss
	static bool lossGradientADS(
		VectorType const& x,
		unsigned int y,
		RealVector const& f,
		std::vector<VectorType>& gradient,
		bool sumToZero)
	{
		bool nonzero = false;
		for (std::size_t c=0; c<f.size(); c++)
		{
			if (c != y && f(c) > -1.0)
			{
				gradient[c] += x;
				nonzero = true;
			}
		}
		if (sumToZero && nonzero)
		{
			VectorType mean = gradient[0];
			for (std::size_t c=1; c<f.size(); c++) mean += gradient[c];
			mean /= f.size();
			for (std::size_t c=0; c<f.size(); c++) gradient[c] -= mean;
		}
		return nonzero;
	}

	// absolute margin, discriminative max loss
	static bool lossGradientADM(
		VectorType const& x,
		unsigned int y,
		RealVector const& f,
		std::vector<VectorType>& gradient,
		bool sumToZero)
	{
		double max = -1e100;
		std::size_t argmax = 0;
		for (std::size_t c=0; c<f.size(); c++)
		{
			if (c == y) continue;
			if (f(c) > max)
			{
				max = f(c);
				argmax = c;
			}
		}
		if (max > -1.0)
		{
			gradient[argmax] += x;
			if (sumToZero)
			{
				VectorType xx = (1.0 / (f.size() - 1.0)) * x;
				for (std::size_t c=0; c<f.size(); c++) if (c != argmax) gradient[c] -= xx;
			}
			return true;
		}
		else return false;
	}

	// absolute margin, total sum loss
	static bool lossGradientATS(
		VectorType const& x,
		unsigned int y,
		RealVector const& f,
		std::vector<VectorType>& gradient,
		bool sumToZero)
	{
		bool nonzero = false;
		for (std::size_t c=0; c<f.size(); c++)
		{
			if (c == y)
			{
				if (f(c) < 1.0)
				{
					gradient[c] -= x;
					nonzero = true;
				}
			}
			else
			{
				if (f(c) > -1.0)
				{
					gradient[c] += x;
					nonzero = true;
				}
			}
		}
		if (sumToZero && nonzero)
		{
			VectorType mean = gradient[0];
			for (std::size_t c=1; c<f.size(); c++) mean += gradient[c];
			mean /= f.size();
			for (std::size_t c=0; c<f.size(); c++) gradient[c] -= mean;
		}
		return nonzero;
	}

	// absolute margin, total max loss
	static bool lossGradientATM(
		VectorType const& x,
		unsigned int y,
		RealVector const& f,
		std::vector<VectorType>& gradient,
		bool sumToZero)
	{
		double max = -1e100;
		std::size_t argmax = 0;
		for (std::size_t c=0; c<f.size(); c++)
		{
			if (c == y)
			{
				if (-f(c) > max)
				{
					max = -f(c);
					argmax = c;
				}
			}
			else
			{
				if (f(c) > max)
				{
					max = f(c);
					argmax = c;
				}
			}
		}
		if (max > -1.0)
		{
			if (argmax == y)
			{
				gradient[argmax] -= x;
				if (sumToZero)
				{
					VectorType xx = (1.0 / (f.size() - 1.0)) * x;
					for (std::size_t c=0; c<f.size(); c++) if (c != argmax) gradient[c] += xx;
				}
			}
			else
			{
				gradient[argmax] += x;
				if (sumToZero)
				{
					VectorType xx = (1.0 / (f.size() - 1.0)) * x;
					for (std::size_t c=0; c<f.size(); c++) if (c != argmax) gradient[c] -= xx;
				}
			}
			return true;
		}
		else return false;
	}
};


}
#endif