//===========================================================================
/*!
 *
 * \brief       Quadratic programming solver for linear SVM training without bias
 *
 *
 * \author      T. Glasmachers
 * \date        -
 *
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <https://shark-ml.github.io/Shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================


#ifndef SHARK_ALGORITHMS_QP_QPBOXLINEAR_H
#define SHARK_ALGORITHMS_QP_QPBOXLINEAR_H

#include <shark/Core/Timer.h>
#include <shark/Algorithms/QP/QuadraticProgram.h> // QpStoppingCondition, QpSolutionProperties
#include <shark/Data/Dataset.h>
#include <shark/Data/DataView.h>
#include <shark/LinAlg/Base.h>
#include <algorithm> // std::shuffle, std::min, std::max
#include <cmath>
#include <iostream>
#include <vector>


namespace shark {


///
/// \brief Quadratic program solver for box-constrained problems with linear kernel
///
/// \par
/// The QpBoxLinear class is a decomposition-based solver for linear
/// support vector machine classifiers trained with the hinge loss.
/// Its optimization strategy is largely based on the paper<br>
/// "A Dual Coordinate Descent Method for Large-scale Linear SVM"
/// by Hsieh, Chang, Lin, Keerthi, and Sundararajan, ICML 2008.
/// The present algorithm differs from that method in that it learns
/// per-variable preferences as a replacement for working set selection;
/// the same adaptive mechanism also takes the place of the shrinking
/// heuristic.
///
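/// \par Example
/// A minimal usage sketch; the dataset construction and the parameter values
/// below are illustrative only and not part of this header:
/// \code
/// LabeledData<RealVector, unsigned int> data = ...;             // binary labels 0/1
/// QpBoxLinear<RealVector> solver(data, inputDimension(data));
/// QpStoppingCondition stop;
/// stop.minAccuracy = 1e-3;
/// QpSolutionProperties prop;
/// solver.solve(1.0, 0.0, stop, &prop);                          // bound = C = 1, hinge loss
/// RealVector const& w = solver.solutionWeightVector();
/// \endcode
///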
template <class InputT>
class QpBoxLinear
{
public:
    typedef LabeledData<InputT, unsigned int> DatasetType;

    ///
    /// \brief Constructor
    ///
    /// \param dataset  training data
    /// \param dim      problem dimension
    ///
    QpBoxLinear(const DatasetType& dataset, std::size_t dim)
    : m_data(dataset)
    , m_dim(dim)
    , m_xSquared(m_data.size())
    , m_alpha(m_data.size(),0.0)
    , m_weights(m_dim,0.0)
    , m_pref(m_data.size(),1.0)
    , m_offset(0)
    {
        SHARK_ASSERT(dim > 0);

        // pre-compute squared norms
        for (std::size_t i=0; i<m_data.size(); i++)
        {
            auto const& x_i = m_data[i];
            m_xSquared(i) = norm_sqr(x_i.input);
        }
    }

    /// \brief Set the offset (bias) term entering the gradient computation.
    void setOffset(double newOffset){
        m_offset = newOffset;
    }

    /// \brief Gradient of the objective with respect to the offset term, \f$ \sum_i y_i \alpha_i \f$.
    double offsetGradient()const{
        double result = 0;
        for(std::size_t i = 0; i != m_data.size(); ++i){
            double y_i = (m_data[i].label > 0) ? +1.0 : -1.0;
            result += m_alpha(i) * y_i;
        }
        return result;
    }

    /// \brief Weight vector of the current solution.
    RealVector const& solutionWeightVector()const{
        return m_weights;
    }

    ///
    /// \brief Solve the SVM training problem.
    ///
    /// \param bound    upper bound for the alpha components; complexity parameter C of the hinge loss SVM
    /// \param reg      coefficient of the penalty term \f$ -\frac{reg}{2} \cdot \|\alpha\|^2 \f$; reg = 1/C, where C is the complexity parameter of the squared hinge loss SVM
    /// \param stop     stopping condition(s)
    /// \param prop     solution properties
    /// \param verbose  if true, the solver prints status information and solution statistics
    ///
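    /// \par
    /// A sketch of how the two loss variants described above map onto the
    /// parameters; the value used to disable the box constraint is illustrative:
    /// \code
    /// solver.solve(C, 0.0, stop, &prop);          // hinge loss SVM: 0 <= alpha_i <= C, no quadratic penalty
    /// solver.solve(1e100, 1.0 / C, stop, &prop);  // squared hinge loss SVM: reg = 1/C, bound large enough to never bind
    /// \endcode
    ///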
    void solve(
        double bound,
        double reg,
        QpStoppingCondition& stop,
        QpSolutionProperties* prop = NULL,
        bool verbose = false
    ){
        // sanity checks
        SHARK_ASSERT(bound > 0.0);
        SHARK_ASSERT(reg >= 0.0);

        // measure training time
        Timer timer;

        // prepare dimensions and vectors
        std::size_t ell = m_data.size();
        double prefsum = sum(m_pref);   // normalization constant for m_pref
        std::vector<std::size_t> schedule(ell);

        // prepare counters
        std::size_t epoch = 0;
        std::size_t steps = 0;

        // prepare performance monitoring for self-adaptation
        double max_violation = 0.0;
        const double gain_learning_rate = 1.0 / ell;
        double average_gain = 0.0;
        bool canstop = true;

        // outer optimization loop
        while (true)
        {
            // define the schedule: each variable is scheduled with a frequency
            // roughly proportional to its current preference
            double psum = prefsum;
            prefsum = 0.0;
            std::size_t pos = 0;
            for (std::size_t i=0; i<ell; i++)
            {
                double p = m_pref[i];
                double num = (psum < 1e-6) ? ell - pos : std::min((double)(ell - pos), (ell - pos) * p / psum);
                std::size_t n = (std::size_t)std::floor(num);
                double prob = num - n;
                if (random::coinToss(random::globalRng,prob)) n++;
                for (std::size_t j=0; j<n; j++)
                {
                    schedule[pos] = i;
                    pos++;
                }
                psum -= p;
                prefsum += p;
            }
            SHARK_ASSERT(pos == ell);
            std::shuffle(schedule.begin(),schedule.end(),random::globalRng);

            // inner loop
            max_violation = 0.0;
            for (std::size_t j=0; j<ell; j++)
            {
                // active variable
                std::size_t i = schedule[j];
                auto const& e_i = m_data[i];
                double y_i = (e_i.label > 0) ? +1.0 : -1.0;

                // compute gradient and projected gradient
                double a = m_alpha(i);
                double wyx = y_i * inner_prod(m_weights, e_i.input);
                double g = 1.0 - m_offset * y_i - wyx - reg * a;
                double pg = (a == 0.0 && g < 0.0) ? 0.0 : (a == bound && g > 0.0 ? 0.0 : g);

                // update maximal KKT violation over the epoch
                max_violation = std::max(max_violation, std::abs(pg));
                double gain = 0.0;

                // perform the step
                if (pg != 0.0)
                {
                    // SMO-style coordinate descent step
                    double q = m_xSquared(i) + reg;
                    double mu = g / q;
                    double new_a = a + mu;

                    // numerically stable update
                    if (new_a <= 0.0)
                    {
                        mu = -a;
                        new_a = 0.0;
                    }
                    else if (new_a >= bound)
                    {
                        mu = bound - a;
                        new_a = bound;
                    }

                    // update both representations of the weight vector: m_alpha and m_weights
                    m_alpha(i) = new_a;
                    noalias(m_weights) += (mu * y_i) * e_i.input;
                    gain = mu * (g - 0.5 * q * mu);

                    steps++;
                }

                // update gain-based preferences
                {
                    if (epoch == 0) average_gain += gain / (double)ell;
                    else
                    {
                        // strategy constants
                        constexpr double CHANGE_RATE = 0.2;
                        constexpr double PREF_MIN = 0.05;
                        constexpr double PREF_MAX = 20.0;

                        double change = CHANGE_RATE * (gain / average_gain - 1.0);
                        double newpref = std::min(PREF_MAX, std::max(PREF_MIN, m_pref(i) * std::exp(change)));
                        prefsum += newpref - m_pref(i);
                        m_pref[i] = newpref;
                        average_gain = (1.0 - gain_learning_rate) * average_gain + gain_learning_rate * gain;
                    }
                }
            }

            epoch++;

            // stopping criteria
            if (stop.maxIterations > 0 && ell * epoch >= stop.maxIterations)
            {
                if (prop != NULL) prop->type = QpMaxIterationsReached;
                break;
            }

            if (timer.stop() >= stop.maxSeconds)
            {
                if (prop != NULL) prop->type = QpTimeout;
                break;
            }

            if (max_violation < stop.minAccuracy)
            {
                if (verbose) std::cout << "#" << std::flush;
                if (canstop)
                {
                    if (prop != NULL) prop->type = QpAccuracyReached;
                    break;
                }
                else
                {
                    // prepare a full sweep for a reliable check of the stopping criterion
                    canstop = true;
                    for (std::size_t i=0; i<ell; i++) m_pref[i] = 1.0;
                    prefsum = (double)ell;
                }
            }
            else
            {
                if (verbose) std::cout << "." << std::flush;
                canstop = false;
            }
        }

        timer.stop();

        // compute solution statistics
        std::size_t free_SV = 0;
        std::size_t bounded_SV = 0;
        double objective = -0.5 * norm_sqr(m_weights);
        for (std::size_t i=0; i<ell; i++)
        {
            double a = m_alpha(i);
            if (a > 0.0)
            {
                objective += a;
                objective -= reg/2.0 * a * a;
                if (a == bound) bounded_SV++;
                else free_SV++;
            }
        }

        // return solution statistics
        if (prop != NULL)
        {
            prop->accuracy = max_violation;   // this is approximate, but a good guess
            prop->iterations = ell * epoch;
            prop->value = objective;
            prop->seconds = timer.lastLap();
        }

        // output solution statistics
        if (verbose)
        {
            std::cout << std::endl;
            std::cout << "training time (seconds): " << timer.lastLap() << std::endl;
            std::cout << "number of epochs: " << epoch << std::endl;
            std::cout << "number of iterations: " << (ell * epoch) << std::endl;
            std::cout << "number of non-zero steps: " << steps << std::endl;
            std::cout << "dual accuracy: " << max_violation << std::endl;
            std::cout << "dual objective value: " << objective << std::endl;
            std::cout << "number of free support vectors: " << free_SV << std::endl;
            std::cout << "number of bounded support vectors: " << bounded_SV << std::endl;
        }
    }

protected:
    DataView<const DatasetType> m_data;   ///< view on the training data
    std::size_t m_dim;                    ///< input space dimension
    RealVector m_xSquared;                ///< diagonal entries of the quadratic matrix
    RealVector m_alpha;                   ///< storage of the alpha values for warm start
    RealVector m_weights;                 ///< storage of the weight vector for warm start
    RealVector m_pref;                    ///< measure of success of individual steps
    double m_offset;                      ///< offset (bias) term of the decision function, set via setOffset()
};


//~ template < >
//~ class QpBoxLinear<CompressedRealVector>
//~ {
//~ public:
    //~ typedef LabeledData<CompressedRealVector, unsigned int> DatasetType;

    //~ ///
    //~ /// \brief Constructor
    //~ ///
    //~ /// \param dataset  training data
    //~ /// \param dim      problem dimension
    //~ ///
    //~ QpBoxLinear(const DatasetType& dataset, std::size_t dim)
    //~ : x(dataset.numberOfElements())
    //~ , y(dataset.numberOfElements())
    //~ , diagonal(dataset.numberOfElements())
    //~ , m_dim(dim)
    //~ {
        //~ SHARK_ASSERT(dim > 0);

        //~ // transform ublas sparse vectors into a fast format
        //~ // (yes, ublas is slow...), and compute the diagonal
        //~ // elements of the quadratic matrix
        //~ SparseVector sparse;
        //~ for (std::size_t b=0, j=0; b<dataset.numberOfBatches(); b++)
        //~ {
            //~ DatasetType::const_batch_reference batch = dataset.batch(b);
            //~ for (std::size_t i=0; i<batch.size(); i++)
            //~ {
                //~ auto const& x_i = shark::get(batch, i).input;
                //~ // if (x_i.nnz() == 0) continue;

                //~ unsigned int y_i = shark::get(batch, i).label;
                //~ y[j] = 2.0 * y_i - 1.0;
                //~ double d = 0.0;
                //~ for (auto it=x_i.begin(); it != x_i.end(); ++it)
                //~ {
                    //~ double v = *it;
                    //~ sparse.index = it.index();
                    //~ sparse.value = v;
                    //~ storage.push_back(sparse);
                    //~ d += v * v;
                //~ }
                //~ sparse.index = (std::size_t)-1;
                //~ storage.push_back(sparse);
                //~ diagonal(j) = d;
                //~ j++;
            //~ }
        //~ }
        //~ for (std::size_t b=0, j=0, k=0; b<dataset.numberOfBatches(); b++)
        //~ {
            //~ DatasetType::const_batch_reference batch = dataset.batch(b);
            //~ for (std::size_t i=0; i<batch.size(); i++)
            //~ {
                //~ auto const& x_i = shark::get(batch, i).input;
                //~ // if (x_i.nnz() == 0) continue;

                //~ x[j] = &storage[k];   // cannot be done in the first loop because of vector reallocation
                //~ for (; storage[k].index != (std::size_t)-1; k++);
                //~ k++;
                //~ j++;
            //~ }
        //~ }
    //~ }

    //~ ///
    //~ /// \brief Solve the SVM training problem.
    //~ ///
    //~ /// \param bound    upper bound for the alpha components; complexity parameter C of the hinge loss SVM
    //~ /// \param reg      coefficient of the penalty term \f$ -\frac{reg}{2} \cdot \|\alpha\|^2 \f$; reg = 1/C, where C is the complexity parameter of the squared hinge loss SVM
    //~ /// \param stop     stopping condition(s)
    //~ /// \param prop     solution properties
    //~ /// \param verbose  if true, the solver prints status information and solution statistics
    //~ ///
    //~ RealVector solve(
        //~ double bound,
        //~ double reg,
        //~ QpStoppingCondition& stop,
        //~ QpSolutionProperties* prop = NULL,
        //~ bool verbose = false)
    //~ {
        //~ // sanity checks
        //~ SHARK_ASSERT(bound > 0.0);
        //~ SHARK_ASSERT(reg >= 0.0);

        //~ // measure training time
        //~ Timer timer;
        //~ timer.start();

        //~ // prepare dimensions and vectors
        //~ std::size_t ell = x.size();
        //~ RealVector m_alpha(ell, 0.0);
        //~ RealVector m_weights(m_dim, 0.0);
        //~ RealVector m_pref(ell, 1.0);   // measure of success of individual steps
        //~ double prefsum = ell;          // normalization constant
        //~ std::vector<std::size_t> schedule(ell);

        //~ // prepare counters
        //~ std::size_t epoch = 0;
        //~ std::size_t steps = 0;

        //~ // prepare performance monitoring for self-adaptation
        //~ double max_violation = 0.0;
        //~ const double gain_learning_rate = 1.0 / ell;
        //~ double average_gain = 0.0;
        //~ bool canstop = true;

        //~ // outer optimization loop
        //~ while (true)
        //~ {
            //~ // define schedule
            //~ double psum = prefsum;
            //~ prefsum = 0.0;
            //~ std::size_t pos = 0;
            //~ for (std::size_t i=0; i<ell; i++)
            //~ {
                //~ double p = m_pref[i];
                //~ double num = (psum < 1e-6) ? ell - pos : std::min((double)(ell - pos), (ell - pos) * p / psum);
                //~ std::size_t n = (std::size_t)std::floor(num);
                //~ double prob = num - n;
                //~ if (random::uni() < prob) n++;
                //~ for (std::size_t j=0; j<n; j++)
                //~ {
                    //~ schedule[pos] = i;
                    //~ pos++;
                //~ }
                //~ psum -= p;
                //~ prefsum += p;
            //~ }
            //~ SHARK_ASSERT(pos == ell);
            //~ for (std::size_t i=0; i<ell; i++) std::swap(schedule[i], schedule[random::discrete(0, ell - 1)]);

            //~ // inner loop
            //~ max_violation = 0.0;
            //~ for (std::size_t j=0; j<ell; j++)
            //~ {
                //~ // active variable
                //~ std::size_t i = schedule[j];
                //~ const SparseVector* x_i = x[i];

                //~ // compute gradient and projected gradient
                //~ double a = m_alpha(i);
                //~ double wyx = y(i) * inner_prod(m_weights, x_i);
                //~ double g = 1.0 - wyx - reg * a;
                //~ double pg = (a == 0.0 && g < 0.0) ? 0.0 : (a == bound && g > 0.0 ? 0.0 : g);

                //~ // update maximal KKT violation over the epoch
                //~ max_violation = std::max(max_violation, std::abs(pg));
                //~ double gain = 0.0;

                //~ // perform the step
                //~ if (pg != 0.0)
                //~ {
                    //~ // SMO-style coordinate descent step
                    //~ double q = diagonal(i) + reg;
                    //~ double mu = g / q;
                    //~ double new_a = a + mu;

                    //~ // numerically stable update
                    //~ if (new_a <= 0.0)
                    //~ {
                        //~ mu = -a;
                        //~ new_a = 0.0;
                    //~ }
                    //~ else if (new_a >= bound)
                    //~ {
                        //~ mu = bound - a;
                        //~ new_a = bound;
                    //~ }

                    //~ // update both representations of the weight vector: m_alpha and m_weights
                    //~ m_alpha(i) = new_a;
                    //~ // m_weights += (mu * y(i)) * x_i;
                    //~ axpy(m_weights, mu * y(i), x_i);
                    //~ gain = mu * (g - 0.5 * q * mu);

                    //~ steps++;
                //~ }

                //~ // update gain-based preferences
                //~ {
                    //~ if (epoch == 0) average_gain += gain / (double)ell;
                    //~ else
                    //~ {
                        //~ // strategy constants
                        //~ constexpr double CHANGE_RATE = 0.2;
                        //~ constexpr double PREF_MIN = 0.05;
                        //~ constexpr double PREF_MAX = 20.0;

                        //~ double change = CHANGE_RATE * (gain / average_gain - 1.0);
                        //~ double newpref = std::min(PREF_MAX, std::max(PREF_MIN, m_pref(i) * std::exp(change)));
                        //~ prefsum += newpref - m_pref(i);
                        //~ m_pref[i] = newpref;
                        //~ average_gain = (1.0 - gain_learning_rate) * average_gain + gain_learning_rate * gain;
                    //~ }
                //~ }
            //~ }

            //~ epoch++;

            //~ // stopping criteria
            //~ if (stop.maxIterations > 0 && ell * epoch >= stop.maxIterations)
            //~ {
                //~ if (prop != NULL) prop->type = QpMaxIterationsReached;
                //~ break;
            //~ }

            //~ if (timer.stop() >= stop.maxSeconds)
            //~ {
                //~ if (prop != NULL) prop->type = QpTimeout;
                //~ break;
            //~ }

            //~ if (max_violation < stop.minAccuracy)
            //~ {
                //~ if (verbose) std::cout << "#" << std::flush;
                //~ if (canstop)
                //~ {
                    //~ if (prop != NULL) prop->type = QpAccuracyReached;
                    //~ break;
                //~ }
                //~ else
                //~ {
                    //~ // prepare full sweep for a reliable checking of the stopping criterion
                    //~ canstop = true;
                    //~ for (std::size_t i=0; i<ell; i++) m_pref[i] = 1.0;
                    //~ prefsum = ell;
                //~ }
            //~ }
            //~ else
            //~ {
                //~ if (verbose) std::cout << "." << std::flush;
                //~ canstop = false;
            //~ }
        //~ }

        //~ timer.stop();

        //~ // compute solution statistics
        //~ std::size_t free_SV = 0;
        //~ std::size_t bounded_SV = 0;
        //~ double objective = -0.5 * shark::blas::inner_prod(m_weights, m_weights);
        //~ for (std::size_t i=0; i<ell; i++)
        //~ {
            //~ double a = m_alpha(i);
            //~ if (a > 0.0)
            //~ {
                //~ objective += a;
                //~ objective -= reg/2.0 * a * a;
                //~ if (a == bound) bounded_SV++;
                //~ else free_SV++;
            //~ }
        //~ }

        //~ // return solution statistics
        //~ if (prop != NULL)
        //~ {
            //~ prop->accuracy = max_violation;   // this is approximate, but a good guess
            //~ prop->iterations = ell * epoch;
            //~ prop->value = objective;
            //~ prop->seconds = timer.lastLap();
        //~ }

        //~ // output solution statistics
        //~ if (verbose)
        //~ {
            //~ std::cout << std::endl;
            //~ std::cout << "training time (seconds): " << timer.lastLap() << std::endl;
            //~ std::cout << "number of epochs: " << epoch << std::endl;
            //~ std::cout << "number of iterations: " << (ell * epoch) << std::endl;
            //~ std::cout << "number of non-zero steps: " << steps << std::endl;
            //~ std::cout << "dual accuracy: " << max_violation << std::endl;
            //~ std::cout << "dual objective value: " << objective << std::endl;
            //~ std::cout << "number of free support vectors: " << free_SV << std::endl;
            //~ std::cout << "number of bounded support vectors: " << bounded_SV << std::endl;
        //~ }

        //~ // return the solution
        //~ return m_weights;
    //~ }

//~ protected:
    //~ /// \brief Data structure for sparse vectors.
    //~ struct SparseVector
    //~ {
        //~ std::size_t index;
        //~ double value;
    //~ };

    //~ /// \brief Famous "axpy" product, here adding a multiple of a sparse vector to a dense one.
    //~ static inline void axpy(RealVector& m_weights, double m_alpha, const SparseVector* xi)
    //~ {
        //~ while (true)
        //~ {
            //~ if (xi->index == (std::size_t)-1) return;
            //~ m_weights[xi->index] += m_alpha * xi->value;
            //~ xi++;
        //~ }
    //~ }

    //~ /// \brief Inner product between a dense and a sparse vector.
    //~ static inline double inner_prod(RealVector const& m_weights, const SparseVector* xi)
    //~ {
        //~ double ret = 0.0;
        //~ while (true)
        //~ {
            //~ if (xi->index == (std::size_t)-1) return ret;
            //~ ret += m_weights[xi->index] * xi->value;
            //~ xi++;
        //~ }
    //~ }

    //~ std::vector<SparseVector> storage;   ///< storage for sparse vectors
    //~ std::vector<SparseVector*> x;        ///< sparse vectors
    //~ RealVector y;                        ///< +1/-1 labels
    //~ RealVector diagonal;                 ///< diagonal entries of the quadratic matrix
    //~ std::size_t m_dim;                   ///< input space dimension
//~ };


}
#endif