CSvmTrainer.h
#ifndef SHARK_ALGORITHMS_CSVMTRAINER_H
#define SHARK_ALGORITHMS_CSVMTRAINER_H


// Includes (reconstructed from the types used in this file; the exact list in the Shark sources may differ):
#include <shark/Algorithms/Trainers/AbstractWeightedTrainer.h>
#include <shark/Algorithms/Trainers/AbstractSvmTrainer.h>
#include <shark/Algorithms/QP/BoxConstrainedProblems.h>
#include <shark/Algorithms/QP/SvmProblems.h>
#include <shark/Algorithms/QP/QpBoxLinear.h>

//for MCSVMs!
#include <shark/Algorithms/QP/QpMcBoxDecomp.h>
#include <shark/Algorithms/QP/QpMcSimplexDecomp.h>
#include <shark/Algorithms/QP/QpMcLinear.h>
//~ #include <shark/Algorithms/Trainers/McSvm/McSvmMMRTrainer.h>
//~ #include <shark/Algorithms/Trainers/McSvm/McReinforcedSvmTrainer.h>

namespace shark {


/// \brief Multi-class SVM formulations supported by the SVM trainers below.
enum class McSvm{
    WW,
    CS,
    LLW,
    ATM,
    ATS,
    ADM,
    OVA,
    MMR,
    ReinforcedSvm
};


///
/// \brief Training of C-SVMs for binary classification.
///
/// The C-SVM is the "standard" support vector machine for
/// binary (two-class) classification. Given are data tuples
/// \f$ (x_i, y_i) \f$ with x-component denoting input and
/// y-component denoting the label +1 or -1 (see the tutorial on
/// label conventions; the implementation uses values 0/1),
/// a kernel function k(x, x') and a regularization
/// constant C > 0. Let H denote the kernel induced
/// reproducing kernel Hilbert space of k, and let \f$ \phi \f$
/// denote the corresponding feature map.
/// Then the SVM classifier is the function
/// \f[
///     h(x) = \mathop{sign} (f(x))
/// \f]
/// \f[
///     f(x) = \langle w, \phi(x) \rangle + b
/// \f]
/// with coefficients w and b given by the (primal)
/// optimization problem
/// \f[
///     \min \frac{1}{2} \|w\|^2 + C \sum_i L(y_i, f(x_i)),
/// \f]
/// where \f$ L(y, f(x)) = \max\{0, 1 - y \cdot f(x)\} \f$
/// denotes the hinge loss.
///
/// For details refer to the paper:<br/>
/// <p>Support-Vector Networks. Corinna Cortes and Vladimir Vapnik,
/// Machine Learning, vol. 20 (1995), pp. 273-297.</p>
/// or simply to the Wikipedia article:<br/>
/// http://en.wikipedia.org/wiki/Support_vector_machine
/// \ingroup supervised_trainer
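///
/// Usage sketch (names are illustrative; assumes a labeled data set
/// LabeledData<RealVector, unsigned int> data with labels 0/1):
/// \code
/// GaussianRbfKernel<> kernel(0.5);                      // kernel function k(x,x')
/// KernelClassifier<RealVector> svm;                     // model to be trained
/// CSvmTrainer<RealVector> trainer(&kernel, 1.0, true);  // C = 1, with offset term b
/// trainer.train(svm, data);
/// \endcode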
template <class InputType, class CacheType = float>
class CSvmTrainer : public AbstractSvmTrainer<
    InputType, unsigned int,
    KernelClassifier<InputType>,
    AbstractWeightedTrainer<KernelClassifier<InputType> >
>
{
private:
    typedef AbstractSvmTrainer<
        InputType, unsigned int,
        KernelClassifier<InputType>,
        AbstractWeightedTrainer<KernelClassifier<InputType> >
    > base_type;
public:

    /// \brief Convenience typedefs:
    /// this and many of the below typedefs build on the class template type CacheType.
    /// Simply changing that one template parameter CacheType thus allows one to flexibly
    /// switch between using float or double as the type for caching the kernel values.
    /// The default is float, offering sufficient accuracy in the vast majority
    /// of cases, at a memory cost of only four bytes. However, the template
    /// parameter makes it easy to use double instead (e.g., in case high
    /// accuracy training is needed).
    typedef CacheType QpFloatType;

    typedef AbstractKernelFunction<InputType> KernelType;

    //! Constructor
    //! \param  kernel         kernel function to use for training and prediction
    //! \param  C              regularization parameter - always the 'true' value of C, even when unconstrained is set
    //! \param  offset         whether to train the svm with offset term
    //! \param  unconstrained  when a C-value is given via setParameter, should it be piped through the exp-function before using it in the solver?
    CSvmTrainer(KernelType* kernel, double C, bool offset, bool unconstrained = false)
    : base_type(kernel, C, offset, unconstrained), m_computeDerivative(false), m_McSvmType(McSvm::WW) //make Vapnik happy!
    { }

    //! Constructor
    //! \param  kernel         kernel function to use for training and prediction
    //! \param  negativeC      regularization parameter of the negative class (label 0)
    //! \param  positiveC      regularization parameter of the positive class (label 1)
    //! \param  offset         whether to train the svm with offset term
    //! \param  unconstrained  when a C-value is given via setParameter, should it be piped through the exp-function before using it in the solver?
    CSvmTrainer(KernelType* kernel, double negativeC, double positiveC, bool offset, bool unconstrained = false)
    : base_type(kernel,negativeC, positiveC, offset, unconstrained), m_computeDerivative(false), m_McSvmType(McSvm::WW) //make Vapnik happy!
    { }

    /// \brief From INameable: return the class name.
    std::string name() const
    { return "CSvmTrainer"; }

    void setComputeBinaryDerivative(bool compute){
        m_computeDerivative = compute;
    }

    /// \brief Sets the type of the multi-class SVM used.
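    /// (For example, setMcSvmType(McSvm::CS) selects the Crammer-Singer formulation;
    /// the constructors default to McSvm::WW, the Weston-Watkins formulation.)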
    void setMcSvmType(McSvm type){
        m_McSvmType = type;
    }


    /// \brief Train the C-SVM.
    void train(KernelClassifier<InputType>& svm, LabeledData<InputType, unsigned int> const& dataset)
    {
        std::size_t classes = numberOfClasses(dataset);
        std::size_t ell = dataset.numberOfElements();
        if(classes == 2){
            // prepare model

            auto& f = svm.decisionFunction();
            if (f.basis() == dataset.inputs() && f.kernel() == base_type::m_kernel && f.alpha().size1() == ell && f.alpha().size2() == 1) {
                // warm start, keep the alphas (possibly clipped)
                if (this->m_trainOffset) f.offset() = RealVector(1);
            }
            else {
                f.setStructure(base_type::m_kernel, dataset.inputs(), this->m_trainOffset);
            }

            //dispatch to use the optimal implementation and solve the problem
            trainBinary(f,dataset);

            if (base_type::sparsify())
                f.sparsify();
            return;
        }

        //special case OVA
        if(m_McSvmType == McSvm::OVA){
            trainOVA(svm,dataset);
            return;
        }

        //general multiclass case: find correct dual formulation
        bool sumToZero = false;
        bool simplex = false;
        QpSparseArray<QpFloatType> nu;
        QpSparseArray<QpFloatType> M;

        switch (m_McSvmType){
            case McSvm::WW:
                sumToZero = false;
                simplex = false;
                setupMcParametersWWCS(nu,M, classes);
                break;
            case McSvm::CS:
                sumToZero = false;
                simplex = true;
                setupMcParametersWWCS(nu,M, classes);
                break;
            case McSvm::LLW:
                sumToZero = true;
                simplex = false;
                setupMcParametersADMLLW(nu,M, classes);
                break;
            case McSvm::ATM:
                sumToZero = true;
                simplex = true;
                setupMcParametersATMATS(nu,M, classes);
                break;
            case McSvm::ATS:
                sumToZero = true;
                simplex = false;
                setupMcParametersATMATS(nu,M, classes);
                break;
            case McSvm::ADM:
                sumToZero = true;
                simplex = true;
                setupMcParametersADMLLW(nu,M, classes);
                break;
            case McSvm::ReinforcedSvm:
                sumToZero = false;
                simplex = false;
                setupMcParametersATMATS(nu,M, classes);
                break;
            case McSvm::MMR:
                sumToZero = true;
                simplex = true;
                setupMcParametersMMR(nu,M, classes);
                break;
            case McSvm::OVA: // handle OVA in the switch statement to silence compiler warnings
                break;
        }

        //setup linear part
        RealMatrix linear(ell,M.width(),1.0);
        if(m_McSvmType == McSvm::ReinforcedSvm){
            auto const& labels = dataset.labels();
            std::size_t i=0;
            for(unsigned int y: labels.elements()){
                linear(i, y) = classes - 1.0; // self-margin target value of reinforced SVM loss
                i++;
            }
        }

        //solve dual
        RealMatrix alpha(ell,M.width(),0.0);
        RealVector bias(classes,0.0);
        if(simplex)
            solveMcSimplex(sumToZero,nu,M,linear,alpha,bias,dataset);
        else
            solveMcBox(sumToZero,nu,M,linear,alpha,bias,dataset);


        // write the solution into the model
        svm.decisionFunction().setStructure(this->m_kernel,dataset.inputs(),this->m_trainOffset,classes);

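        // The dual solution alpha stores one variable per (example, dual constraint) pair;
        // the sparse coefficient matrix nu maps these back to one coefficient per class,
        // yielding the per-class expansion f_c(x) = sum_i alpha(i,c) k(x_i,x) (+ b_c).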
        for (std::size_t i=0; i<ell; i++)
        {
            unsigned int y = dataset.element(i).label;
            for (std::size_t c=0; c<classes; c++)
            {
                double sum = 0.0;
                std::size_t r = alpha.size2() * y;
                for (std::size_t p=0; p != alpha.size2(); p++, r++)
                    sum += nu(r, c) * alpha(i, p);
                svm.decisionFunction().alpha(i,c) = sum;
            }
        }

        if (this->m_trainOffset)
            svm.decisionFunction().offset() = bias;

        if (this->sparsify())
            svm.decisionFunction().sparsify();
    }

    /// \brief Train the C-SVM using weights.
    void train(KernelClassifier<InputType>& svm, WeightedLabeledData<InputType, unsigned int> const& dataset){
        SHARK_RUNTIME_CHECK(numberOfClasses(dataset) == 2, "CSVM with weights is only implemented for binary problems");
        // prepare model
        std::size_t n = dataset.numberOfElements();
        auto& f = svm.decisionFunction();
        if (f.basis() == dataset.inputs() && f.kernel() == base_type::m_kernel && f.alpha().size1() == n && f.alpha().size2() == 1) {
            // warm start, keep the alphas
            if (this->m_trainOffset) f.offset() = RealVector(1);
            else f.offset() = RealVector();
        }
        else {
            f.setStructure(base_type::m_kernel, dataset.inputs(), this->m_trainOffset);
        }

        //dispatch to use the optimal implementation and solve the problem
        trainBinary(f, dataset);

        if (base_type::sparsify()) f.sparsify();
    }

    RealVector const& get_db_dParams()const{
        return m_db_dParams;
    }

private:

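    // Both solveMcSimplex and solveMcBox solve the multi-class dual either on a fully
    // precomputed kernel Gram matrix or on a cache-backed kernel matrix of the configured
    // cache size, depending on the precomputeKernel() setting.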
    void solveMcSimplex(
        bool sumToZero, QpSparseArray<QpFloatType> const& nu, QpSparseArray<QpFloatType> const& M, RealMatrix const& linear,
        RealMatrix& alpha, RealVector& bias,
        LabeledData<InputType, unsigned int> const& dataset
    ){
        typedef KernelMatrix<InputType, QpFloatType> KernelMatrixType;
        typedef CachedMatrix< KernelMatrixType > CachedMatrixType;
        typedef PrecomputedMatrix< KernelMatrixType > PrecomputedMatrixType;

        KernelMatrixType km(*base_type::m_kernel, dataset.inputs());
        // solve the problem
        if (base_type::precomputeKernel())
        {
            PrecomputedMatrixType matrix(&km);
            QpMcSimplexDecomp< PrecomputedMatrixType> problem(matrix, M, dataset.labels(), linear, this->C());
            QpSolutionProperties& prop = base_type::m_solutionproperties;
            problem.setShrinking(base_type::m_shrinking);
            if(this->m_trainOffset){
                BiasSolverSimplex<PrecomputedMatrixType> biasSolver(&problem);
                biasSolver.solve(bias,base_type::m_stoppingcondition,nu,sumToZero, &prop);
            }
            else{
                QpSolver<QpMcSimplexDecomp< PrecomputedMatrixType> > solver(problem);
                solver.solve( base_type::m_stoppingcondition, &prop);
            }
            alpha = problem.solution();
        }
        else
        {
            CachedMatrixType matrix(&km, base_type::m_cacheSize);
            QpMcSimplexDecomp< CachedMatrixType> problem(matrix, M, dataset.labels(), linear, this->C());
            QpSolutionProperties& prop = base_type::m_solutionproperties;
            problem.setShrinking(base_type::m_shrinking);
            if(this->m_trainOffset){
                BiasSolverSimplex<CachedMatrixType> biasSolver(&problem);
                biasSolver.solve(bias,base_type::m_stoppingcondition,nu,sumToZero, &prop);
            }
            else{
                QpSolver<QpMcSimplexDecomp< CachedMatrixType> > solver(problem);
                solver.solve( base_type::m_stoppingcondition, &prop);
            }
            alpha = problem.solution();
        }
        base_type::m_accessCount = km.getAccessCount();
    }

    void solveMcBox(
        bool sumToZero, QpSparseArray<QpFloatType> const& nu, QpSparseArray<QpFloatType> const& M, RealMatrix const& linear,
        RealMatrix& alpha, RealVector& bias,
        LabeledData<InputType, unsigned int> const& dataset
    ){
        typedef KernelMatrix<InputType, QpFloatType> KernelMatrixType;
        typedef CachedMatrix< KernelMatrixType > CachedMatrixType;
        typedef PrecomputedMatrix< KernelMatrixType > PrecomputedMatrixType;

        KernelMatrixType km(*base_type::m_kernel, dataset.inputs());
        // solve the problem
        if (base_type::precomputeKernel())
        {
            PrecomputedMatrixType matrix(&km);
            QpMcBoxDecomp< PrecomputedMatrixType> problem(matrix, M, dataset.labels(), linear, this->C());
            QpSolutionProperties& prop = base_type::m_solutionproperties;
            problem.setShrinking(base_type::m_shrinking);
            if(this->m_trainOffset){
                BiasSolver<PrecomputedMatrixType> biasSolver(&problem);
                biasSolver.solve(bias,base_type::m_stoppingcondition,nu, sumToZero, &prop);
            }
            else{
                QpSolver<QpMcBoxDecomp< PrecomputedMatrixType> > solver(problem);
                solver.solve( base_type::m_stoppingcondition, &prop);
            }
            alpha = problem.solution();
        }
        else
        {
            CachedMatrixType matrix(&km, base_type::m_cacheSize);
            QpMcBoxDecomp< CachedMatrixType> problem(matrix, M, dataset.labels(), linear, this->C());
            QpSolutionProperties& prop = base_type::m_solutionproperties;
            problem.setShrinking(base_type::m_shrinking);
            if(this->m_trainOffset){
                BiasSolver<CachedMatrixType> biasSolver(&problem);
                biasSolver.solve(bias,base_type::m_stoppingcondition,nu, sumToZero, &prop);
            }
            else{
                QpSolver<QpMcBoxDecomp< CachedMatrixType> > solver(problem);
                solver.solve( base_type::m_stoppingcondition, &prop);
            }
            alpha = problem.solution();
        }
        base_type::m_accessCount = km.getAccessCount();
    }

    template<class Trainer>
    void trainMc(KernelClassifier<InputType>& svm, LabeledData<InputType, unsigned int> const& dataset){
        Trainer trainer(base_type::m_kernel,this->C(),this->m_trainOffset);
        trainer.stoppingCondition() = this->stoppingCondition();
        trainer.precomputeKernel() = this->precomputeKernel();
        trainer.sparsify() = this->sparsify();
        trainer.shrinking() = this->shrinking();
        trainer.s2do() = this->s2do();
        trainer.verbosity() = this->verbosity();
        trainer.setCacheSize(this->cacheSize());
        trainer.train(svm,dataset);
        this->solutionProperties() = trainer.solutionProperties();
        base_type::m_accessCount = trainer.accessCount();
    }

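    // The setupMcParameters* helpers build the two sparse coefficient arrays that define
    // the chosen multi-class dual: nu maps dual variables to per-class coefficients (it is
    // also used above to recombine the solution), while M encodes the interactions between
    // these coefficients and enters the quadratic term of the decomposition problem.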
    void setupMcParametersWWCS(QpSparseArray<QpFloatType>& nu, QpSparseArray<QpFloatType>& M, std::size_t classes)const{
        nu.resize(classes * (classes-1), classes, 2*classes*(classes-1));
        for (unsigned int r=0, y=0; y<classes; y++)
        {
            for (unsigned int p=0, pp=0; p<classes-1; p++, pp++, r++)
            {
                if (pp == y) pp++;
                if (y < pp)
                {
                    nu.add(r, y, 0.5);
                    nu.add(r, pp, -0.5);
                }
                else
                {
                    nu.add(r, pp, -0.5);
                    nu.add(r, y, 0.5);
                }
            }
        }

        M.resize(classes * (classes-1) * classes, classes-1, 2 * classes * (classes-1) * (classes-1));
        for (unsigned int r=0, yv=0; yv<classes; yv++)
        {
            for (unsigned int pv=0, ppv=0; pv<classes-1; pv++, ppv++)
            {
                if (ppv == yv) ppv++;
                for (unsigned int yw=0; yw<classes; yw++, r++)
                {
                    QpFloatType baseM = (yv == yw ? (QpFloatType)0.25 : (QpFloatType)0.0) - (ppv == yw ? (QpFloatType)0.25 : (QpFloatType)0.0);
                    M.setDefaultValue(r, baseM);
                    if (yv == yw)
                    {
                        M.add(r, ppv - (ppv >= yw ? 1 : 0), baseM + (QpFloatType)0.25);
                    }
                    else if (ppv == yw)
                    {
                        M.add(r, yv - (yv >= yw ? 1 : 0), baseM - (QpFloatType)0.25);
                    }
                    else
                    {
                        unsigned int pw = ppv - (ppv >= yw ? 1 : 0);
                        unsigned int pw2 = yv - (yv >= yw ? 1 : 0);
                        if (pw < pw2)
                        {
                            M.add(r, pw, baseM + (QpFloatType)0.25);
                            M.add(r, pw2, baseM - (QpFloatType)0.25);
                        }
                        else
                        {
                            M.add(r, pw2, baseM - (QpFloatType)0.25);
                            M.add(r, pw, baseM + (QpFloatType)0.25);
                        }
                    }
                }
            }
        }
    }

    void setupMcParametersATMATS(QpSparseArray<QpFloatType>& nu, QpSparseArray<QpFloatType>& M, std::size_t classes)const{
        nu.resize(classes*classes, classes, classes*classes);
        for (unsigned int r=0, y=0; y<classes; y++)
        {
            for (unsigned int p=0; p<classes; p++, r++)
            {
                nu.add(r, p, (QpFloatType)((p == y) ? 1.0 : -1.0));
            }
        }

        M.resize(classes * classes * classes, classes, 2 * classes * classes * classes);
        QpFloatType c_ne = (QpFloatType)(-1.0 / (double)classes);
        QpFloatType c_eq = (QpFloatType)1.0 + c_ne;
        for (unsigned int r=0, yv=0; yv<classes; yv++)
        {
            for (unsigned int pv=0; pv<classes; pv++)
            {
                QpFloatType sign = QpFloatType((yv == pv) ? -1 : 1); //cast to keep MSVC happy...
                for (unsigned int yw=0; yw<classes; yw++, r++)
                {
                    M.setDefaultValue(r, sign * c_ne);
                    if (yw == pv)
                    {
                        M.add(r, pv, -sign * c_eq);
                    }
                    else
                    {
                        M.add(r, pv, sign * c_eq);
                        M.add(r, yw, -sign * c_ne);
                    }
                }
            }
        }
    }

    void setupMcParametersADMLLW(QpSparseArray<QpFloatType>& nu, QpSparseArray<QpFloatType>& M, std::size_t classes)const{
        nu.resize(classes * (classes-1), classes, classes*(classes-1));
        for (unsigned int r=0, y=0; y<classes; y++)
        {
            for (unsigned int p=0, pp=0; p<classes-1; p++, pp++, r++)
            {
                if (pp == y) pp++;
                nu.add(r, pp, (QpFloatType)-1.0);
            }
        }

        M.resize(classes * (classes-1) * classes, classes-1, classes * (classes-1) * (classes-1));
        QpFloatType mood = (QpFloatType)(-1.0 / (double)classes);
        QpFloatType val = (QpFloatType)1.0 + mood;
        for (unsigned int r=0, yv=0; yv<classes; yv++)
        {
            for (unsigned int pv=0, ppv=0; pv<classes-1; pv++, ppv++)
            {
                if (ppv == yv) ppv++;
                for (unsigned int yw=0; yw<classes; yw++, r++)
                {
                    M.setDefaultValue(r, mood);
                    if (ppv != yw)
                    {
                        unsigned int pw = ppv - (ppv > yw ? 1 : 0);
                        M.add(r, pw, val);
                    }
                }
            }
        }
    }

    void setupMcParametersMMR(QpSparseArray<QpFloatType>& nu, QpSparseArray<QpFloatType>& M, std::size_t classes)const{
        nu.resize(classes, classes, classes);
        for (unsigned int y=0; y<classes; y++)
            nu.add(y, y, 1.0);

        M.resize(classes * classes, 1, classes);
        QpFloatType mood = (QpFloatType)(-1.0 / (double)classes);
        QpFloatType val = (QpFloatType)1.0 + mood;
        for (unsigned int r=0, yv=0; yv<classes; yv++)
        {
            for (unsigned int yw=0; yw<classes; yw++, r++)
            {
                M.setDefaultValue(r, mood);
                if (yv == yw) M.add(r, 0, val);
            }
        }
    }

    void trainOVA(KernelClassifier<InputType>& svm, const LabeledData<InputType, unsigned int>& dataset){
        std::size_t classes = numberOfClasses(dataset);
        svm.decisionFunction().setStructure(this->m_kernel,dataset.inputs(),this->m_trainOffset,classes);

        // reset the aggregated solution properties before accumulating over the binary subproblems
        base_type::m_solutionproperties.accuracy = 0.0;
        base_type::m_solutionproperties.iterations = 0;
        base_type::m_solutionproperties.value = 0.0;
        base_type::m_solutionproperties.seconds = 0.0;
        for (unsigned int c=0; c<classes; c++)
        {
            LabeledData<InputType, unsigned int> bindata = oneVersusRestProblem(dataset, c);
            KernelClassifier<InputType> binsvm;
// TODO: maybe build the quadratic programs directly,
//       in order to profit from cached and
//       in particular from precomputed kernel
//       entries!
            CSvmTrainer<InputType, QpFloatType> bintrainer(base_type::m_kernel, this->C(),this->m_trainOffset);
            bintrainer.setCacheSize(this->cacheSize());
            bintrainer.sparsify() = false;
            bintrainer.stoppingCondition() = base_type::stoppingCondition();
            bintrainer.precomputeKernel() = base_type::precomputeKernel();  // sub-optimal!
            bintrainer.shrinking() = base_type::shrinking();
            bintrainer.s2do() = base_type::s2do();
            bintrainer.verbosity() = base_type::verbosity();
            bintrainer.train(binsvm, bindata);
            base_type::m_solutionproperties.iterations += bintrainer.solutionProperties().iterations;
            base_type::m_solutionproperties.seconds += bintrainer.solutionProperties().seconds;
            base_type::m_solutionproperties.accuracy = std::max(base_type::solutionProperties().accuracy, bintrainer.solutionProperties().accuracy);
            column(svm.decisionFunction().alpha(), c) = column(binsvm.decisionFunction().alpha(), 0);
            if (this->m_trainOffset)
                svm.decisionFunction().offset(c) = binsvm.decisionFunction().offset(0);
            base_type::m_accessCount += bintrainer.accessCount();
        }

        if (base_type::sparsify())
            svm.decisionFunction().sparsify();
    }

    //by default the normal unoptimized kernel matrix is used
    template<class T, class DatasetTypeT>
    void trainBinary(KernelExpansion<T>& svm, DatasetTypeT const& dataset){
        KernelMatrix<T, QpFloatType> km(*base_type::m_kernel, dataset.inputs());
        trainBinary(km,svm,dataset);
    }

    //in the case of a gaussian kernel and sparse vectors, we can use an optimized approach
    template<class T, class DatasetTypeT>
    void trainBinary(KernelExpansion<CompressedRealVector>& svm, DatasetTypeT const& dataset){
        //check whether a gaussian kernel is used
        typedef GaussianRbfKernel<CompressedRealVector> Gaussian;
        Gaussian const* kernel = dynamic_cast<Gaussian const*> (base_type::m_kernel);
        if(kernel != 0){ //yes, use the optimized kernel matrix
            GaussianKernelMatrix<CompressedRealVector,QpFloatType> km(kernel->gamma(),dataset.inputs());
            trainBinary(km,svm,dataset);
        }
        else{
            KernelMatrix<CompressedRealVector, QpFloatType> km(*base_type::m_kernel, dataset.inputs());
            trainBinary(km,svm,dataset);
        }
    }

    //create the problem for the unweighted datasets
    template<class Matrix, class T>
    void trainBinary(Matrix& km, KernelExpansion<T>& svm, LabeledData<T, unsigned int> const& dataset){
        if (base_type::precomputeKernel())
        {
            PrecomputedMatrix<Matrix> matrix(&km);
            CSVMProblem<PrecomputedMatrix<Matrix> > svmProblem(matrix,dataset.labels(),base_type::m_regularizers);
            optimize(svm,svmProblem,dataset);
        }
        else
        {
            CachedMatrix<Matrix> matrix(&km);
            CSVMProblem<CachedMatrix<Matrix> > svmProblem(matrix,dataset.labels(),base_type::m_regularizers);
            optimize(svm,svmProblem,dataset);
        }
        base_type::m_accessCount = km.getAccessCount();
    }

    // create the problem for the weighted datasets
    template<class Matrix, class T>
    void trainBinary(Matrix& km, KernelExpansion<T>& svm, WeightedLabeledData<T, unsigned int> const& dataset){
        if (base_type::precomputeKernel())
        {
            PrecomputedMatrix<Matrix> matrix(&km);
            GeneralQuadraticProblem<PrecomputedMatrix<Matrix> > svmProblem(
                matrix,dataset.labels(),dataset.weights(),base_type::m_regularizers
            );
            optimize(svm,svmProblem,dataset.data());
        }
        else
        {
            CachedMatrix<Matrix> matrix(&km);
            GeneralQuadraticProblem<CachedMatrix<Matrix> > svmProblem(
                matrix,dataset.labels(),dataset.weights(),base_type::m_regularizers
            );
            optimize(svm,svmProblem,dataset.data());
        }
        base_type::m_accessCount = km.getAccessCount();
    }

    template<class SVMProblemType>
    void optimize(KernelExpansion<InputType>& svm, SVMProblemType& svmProblem, LabeledData<InputType, unsigned int> const& dataset){
        if (this->m_trainOffset)
        {
            typedef SvmShrinkingProblem<SVMProblemType> ProblemType;
            ProblemType problem(svmProblem,base_type::m_shrinking);
            QpSolver< ProblemType > solver(problem);
            // truncate the existing solution to the bounds
            RealVector const& reg = this->regularizationParameters();
            double C_minus = reg(0);
            double C_plus = (reg.size() == 1) ? reg(0) : reg(1);
            std::size_t i=0;
            for (auto label : dataset.labels().elements()) {
                double a = svm.alpha()(i, 0);
                if (label == 0) a = std::max(std::min(a, 0.0), -C_minus);
                else a = std::min(std::max(a, 0.0), C_plus);
                svm.alpha()(i, 0) = a;
                i++;
            }
            problem.setInitialSolution(blas::column(svm.alpha(), 0));
            solver.solve(base_type::m_stoppingcondition, &base_type::m_solutionproperties);
            column(svm.alpha(),0) = problem.getUnpermutedAlpha();
            svm.offset(0) = computeBias(problem,dataset);
        }
        else
        {
            typedef BoxConstrainedShrinkingProblem<SVMProblemType> ProblemType;
            ProblemType problem(svmProblem,base_type::m_shrinking);
            QpSolver< ProblemType> solver(problem);
            // truncate the existing solution to the bounds
            RealVector const& reg = this->regularizationParameters();
            double C_minus = reg(0);
            double C_plus = (reg.size() == 1) ? reg(0) : reg(1);
            std::size_t i=0;
            for (auto label : dataset.labels().elements()) {
                double a = svm.alpha()(i, 0);
                if (label == 0) a = std::max(std::min(a, 0.0), -C_minus);
                else a = std::min(std::max(a, 0.0), C_plus);
                svm.alpha()(i, 0) = a;
                i++;
            }
            problem.setInitialSolution(blas::column(svm.alpha(), 0));
            solver.solve(base_type::m_stoppingcondition, &base_type::m_solutionproperties);
            column(svm.alpha(),0) = problem.getUnpermutedAlpha();
        }
    }

    RealVector m_db_dParams; ///< in the rare case that there are only bounded SVs and no free SVs, this will hold the derivative of b w.r.t. the hyperparameters. Derivative w.r.t. C is last.

    bool m_computeDerivative;
    McSvm m_McSvmType;

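    // computeBias derives the offset b from the KKT conditions of the solved dual:
    // if free (non-bounded) support vectors exist, b is the average of their gradients;
    // otherwise it is the midpoint between the largest gradient at the lower bound and
    // the smallest gradient at the upper bound, and, if requested, the derivative of b
    // w.r.t. the hyperparameters is computed from these two boundary points.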
    template<class Problem>
    double computeBias(Problem const& problem, LabeledData<InputType, unsigned int> const& dataset){
        std::size_t nkp = base_type::m_kernel->numberOfParameters();
        m_db_dParams.resize(nkp+1);
        m_db_dParams.clear();

        std::size_t ell = problem.dimensions();
        if (ell == 0) return 0.0;

        // compute the offset from the KKT conditions
        double lowerBound = -1e100;
        double upperBound = 1e100;
        double sum = 0.0;
        std::size_t freeVars = 0;
        std::size_t lower_i = 0;
        std::size_t upper_i = 0;
        for (std::size_t i=0; i<ell; i++)
        {
            double value = problem.gradient(i);
            if (problem.alpha(i) == problem.boxMin(i))
            {
                if (value > lowerBound) { //in case of no free SVs, we are looking for the largest gradient of all alphas at the lower bound
                    lowerBound = value;
                    lower_i = i;
                }
            }
            else if (problem.alpha(i) == problem.boxMax(i))
            {
                if (value < upperBound) { //in case of no free SVs, we are looking for the smallest gradient of all alphas at the upper bound
                    upperBound = value;
                    upper_i = i;
                }
            }
            else
            {
                sum += value;
                freeVars++;
            }
        }
        if (freeVars > 0)
            return sum / freeVars; //stabilized (averaged) exact value

        if(!m_computeDerivative)
            return 0.5 * (lowerBound + upperBound); //best estimate

        lower_i = problem.permutation(lower_i);
        upper_i = problem.permutation(upper_i);

        SHARK_RUNTIME_CHECK(base_type::m_regularizers.size() == 1, "derivative only implemented for SVM with one C" );

        // We next compute the derivative of lowerBound and upperBound wrt C, in order to then get that of b wrt C.
        // The equation at the foundation of this simply is g_i = y_i - \sum_j \alpha_j K_{ij} .
        double dlower_dC = 0.0;
        double dupper_dC = 0.0;
        // At the same time, we also compute the derivative of lowerBound and upperBound wrt the kernel parameters.
        // The equation at the foundation of this simply is g_i = y_i - \sum_j \alpha_j K_{ij} .
        RealVector dupper_dkernel( nkp,0 );
        RealVector dlower_dkernel( nkp,0 );
        //state for eval and evalDerivative of the kernel
        boost::shared_ptr<State> kernelState = base_type::m_kernel->createState();
        RealVector der(nkp ); //derivative storage helper
        //todo: O.K.: a kernel single-input derivative would be useful here;
        //it could also be useful to use real batch processing and use batches of size 1 for lower/upper
        //and whole batches instead of singleInput.
        //what we do is that we use the batched input versions with batches of size one.
        typename Batch<InputType>::type singleInput = Batch<InputType>::createBatch( dataset.element(0).input, 1 );
        typename Batch<InputType>::type lowerInput = Batch<InputType>::createBatch( dataset.element(lower_i).input, 1 );
        typename Batch<InputType>::type upperInput = Batch<InputType>::createBatch( dataset.element(upper_i).input, 1 );
        getBatchElement( lowerInput, 0 ) = dataset.element(lower_i).input; //copy the current input into the batch
        getBatchElement( upperInput, 0 ) = dataset.element(upper_i).input; //copy the current input into the batch
        RealMatrix one(1,1,1); //weight of input
        RealMatrix result(1,1); //stores the result of the call

        for (std::size_t i=0; i<ell; i++) {
            double cur_alpha = problem.alpha(problem.permutation(i));
            if ( cur_alpha != 0 ) {
                int cur_label = ( cur_alpha>0.0 ? 1 : -1 );
                getBatchElement( singleInput, 0 ) = dataset.element(i).input; //copy the current input into the batch
                // treat contributions of largest gradient at lower bound
                base_type::m_kernel->eval( lowerInput, singleInput, result, *kernelState );
                dlower_dC += cur_label * result(0,0);
                base_type::m_kernel->weightedParameterDerivative( lowerInput, singleInput, one, *kernelState, der );
                for ( std::size_t k=0; k<nkp; k++ ) {
                    dlower_dkernel(k) += cur_label * der(k);
                }
                // treat contributions of smallest gradient at upper bound
                base_type::m_kernel->eval( upperInput, singleInput, result, *kernelState );
                dupper_dC += cur_label * result(0,0);
                base_type::m_kernel->weightedParameterDerivative( upperInput, singleInput, one, *kernelState, der );
                for ( std::size_t k=0; k<nkp; k++ ) {
                    dupper_dkernel(k) += cur_label * der(k);
                }
            }
        }
        // assign final values to derivative of b wrt hyperparameters
        m_db_dParams( nkp ) = -0.5 * ( dlower_dC + dupper_dC );
        for ( std::size_t k=0; k<nkp; k++ ) {
            m_db_dParams(k) = -0.5 * this->C() * ( dlower_dkernel(k) + dupper_dkernel(k) );
        }
        if ( base_type::m_unconstrained ) {
            // chain rule: if C is encoded as exp(c), then db/dc = C * db/dC
            m_db_dParams( nkp ) *= this->C();
        }

        return 0.5 * (lowerBound + upperBound); //best estimate
    }
};


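///
/// \brief Training of linear C-SVMs for binary and multi-class classification.
///
/// Usage sketch (names are illustrative; assumes a labeled data set
/// LabeledData<RealVector, unsigned int> data):
/// \code
/// LinearClassifier<RealVector> model;
/// LinearCSvmTrainer<RealVector> trainer(1.0, false);  // C = 1, no offset term
/// trainer.train(model, data);
/// \endcode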
template <class InputType>
class LinearCSvmTrainer : public AbstractLinearSvmTrainer<InputType>
{
public:
    typedef AbstractLinearSvmTrainer<InputType> base_type;

    LinearCSvmTrainer(double C, bool offset, bool unconstrained = false)
    : AbstractLinearSvmTrainer<InputType>(C, offset, unconstrained){}

    /// \brief From INameable: return the class name.
    std::string name() const
    { return "LinearCSvmTrainer"; }

    /// \brief Sets the type of the multi-class SVM used.
    void setMcSvmType(McSvm type){
        m_McSvmType = type;
    }

    void train(LinearClassifier<InputType>& model, LabeledData<InputType, unsigned int> const& dataset)
    {
        std::size_t classes = numberOfClasses(dataset);
        if(classes == 2){
            trainBinary(model,dataset);
            return;
        }
        switch (m_McSvmType){
            case McSvm::WW:
                trainMc<QpMcLinearWW<InputType> >(model,dataset,classes);
                break;
            case McSvm::CS:
                trainMc<QpMcLinearCS<InputType> >(model,dataset,classes);
                break;
            case McSvm::LLW:
                trainMc<QpMcLinearLLW<InputType> >(model,dataset,classes);
                break;
            case McSvm::ATM:
                trainMc<QpMcLinearATM<InputType> >(model,dataset,classes);
                break;
            case McSvm::ATS:
                trainMc<QpMcLinearATS<InputType> >(model,dataset,classes);
                break;
            case McSvm::ADM:
                trainMc<QpMcLinearADM<InputType> >(model,dataset,classes);
                break;
            case McSvm::MMR:
                trainMc<QpMcLinearMMR<InputType> >(model,dataset,classes);
                break;
            case McSvm::ReinforcedSvm:
                trainMc<QpMcLinearReinforced<InputType> >(model,dataset,classes);
                break;
            case McSvm::OVA: //OVA is a special case and implemented here
                trainOVA(model,dataset,classes);
                break;
        }
    }
private:
    McSvm m_McSvmType;

    void trainBinary(LinearClassifier<InputType>& model, LabeledData<InputType, unsigned int> const& dataset)
    {
        std::size_t dim = inputDimension(dataset);
        QpBoxLinear<InputType> solver(dataset, dim);
        solver.solve(
            base_type::C(),
            0.0,
            QpConfig::stoppingCondition(),
            &QpConfig::solutionProperties(),
            QpConfig::verbosity() > 0);

        if(!this->trainOffset()){
            RealMatrix w(1, dim, 0.0);
            row(w,0) = solver.solutionWeightVector();
            model.decisionFunction().setStructure(w);
            return;
        }

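        // With an offset term, QpBoxLinear is re-solved for a sequence of fixed offsets:
        // the offset moves along the sign of its gradient, the step size grows while the
        // gradient keeps its sign and is halved when it flips, until the step size drops
        // below a fraction of the solver's accuracy threshold.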
        double offset = 0;
        double stepSize = 0.1;
        double grad = solver.offsetGradient();
        while(stepSize > 0.1*QpConfig::stoppingCondition().minAccuracy){
            offset += (grad < 0? -stepSize : stepSize);
            solver.setOffset(offset);
            solver.solve(
                base_type::C(),
                0.0,
                QpConfig::stoppingCondition(),
                &QpConfig::solutionProperties(),
                QpConfig::verbosity() > 0);
            double newGrad = solver.offsetGradient();
            if(newGrad == 0)
                break;
            if(newGrad*grad < 0)
                stepSize *= 0.5;
            else
                stepSize *= 1.6;
            grad = newGrad;
        }

        RealMatrix w(1, dim, 0.0);
        noalias(row(w,0)) = solver.solutionWeightVector();
        model.decisionFunction().setStructure(w,RealVector(1,offset));

    }
    template<class Solver>
    void trainMc(LinearClassifier<InputType>& model, LabeledData<InputType, unsigned int> const& dataset, std::size_t classes){
        std::size_t dim = inputDimension(dataset);

        Solver solver(dataset, dim, classes);
        RealMatrix w = solver.solve(random::globalRng, this->C(), this->stoppingCondition(), &this->solutionProperties(), this->verbosity() > 0);
        model.decisionFunction().setStructure(w);
    }

    void trainOVA(LinearClassifier<InputType>& model, const LabeledData<InputType, unsigned int>& dataset, std::size_t classes)
    {
        // reset the aggregated solution properties before accumulating over the binary subproblems
        base_type::m_solutionproperties.accuracy = 0.0;
        base_type::m_solutionproperties.iterations = 0;
        base_type::m_solutionproperties.value = 0.0;
        base_type::m_solutionproperties.seconds = 0.0;

        std::size_t dim = inputDimension(dataset);
        RealMatrix w(classes, dim);
        for (unsigned int c=0; c<classes; c++)
        {
            LabeledData<InputType, unsigned int> bindata = oneVersusRestProblem(dataset, c);
            QpBoxLinear<InputType> solver(bindata, dim);
            QpSolutionProperties prop;
            solver.solve(this->C(), 0.0, base_type::m_stoppingcondition, &prop, base_type::m_verbosity > 0);
            noalias(row(w, c)) = solver.solutionWeightVector();
            base_type::m_solutionproperties.iterations += prop.iterations;
            base_type::m_solutionproperties.seconds += prop.seconds;
            base_type::m_solutionproperties.accuracy = std::max(base_type::m_solutionproperties.accuracy, prop.accuracy);
        }
        model.decisionFunction().setStructure(w);
    }
};


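/// \brief Training of SVMs with squared hinge loss for binary classification.
///
/// With the squared hinge loss \f$ L(y, f(x)) = \max\{0, 1 - y \cdot f(x)\}^2 \f$ the dual
/// problem is equivalent to a hard-margin problem on a kernel matrix whose diagonal is
/// increased by \f$ 1/(2C) \f$; this trainer therefore solves an (effectively) unbounded
/// box problem on such a diagonally regularized kernel matrix.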
template <class InputType, class CacheType = float>
class SquaredHingeCSvmTrainer : public AbstractSvmTrainer<InputType, unsigned int>
{
public:
    typedef CacheType QpFloatType;

    // kernel matrix with additive diagonal regularization
    typedef RegularizedKernelMatrix<InputType, QpFloatType> KernelMatrixType;
    typedef CachedMatrix< KernelMatrixType > CachedMatrixType;
    typedef PrecomputedMatrix< KernelMatrixType > PrecomputedMatrixType;

    typedef AbstractKernelFunction<InputType> KernelType;
    typedef AbstractSvmTrainer<InputType, unsigned int> base_type;

    //! Constructor
    //! \param  kernel         kernel function to use for training and prediction
    //! \param  C              regularization parameter - always the 'true' value of C, even when unconstrained is set
    //! \param  unconstrained  when a C-value is given via setParameter, should it be piped through the exp-function before using it in the solver?
    SquaredHingeCSvmTrainer(KernelType* kernel, double C, bool unconstrained = false)
    : base_type(kernel, C, unconstrained)
    { }

    //! Constructor
    //! \param  kernel         kernel function to use for training and prediction
    //! \param  negativeC      regularization parameter of the negative class (label 0)
    //! \param  positiveC      regularization parameter of the positive class (label 1)
    //! \param  unconstrained  when a C-value is given via setParameter, should it be piped through the exp-function before using it in the solver?
    SquaredHingeCSvmTrainer(KernelType* kernel, double negativeC, double positiveC, bool unconstrained = false)
    : base_type(kernel,negativeC, positiveC, unconstrained)
    { }

    /// \brief From INameable: return the class name.
    std::string name() const
    { return "SquaredHingeCSvmTrainer"; }

    /// \brief Train the C-SVM.
    void train(KernelClassifier<InputType>& svm, LabeledData<InputType, unsigned int> const& dataset)
    {
        svm.decisionFunction().setStructure(base_type::m_kernel,dataset.inputs(),this->m_trainOffset);

        RealVector diagonalModifier(dataset.numberOfElements(),0.5/base_type::m_regularizers(0));
        if(base_type::m_regularizers.size() != 1){
            for(std::size_t i = 0; i != diagonalModifier.size();++i){
                diagonalModifier(i) = 0.5/base_type::m_regularizers(dataset.element(i).label);
            }
        }

        KernelMatrixType km(*base_type::m_kernel, dataset.inputs(),diagonalModifier);
        if (base_type::precomputeKernel())
        {
            PrecomputedMatrixType matrix(&km);
            optimize(svm.decisionFunction(),matrix,diagonalModifier,dataset);
        }
        else
        {
            CachedMatrixType matrix(&km);
            optimize(svm.decisionFunction(),matrix,diagonalModifier,dataset);
        }

        if (base_type::sparsify()) svm.decisionFunction().sparsify();

    }


private:

    template<class Matrix>
    void optimize(KernelExpansion<InputType>& svm, Matrix& matrix, RealVector const& diagonalModifier, LabeledData<InputType, unsigned int> const& dataset){
        typedef CSVMProblem<Matrix> SVMProblemType;
        SVMProblemType svmProblem(matrix,dataset.labels(),1e100);
        if (this->m_trainOffset)
        {
            typedef SvmShrinkingProblem<SVMProblemType> ProblemType;
            ProblemType problem(svmProblem,base_type::m_shrinking);
            QpSolver< ProblemType > solver(problem);
            solver.solve(base_type::m_stoppingcondition, &base_type::m_solutionproperties);
            column(svm.alpha(),0) = problem.getUnpermutedAlpha();
            //compute the bias
            double sum = 0.0;
            std::size_t freeVars = 0;
            for (std::size_t i=0; i < problem.dimensions(); i++)
            {
                if(problem.alpha(i) > problem.boxMin(i) && problem.alpha(i) < problem.boxMax(i)){
                    sum += problem.gradient(i) - problem.alpha(i)*2*diagonalModifier(i);
                    freeVars++;
                }
            }
            if (freeVars > 0)
                svm.offset(0) = sum / freeVars; //stabilized (averaged) exact value
            else
                svm.offset(0) = 0;
        }
        else
        {
            typedef BoxConstrainedShrinkingProblem<SVMProblemType> ProblemType;
            ProblemType problem(svmProblem,base_type::m_shrinking);
            QpSolver< ProblemType > solver(problem);
            solver.solve(base_type::m_stoppingcondition, &base_type::m_solutionproperties);
            column(svm.alpha(),0) = problem.getUnpermutedAlpha();

        }
    }
};


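/// \brief Training of linear SVMs with squared hinge loss for binary classification.
///
/// Analogous to SquaredHingeCSvmTrainer, but for linear models: the box constraint is
/// (effectively) removed and a diagonal regularization term proportional to 1/C is
/// passed to the QpBoxLinear solver instead.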
template <class InputType>
class SquaredHingeLinearCSvmTrainer : public AbstractLinearSvmTrainer<InputType>
{
public:
    typedef AbstractLinearSvmTrainer<InputType> base_type;

    SquaredHingeLinearCSvmTrainer(double C, bool unconstrained = false)
    : AbstractLinearSvmTrainer<InputType>(C, false, unconstrained){}

    /// \brief From INameable: return the class name.
    std::string name() const
    { return "SquaredHingeLinearCSvmTrainer"; }

    void train(LinearClassifier<InputType>& model, LabeledData<InputType, unsigned int> const& dataset)
    {
        std::size_t dim = inputDimension(dataset);
        QpBoxLinear<InputType> solver(dataset, dim);
        RealMatrix w(1, dim, 0.0);
        solver.solve(
            1e100,
            1.0 / base_type::C(),
            QpConfig::stoppingCondition(),
            &QpConfig::solutionProperties(),
            QpConfig::verbosity() > 0);
        row(w,0) = solver.solutionWeightVector();
        model.decisionFunction().setStructure(w);
    }
};


}
#endif