include/shark/Algorithms/QP/QpMcLinear.h Source File

Go to the documentation of this file.
//===========================================================================
/*!
 *
 *
 * \brief       Quadratic programming solvers for linear multi-class SVM training without bias.
 *
 *
 *
 * \author      T. Glasmachers
 * \date        -
 *
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <https://shark-ml.github.io/Shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark.  If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================
 
 
#ifndef SHARK_ALGORITHMS_QP_QPMCLINEAR_H
#define SHARK_ALGORITHMS_QP_QPMCLINEAR_H
 
#include <shark/Core/Timer.h>
#include <shark/Algorithms/QP/QuadraticProgram.h>
#include <shark/Data/Dataset.h>
#include <shark/Data/DataView.h>
#include <shark/LinAlg/Base.h>
#include <cmath>
#include <iostream>
#include <vector>
 
 
namespace shark {
 
 
/// \brief Generic solver skeleton for linear multi-class SVM problems.
template <class InputT>
class QpMcLinear
{
public:
    typedef LabeledData<InputT, unsigned int> DatasetType;
    typedef typename LabeledData<InputT, unsigned int>::const_element_reference ElementType;
    typedef typename Batch<InputT>::const_reference InputReferenceType;
 
    enum CoordinateSelectionStrategy {UNIFORM, ACF};
 
 
    ///
    /// \brief Constructor
    ///
    /// \param  dataset   training data
    /// \param  dim       problem dimension
    /// \param  classes   number of classes in the problem
    /// \param  strategy  coordinate selection strategy
    /// \param  shrinking flag turning shrinking on and off
    ///
    QpMcLinear(
            const DatasetType& dataset,
            std::size_t dim,
            std::size_t classes,
            std::size_t strategy = ACF,
            bool shrinking = false)
    : m_data(dataset)
    , m_xSquared(dataset.numberOfElements())
    , m_dim(dim)
    , m_classes(classes)
    , m_strategy(strategy)
    , m_shrinking(shrinking)
    {
        SHARK_ASSERT(m_dim > 0);
 
        for (std::size_t i=0; i<m_data.size(); i++)
        {
            m_xSquared(i) = inner_prod(m_data[i].input, m_data[i].input);
        }
    }
 
    ///
    /// \brief Solve the SVM training problem.
    ///
    /// \param  rng      random number generator used by the algorithm
    /// \param  C        regularization constant of the SVM
    /// \param  stop     stopping condition(s)
    /// \param  prop     solution properties
    /// \param  verbose  if true, the solver prints status information and solution statistics
    ///
    RealMatrix solve(
        random::rng_type& rng,
        double C,
        QpStoppingCondition& stop,
        QpSolutionProperties* prop = NULL,
        bool verbose = false)
    {
        // sanity checks
        SHARK_ASSERT(C > 0.0);
 
        // measure training time
        Timer timer;
        timer.start();
 
        // prepare dimensions and vectors
        std::size_t ell = m_data.size();             // number of training examples
        RealMatrix alpha(ell, m_classes + 1, 0.0);   // Lagrange multipliers; dual variables. Reserve one extra column.
        RealMatrix w(m_classes, m_dim, 0.0);         // weight vectors; primal variables
 
        // scheduling of steps, for ACF only
        RealVector pref(ell, 1.0);                   // example-wise measure of success
        double prefsum = (double)ell;                // normalization constant
 
        std::vector<std::size_t> schedule(ell);
        if (m_strategy == UNIFORM)
        {
            for (std::size_t i=0; i<ell; i++) schedule[i] = i;
        }
 
        // used for shrinking
        std::size_t active = ell;
 
        // prepare counters
        std::size_t epoch = 0;
        std::size_t steps = 0;
 
        // prepare performance monitoring
        double objective = 0.0;
        double max_violation = 0.0;
 
        // gain for ACF
        const double gain_learning_rate = 1.0 / ell;
        double average_gain = 0.0;
 
 
        // outer optimization loop (epochs)
        bool canstop = true;
        while (true)
        {
            if (m_strategy == ACF)
            {
                // define schedule
                double psum = prefsum;
                prefsum = 0.0;
                std::size_t pos = 0;
                for (std::size_t i=0; i<ell; i++)
                {
                    double p = pref(i);
                    double num = (psum < 1e-6) ? ell - pos : std::min((double)(ell - pos), (ell - pos) * p / psum);
                    std::size_t n = (std::size_t)std::floor(num);
                    double prob = num - n;
                    //~ if (random::coinToss(rng,prob)) n++;
                    if (random::uni(rng) < prob) n++;
                    for (std::size_t j=0; j<n; j++)
                    {
                        schedule[pos] = i;
                        pos++;
                    }
                    psum -= p;
                    prefsum += p;
                }
                SHARK_ASSERT(pos == ell);
            }
 
            if (m_shrinking == true)
            {
                //~ for (std::size_t i=0; i<active; i++) 
                    //~ std::swap(schedule[i], schedule[random::discrete(rng, std::size_t(0), active - 1)]);
                std::shuffle(schedule.begin(),schedule.begin()+active,rng);
            }
            else
            {
                //~ for (std::size_t i=0; i<ell; i++) 
                    //~ std::swap(schedule[i], schedule[random::discrete(rng, std::size_t(0), ell - 1)]);
                std::shuffle(schedule.begin(),schedule.end(),rng);
            }
 
            // inner loop (one epoch)
            max_violation = 0.0;
            size_t nPoints = ell;
            if (m_shrinking == true)
                nPoints = active;
 
            for (std::size_t j=0; j<nPoints; j++)
            {
                // active example
                double gain = 0.0;
                const std::size_t i = schedule[j];
                InputReferenceType x_i = m_data[i].input;
                const unsigned int y_i = m_data[i].label;
                const double q = m_xSquared(i);
                blas::dense_vector_adaptor<double> a = row(alpha, i);
 
                // compute gradient and KKT violation
                RealVector wx = prod(w,x_i);
                RealVector g(m_classes);
                double kkt = calcGradient(g, wx, a, C, y_i);
 
                if (kkt > 0.0)
                {
                    max_violation = std::max(max_violation, kkt);
 
                    // perform the step on alpha
                    RealVector mu(m_classes, 0.0);
                    gain = solveSub(0.1 * stop.minAccuracy, g, q, C, y_i, a, mu);
                    objective += gain;
                    steps++;
 
                    // update weight vectors
                    updateWeightVectors(w, mu, i);
                }
                else if (m_shrinking == true)
                {
                    active--;
                    std::swap(schedule[j], schedule[active]);
                    j--;
                }
 
                // update gain-based preferences
                if (m_strategy == ACF)
                {
                    if (epoch == 0) average_gain += gain / (double)ell;
                    else
                    {
                        // strategy constants
                        constexpr double CHANGE_RATE = 0.2;
                        constexpr double PREF_MIN = 0.05;
                        constexpr double PREF_MAX = 20.0;
 
                        double change = CHANGE_RATE * (gain / average_gain - 1.0);
                        double newpref = std::min(PREF_MAX, std::max(PREF_MIN, pref(i) * std::exp(change)));
                        prefsum += newpref - pref(i);
                        pref(i) = newpref;
                        average_gain = (1.0 - gain_learning_rate) * average_gain + gain_learning_rate * gain;
                    }
                }
            }
 
            epoch++;
 
            // stopping criteria
            if (stop.maxIterations > 0 && epoch * ell >= stop.maxIterations)
            {
                if (prop != NULL) prop->type = QpMaxIterationsReached;
                break;
            }
 
            if (timer.stop() >= stop.maxSeconds)
            {
                if (prop != NULL) prop->type = QpTimeout;
                break;
            }
 
            if (max_violation < stop.minAccuracy)
            {
                if (verbose)
                    std::cout << "#" << std::flush;
                if (canstop)
                {
                    if (prop != NULL) prop->type = QpAccuracyReached;
                    break;
                }
                else
                {
                    if (m_strategy == ACF)
                    {
                        // prepare full sweep for a reliable checking of the stopping criterion
                        canstop = true;
                        for (std::size_t i=0; i<ell; i++) pref(i) = 1.0;
                        prefsum = (double)ell;
                    }
 
                    if (m_shrinking == true)
                    {
                        // prepare full sweep for a reliable checking of the stopping criterion
                        active = ell;
                        canstop = true;
                    }
                }
            }
            else
            {
                if (verbose) std::cout << "." << std::flush;
                if (m_strategy == ACF)
                    canstop = false;
                if (m_shrinking == true)
                    canstop = (active == ell);
            }
        }
        timer.stop();
 
        // calculate dual objective value
        objective = 0.0;
        for (std::size_t j=0; j<m_classes; j++)
        {
            for (std::size_t d=0; d<m_dim; d++) objective -= w(j, d) * w(j, d);
        }
        objective *= 0.5;
        for (std::size_t i=0; i<ell; i++)
        {
            for (std::size_t j=0; j<m_classes; j++) objective += alpha(i, j);
        }
 
        // return solution statistics
        if (prop != NULL)
        {
            prop->accuracy = max_violation;       // this is approximate, but a good guess
            prop->iterations = ell * epoch;
            prop->value = objective;
            prop->seconds = timer.lastLap();
        }
 
        // output solution statistics
        if (verbose)
        {
            std::cout << std::endl;
            std::cout << "training time (seconds): " << timer.lastLap() << std::endl;
            std::cout << "number of epochs: " << epoch << std::endl;
            std::cout << "number of iterations: " << (ell * epoch) << std::endl;
            std::cout << "number of non-zero steps: " << steps << std::endl;
            std::cout << "dual accuracy: " << max_violation << std::endl;
            std::cout << "dual objective value: " << objective << std::endl;
        }
 
        // return the solution
        return w;
    }
 
protected:
    // for all c: row(w, c) += mu(c) * x
    void add_scaled(RealMatrix& w, RealVector const& mu, InputReferenceType x)
    {
        for (std::size_t c=0; c<m_classes; c++) noalias(row(w, c)) += mu(c) * x;
    }
 
    /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
    ///
    /// \param  gradient  gradient vector to be filled in. The vector is correctly sized.
    /// \param  wx        inner products of weight vectors with the current sample; wx(c) = <w_c, x>
    /// \param  alpha     variables corresponding to the current sample
    /// \param  C         upper bound on the variables
    /// \param  y         label of the current sample
    ///
    /// \return  The function must return the violation of the KKT conditions.
    virtual double calcGradient(RealVector& gradient, RealVector wx, blas::dense_vector_adaptor<double const> const& alpha, double C, unsigned int y) = 0;
 
    /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
    ///
    /// \param  w   matrix of (dense) weight vectors (as rows)
    /// \param  mu  dual step on the variables corresponding to the current sample
    /// \param  index   current sample
    virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index) = 0;
 
    /// \brief Solve the sub-problem posed by a single training example.
    ///
    /// \param  epsilon   accuracy (dual gradient) up to which the sub-problem should be solved
    /// \param  gradient  gradient of the objective function w.r.t. alpha
    /// \param  q         squared norm of the current sample
    /// \param  C         upper bound on the variables
    /// \param  y         label of the current sample
    /// \param  alpha     input: initial point; output: (near) optimal point
    /// \param  mu        step from initial point to final point
    ///
    /// \return  The function must return the gain of the step, i.e., the improvement of the objective function.
    virtual double solveSub(double epsilon, RealVector& gradient, double q, double C, unsigned int y, blas::dense_vector_adaptor<double>& alpha, RealVector& mu) = 0;
 
    DataView<const DatasetType> m_data;               ///< view on training data
    RealVector m_xSquared;                            ///< diagonal entries of the quadratic matrix
    std::size_t m_dim;                                ///< input space dimension
    std::size_t m_classes;                            ///< number of classes
    std::size_t m_strategy;                         ///< strategy for coordinate selection
    bool m_shrinking;                               ///< apply shrinking or not?
};
 
/// \brief Solver for the multi-class SVM by Weston & Watkins.
template <class InputT>
class QpMcLinearWW : public QpMcLinear<InputT>
{
public:
    typedef LabeledData<InputT, unsigned int> DatasetType;
 
    /// \brief Constructor
    QpMcLinearWW(
            const DatasetType& dataset,
            std::size_t dim,
            std::size_t classes)
    : QpMcLinear<InputT>(dataset, dim, classes)
    { }
 
protected:
    /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
    virtual double calcGradient(RealVector& gradient, RealVector wx, blas::dense_vector_adaptor<double const> const& alpha, double C, unsigned int y)
    {
        double violation = 0.0;
        for (std::size_t c=0; c<wx.size(); c++)
        {
            if (c == y)
            {
                gradient(c) = 0.0;
            }
            else
            {
                const double g = 1.0 - 0.5 * (wx(y) - wx(c));
                gradient(c) = g;
                if (g > violation && alpha(c) < C) violation = g;
                else if (-g > violation && alpha(c) > 0.0) violation = -g;
            }
        }
        return violation;
    }
 
    /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
    virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index)
    {
        double sum_mu = 0.0;
        for (std::size_t c=0; c<m_classes; c++) sum_mu += mu(c);
        unsigned int y = m_data[index].label;
        RealVector step(-0.5 * mu);
        step(y) = 0.5 * sum_mu;
        add_scaled(w, step, m_data[index].input);
    }
 
    /// \brief Solve the sub-problem posed by a single training example.
    virtual double solveSub(double epsilon, RealVector& gradient, double q, double C, unsigned int y, blas::dense_vector_adaptor<double>& alpha, RealVector& mu)
    {
        const double qq = 0.5 * q;
        double gain = 0.0;
 
        // SMO loop
        size_t iter, maxiter = 10 * m_classes;
        for (iter=0; iter<maxiter; iter++)
        {
            // select working set
            std::size_t idx = 0;
            double kkt = 0.0;
            for (std::size_t c=0; c<m_classes; c++)
            {
                if (c == y) continue;
 
                const double g = gradient(c);
                const double a = alpha(c);
                if (g > kkt && a < C) { kkt = g; idx = c; }
                else if (-g > kkt && a > 0.0) { kkt = -g; idx = c; }
            }
 
            // check stopping criterion
            if (kkt < epsilon) break;
 
            // perform step
            const double a = alpha(idx);
            const double g = gradient(idx);
            double m = g / qq;
            double a_new = a + m;
            if (a_new <= 0.0)
            {
                m = -a;
                a_new = 0.0;
            }
            else if (a_new >= C)
            {
                m = C - a;
                a_new = C;
            }
            alpha(idx) = a_new;
            mu(idx) += m;
 
            // update gradient and total gain
            const double dg = 0.5 * m * qq;
            for (std::size_t c=0; c<m_classes; c++) gradient(c) -= dg;
            gradient(idx) -= dg;
 
            gain += m * (g - dg);
        }
 
        return gain;
    }
 
protected:
    using QpMcLinear<InputT>::add_scaled;
    using QpMcLinear<InputT>::m_data;
    using QpMcLinear<InputT>::m_classes;
};
 
 
/// \brief Solver for the multi-class SVM by Lee, Lin & Wahba.
template <class InputT>
class QpMcLinearLLW : public QpMcLinear<InputT>
{
public:
    typedef LabeledData<InputT, unsigned int> DatasetType;
 
    /// \brief Constructor
    QpMcLinearLLW(
            const DatasetType& dataset,
            std::size_t dim,
            std::size_t classes)
    : QpMcLinear<InputT>(dataset, dim, classes)
    { }
 
protected:
    /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
    virtual double calcGradient(RealVector& gradient, RealVector wx, blas::dense_vector_adaptor<double const> const& alpha, double C, unsigned int y)
    {
        double violation = 0.0;
        for (std::size_t c=0; c<m_classes; c++)
        {
            if (c == y)
            {
                gradient(c) = 0.0;
            }
            else
            {
                const double g = 1.0 + wx(c);
                gradient(c) = g;
                if (g > violation && alpha(c) < C) violation = g;
                else if (-g > violation && alpha(c) > 0.0) violation = -g;
            }
        }
        return violation;
    }
 
    /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
    virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index)
    {
        double mean_mu = 0.0;
        for (std::size_t c=0; c<m_classes; c++) mean_mu += mu(c);
        mean_mu /= (double)m_classes;
        RealVector step(m_classes);
        for (std::size_t c=0; c<m_classes; c++) step(c) = mean_mu - mu(c);
        add_scaled(w, step, m_data[index].input);
    }
 
    /// \brief Solve the sub-problem posed by a single training example.
    virtual double solveSub(double epsilon, RealVector& gradient, double q, double C, unsigned int y, blas::dense_vector_adaptor<double>& alpha, RealVector& mu)
    {
        const double ood = 1.0 / m_classes;
        const double qq = (1.0 - ood) * q;
        double gain = 0.0;
 
        // SMO loop
        size_t iter, maxiter = 10 * m_classes;
        for (iter=0; iter<maxiter; iter++)
        {
            // select working set
            std::size_t idx = 0;
            double kkt = 0.0;
            for (std::size_t c=0; c<m_classes; c++)
            {
                if (c == y) continue;
 
                const double g = gradient(c);
                const double a = alpha(c);
                if (g > kkt && a < C) { kkt = g; idx = c; }
                else if (-g > kkt && a > 0.0) { kkt = -g; idx = c; }
            }
 
            // check stopping criterion
            if (kkt < epsilon) break;
 
            // perform step
            const double a = alpha(idx);
            const double g = gradient(idx);
            double m = g / qq;
            double a_new = a + m;
            if (a_new <= 0.0)
            {
                m = -a;
                a_new = 0.0;
            }
            else if (a_new >= C)
            {
                m = C - a;
                a_new = C;
            }
            alpha(idx) = a_new;
            mu(idx) += m;
 
            // update gradient and total gain
            const double dg = m * q;
            const double dgc = dg / m_classes;
            for (std::size_t c=0; c<m_classes; c++) gradient(c) += dgc;
            gradient(idx) -= dg;
 
            gain += m * (g - 0.5 * (dg - dgc));
        }
 
        return gain;
    }
 
protected:
    using QpMcLinear<InputT>::add_scaled;
    using QpMcLinear<InputT>::m_data;
    using QpMcLinear<InputT>::m_classes;
};
 
 
/// \brief Solver for the multi-class SVM with absolute margin and total sum loss.
template <class InputT>
class QpMcLinearATS : public QpMcLinear<InputT>
{
public:
    typedef LabeledData<InputT, unsigned int> DatasetType;
 
    /// \brief Constructor
    QpMcLinearATS(
            const DatasetType& dataset,
            std::size_t dim,
            std::size_t classes)
    : QpMcLinear<InputT>(dataset, dim, classes)
    { }
 
protected:
    /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
    virtual double calcGradient(RealVector& gradient, RealVector wx, blas::dense_vector_adaptor<double const> const& alpha, double C, unsigned int y)
    {
        double violation = 0.0;
        for (std::size_t c=0; c<m_classes; c++)
        {
            const double g = (c == y) ? 1.0 - wx(y) : 1.0 + wx(c);
            gradient(c) = g;
            if (g > violation && alpha(c) < C) violation = g;
            else if (-g > violation && alpha(c) > 0.0) violation = -g;
        }
        return violation;
    }
 
    /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
    virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index)
    {
        unsigned int y = m_data[index].label;
        double mean = -2.0 * mu(y);
        for (std::size_t c=0; c<m_classes; c++) mean += mu(c);
        mean /= (double)m_classes;
        RealVector step(m_classes);
        for (std::size_t c=0; c<m_classes; c++) step(c) = ((c == y) ? (mu(c) + mean) : (mean - mu(c)));
        add_scaled(w, step, m_data[index].input);
    }
 
    /// \brief Solve the sub-problem posed by a single training example.
    virtual double solveSub(double epsilon, RealVector& gradient, double q, double C, unsigned int y, blas::dense_vector_adaptor<double>& alpha, RealVector& mu)
    {
        const double ood = 1.0 / m_classes;
        const double qq = (1.0 - ood) * q;
        double gain = 0.0;
 
        // SMO loop
        size_t iter, maxiter = 10 * m_classes;
        for (iter=0; iter<maxiter; iter++)
        {
            // select working set
            std::size_t idx = 0;
            double kkt = 0.0;
            for (std::size_t c=0; c<m_classes; c++)
            {
                const double g = gradient(c);
                const double a = alpha(c);
                if (g > kkt && a < C) { kkt = g; idx = c; }
                else if (-g > kkt && a > 0.0) { kkt = -g; idx = c; }
            }
 
            // check stopping criterion
            if (kkt < epsilon) break;
 
            // perform step
            const double a = alpha(idx);
            const double g = gradient(idx);
            double m = g / qq;
            double a_new = a + m;
            if (a_new <= 0.0)
            {
                m = -a;
                a_new = 0.0;
            }
            else if (a_new >= C)
            {
                m = C - a;
                a_new = C;
            }
            alpha(idx) = a_new;
            mu(idx) += m;
 
            // update gradient and total gain
            const double dg = m * q;
            const double dgc = dg / m_classes;
            if (idx == y)
            {
                for (std::size_t c=0; c<m_classes; c++) gradient(c) -= dgc;
                gradient(idx) -= dg - 2.0 * dgc;
            }
            else
            {
                for (std::size_t c=0; c<m_classes; c++) gradient(c) += (c == y) ? -dgc : dgc;
                gradient(idx) -= dg;
            }
 
            gain += m * (g - 0.5 * (dg - dgc));
        }
 
        return gain;
    }
 
protected:
    using QpMcLinear<InputT>::add_scaled;
    using QpMcLinear<InputT>::m_data;
    using QpMcLinear<InputT>::m_classes;
};
 
 
/// \brief Solver for the multi-class maximum margin regression SVM
template <class InputT>
class QpMcLinearMMR : public QpMcLinear<InputT>
{
public:
    typedef LabeledData<InputT, unsigned int> DatasetType;
 
    /// \brief Constructor
    QpMcLinearMMR(
            const DatasetType& dataset,
            std::size_t dim,
            std::size_t classes)
    : QpMcLinear<InputT>(dataset, dim, classes)
    { }
 
protected:
    /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
    virtual double calcGradient(RealVector& gradient, RealVector wx, blas::dense_vector_adaptor<double const> const& alpha, double C, unsigned int y)
    {
        for (std::size_t c=0; c<m_classes; c++) gradient(c) = 0.0;
        const double g = 1.0 - wx(y);
        gradient(y) = g;
        const double a = alpha(0);
        if (g > 0.0)
        {
            if (a == C) return 0.0;
            else return g;
        }
        else
        {
            if (a == 0.0) return 0.0;
            else return -g;
        }
    }
 
    /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
    virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index)
    {
        unsigned int y = m_data[index].label;
        double s = mu(0);
        double sc = -s / m_classes;
        double sy = s + sc;
        RealVector step(m_classes);
        for (size_t c=0; c<m_classes; c++) step(c) = (c == y) ? sy : sc;
        add_scaled(w, step, m_data[index].input);
    }
 
    /// \brief Solve the sub-problem posed by a single training example.
    virtual double solveSub(double epsilon, RealVector& gradient, double q, double C, unsigned int y, blas::dense_vector_adaptor<double>& alpha, RealVector& mu)
    {
        const double ood = 1.0 / m_classes;
        const double qq = (1.0 - ood) * q;
 
        double kkt = 0.0;
        const double g = gradient(y);
        const double a = alpha(0);
        if (g > kkt && a < C) kkt = g;
        else if (-g > kkt && a > 0.0) kkt = -g;
 
        // check stopping criterion
        if (kkt < epsilon) return 0.0;
 
        // perform step
        double m = g / qq;
        double a_new = a + m;
        if (a_new <= 0.0)
        {
            m = -a;
            a_new = 0.0;
        }
        else if (a_new >= C)
        {
            m = C - a;
            a_new = C;
        }
        alpha(0) = a_new;
        mu(0) = m;
 
        // return the gain
        return m * (g - 0.5 * m * qq);
    }
 
protected:
    using QpMcLinear<InputT>::add_scaled;
    using QpMcLinear<InputT>::m_data;
    using QpMcLinear<InputT>::m_classes;
};
 
 
/// \brief Solver for the multi-class SVM by Crammer & Singer.
template <class InputT>
class QpMcLinearCS : public QpMcLinear<InputT>
{
public:
    typedef LabeledData<InputT, unsigned int> DatasetType;
 
    /// \brief Constructor
    QpMcLinearCS(
            const DatasetType& dataset,
            std::size_t dim,
            std::size_t classes)
    : QpMcLinear<InputT>(dataset, dim, classes)
    { }
 
protected:
    /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
    virtual double calcGradient(RealVector& gradient, RealVector wx, blas::dense_vector_adaptor<double const> const& alpha, double C, unsigned int y)
    {
        if (alpha(m_classes) < C)
        {
            double violation = 0.0;
            for (std::size_t c=0; c<wx.size(); c++)
            {
                if (c == y)
                {
                    gradient(c) = 0.0;
                }
                else
                {
                    const double g = 1.0 - 0.5 * (wx(y) - wx(c));
                    gradient(c) = g;
                    if (g > violation) violation = g;
                    else if (-g > violation && alpha(c) > 0.0) violation = -g;
                }
            }
            return violation;
        }
        else
        {
            // double kkt_up = -1e100, kkt_down = 1e100;
            double kkt_up = 0.0, kkt_down = 1e100;
            for (std::size_t c=0; c<m_classes; c++)
            {
                if (c == y)
                {
                    gradient(c) = 0.0;
                }
                else
                {
                    const double g = 1.0 - 0.5 * (wx(y) - wx(c));
                    gradient(c) = g;
                    if (g > kkt_up && alpha(c) < C) kkt_up = g;
                    if (g < kkt_down && alpha(c) > 0.0) kkt_down = g;
                }
            }
            return std::max(0.0, kkt_up - kkt_down);
        }
    }
 
    /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
    virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index)
    {
        unsigned int y = m_data[index].label;
        double sum_mu = 0.0;
        for (std::size_t c=0; c<m_classes; c++) if (c != y) sum_mu += mu(c);
        RealVector step(-0.5 * mu);
        step(y) = 0.5 * sum_mu;
        add_scaled(w, step, m_data[index].input);
    }
 
    /// \brief Solve the sub-problem posed by a single training example.
    virtual double solveSub(double epsilon, RealVector& gradient, double q, double C, unsigned int y, blas::dense_vector_adaptor<double>& alpha, RealVector& mu)
    {
        const double qq = 0.5 * q;
        double gain = 0.0;
 
        // SMO loop
        size_t iter, maxiter = 10 * m_classes;
        for (iter=0; iter<maxiter; iter++)
        {
            // select working set
            std::size_t idx = 0;
            std::size_t idx_up = 0, idx_down = 0;
            bool size2 = false;
            double kkt = 0.0;
            double grad = 0.0;
            if (alpha(m_classes) == C)
            {
                double kkt_up = -1e100, kkt_down = 1e100;
                for (std::size_t c=0; c<m_classes; c++)
                {
                    if (c == y) continue;
 
                    const double g = gradient(c);
                    const double a = alpha(c);
                    if (g > kkt_up && a < C) { kkt_up = g; idx_up = c; }
                    if (g < kkt_down && a > 0.0) { kkt_down = g; idx_down = c; }
                }
 
                if (kkt_up <= 0.0)
                {
                    idx = idx_down;
                    grad = kkt_down;
                    kkt = -kkt_down;
                }
                else
                {
                    grad = kkt_up - kkt_down;
                    kkt = grad;
                    size2 = true;
                }
            }
            else
            {
                for (std::size_t c=0; c<m_classes; c++)
                {
                    if (c == y) continue;
 
                    const double g = gradient(c);
                    const double a = alpha(c);
                    if (g > kkt) { kkt = g; idx = c; }
                    else if (-g > kkt && a > 0.0) { kkt = -g; idx = c; }
                }
                grad = gradient(idx);
            }
 
            // check stopping criterion
            if (kkt < epsilon) return gain;
 
            if (size2)
            {
                // perform step
                const double a_up = alpha(idx_up);
                const double a_down = alpha(idx_down);
                double m = grad / qq;
                double a_up_new = a_up + m;
                double a_down_new = a_down - m;
                if (a_down_new <= 0.0)
                {
                    m = a_down;
                    a_up_new = a_up + m;
                    a_down_new = 0.0;
                }
                alpha(idx_up) = a_up_new;
                alpha(idx_down) = a_down_new;
                mu(idx_up) += m;
                mu(idx_down) -= m;
 
                // update gradient and total gain
                const double dg = 0.5 * m * qq;
                gradient(idx_up) -= dg;
                gradient(idx_down) += dg;
                gain += m * (grad - 2.0 * dg);
            }
            else
            {
                // perform step
                const double a = alpha(idx);
                const double a_sum = alpha(m_classes);
                double m = grad / qq;
                double a_new = a + m;
                double a_sum_new = a_sum + m;
                if (a_new <= 0.0)
                {
                    m = -a;
                    a_new = 0.0;
                    a_sum_new = a_sum + m;
                }
                else if (a_sum_new >= C)
                {
                    m = C - a_sum;
                    a_sum_new = C;
                    a_new = a + m;
                }
                alpha(idx) = a_new;
                alpha(m_classes) = a_sum_new;
                mu(idx) += m;
 
                // update gradient and total gain
                const double dg = 0.5 * m * qq;
                for (std::size_t c=0; c<m_classes; c++) gradient(c) -= dg;
                gradient(idx) -= dg;
                gain += m * (grad - dg);
            }
        }
 
        return gain;
    }
 
protected:
    using QpMcLinear<InputT>::add_scaled;
    using QpMcLinear<InputT>::m_data;
    using QpMcLinear<InputT>::m_classes;
};
 
 
/// \brief Solver for the multi-class SVM with absolute margin and discriminative maximum loss.
template <class InputT>
class QpMcLinearADM : public QpMcLinear<InputT>
{
public:
    typedef LabeledData<InputT, unsigned int> DatasetType;
 
    /// \brief Constructor
    QpMcLinearADM(
            const DatasetType& dataset,
            std::size_t dim,
            std::size_t classes)
    : QpMcLinear<InputT>(dataset, dim, classes)
    { }
 
protected:
    /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
    virtual double calcGradient(RealVector& gradient, RealVector wx, blas::dense_vector_adaptor<double const> const& alpha, double C, unsigned int y)
    {
        if (alpha(m_classes) < C)
        {
            double violation = 0.0;
            for (std::size_t c=0; c<m_classes; c++)
            {
                if (c == y)
                {
                    gradient(c) = 0.0;
                }
                else
                {
                    const double g = 1.0 + wx(c);
                    gradient(c) = g;
                    if (g > violation) violation = g;
                    else if (-g > violation && alpha(c) > 0.0) violation = -g;
                }
            }
            return violation;
        }
        else
        {
            double kkt_up = 0.0, kkt_down = 1e100;
            for (std::size_t c=0; c<m_classes; c++)
            {
                if (c == y)
                {
                    gradient(c) = 0.0;
                }
                else
                {
                    const double g = 1.0 + wx(c);
                    gradient(c) = g;
                    if (g > kkt_up && alpha(c) < C) kkt_up = g;
                    if (g < kkt_down && alpha(c) > 0.0) kkt_down = g;
                }
            }
            return std::max(0.0, kkt_up - kkt_down);
        }
    }
 
    /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
    virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index)
    {
        double mean_mu = 0.0;
        for (std::size_t c=0; c<m_classes; c++) mean_mu += mu(c);
        mean_mu /= (double)m_classes;
        RealVector step(m_classes);
        for (size_t c=0; c<m_classes; c++) step(c) = mean_mu - mu(c);
        add_scaled(w, step, m_data[index].input);
    }
 
    /// \brief Solve the sub-problem posed by a single training example.
    virtual double solveSub(double epsilon, RealVector& gradient, double q, double C, unsigned int y, blas::dense_vector_adaptor<double>& alpha, RealVector& mu)
    {
        const double ood = 1.0 / m_classes;
        const double qq = (1.0 - ood) * q;
        double gain = 0.0;
 
        // SMO loop
        size_t iter, maxiter = 10 * m_classes;
        for (iter=0; iter<maxiter; iter++)
        {
            // select working set
            std::size_t idx = 0;
            std::size_t idx_up = 0, idx_down = 0;
            bool size2 = false;
            double kkt = 0.0;
            double grad = 0.0;
            if (alpha(m_classes) == C)
            {
                double kkt_up = -1e100, kkt_down = 1e100;
                for (std::size_t c=0; c<m_classes; c++)
                {
                    if (c == y) continue;
 
                    const double g = gradient(c);
                    const double a = alpha(c);
                    if (g > kkt_up && a < C) { kkt_up = g; idx_up = c; }
                    if (g < kkt_down && a > 0.0) { kkt_down = g; idx_down = c; }
                }
 
                if (kkt_up <= 0.0)
                {
                    idx = idx_down;
                    grad = kkt_down;
                    kkt = -kkt_down;
                }
                else
                {
                    grad = kkt_up - kkt_down;
                    kkt = grad;
                    size2 = true;
                }
            }
            else
            {
                for (std::size_t c=0; c<m_classes; c++)
                {
                    if (c == y) continue;
 
                    const double g = gradient(c);
                    const double a = alpha(c);
                    if (g > kkt) { kkt = g; idx = c; }
                    else if (-g > kkt && a > 0.0) { kkt = -g; idx = c; }
                }
                grad = gradient(idx);
            }
 
            // check stopping criterion
            if (kkt < epsilon) return gain;
 
            if (size2)
            {
                // perform step
                const double a_up = alpha(idx_up);
                const double a_down = alpha(idx_down);
                double m = grad / (2.0 * q);
                double a_up_new = a_up + m;
                double a_down_new = a_down - m;
                if (a_down_new <= 0.0)
                {
                    m = a_down;
                    a_up_new = a_up + m;
                    a_down_new = 0.0;
                }
                alpha(idx_up) = a_up_new;
                alpha(idx_down) = a_down_new;
                mu(idx_up) += m;
                mu(idx_down) -= m;
 
                // update gradient and total gain
                const double dg = m * q;
                const double dgc = dg / m_classes;
                gradient(idx_up) -= dg;
                gradient(idx_down) += dg;
                gain += m * (grad - (dg - dgc));
            }
            else
            {
                // perform step
                const double a = alpha(idx);
                const double a_sum = alpha(m_classes);
                double m = grad / qq;
                double a_new = a + m;
                double a_sum_new = a_sum + m;
                if (a_new <= 0.0)
                {
                    m = -a;
                    a_new = 0.0;
                    a_sum_new = a_sum + m;
                }
                else if (a_sum_new >= C)
                {
                    m = C - a_sum;
                    a_sum_new = C;
                    a_new = a + m;
                }
                alpha(idx) = a_new;
                alpha(m_classes) = a_sum_new;
                mu(idx) += m;
 
                // update gradient and total gain
                const double dg = m * q;
                const double dgc = dg / m_classes;
                for (std::size_t c=0; c<m_classes; c++) gradient(c) += dgc;
                gradient(idx) -= dg;
                gain += m * (grad - 0.5 * (dg - dgc));
            }
        }
 
        return gain;
    }
 
protected:
    using QpMcLinear<InputT>::add_scaled;
    using QpMcLinear<InputT>::m_data;
    using QpMcLinear<InputT>::m_classes;
};
 
 
/// \brief Solver for the multi-class SVM with absolute margin and total maximum loss.
template <class InputT>
class QpMcLinearATM : public QpMcLinear<InputT>
{
public:
    typedef LabeledData<InputT, unsigned int> DatasetType;
 
    /// \brief Constructor
    QpMcLinearATM(
            const DatasetType& dataset,
            std::size_t dim,
            std::size_t classes)
    : QpMcLinear<InputT>(dataset, dim, classes)
    { }
 
protected:
    /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
    virtual double calcGradient(RealVector& gradient, RealVector wx, blas::dense_vector_adaptor<double const> const& alpha, double C, unsigned int y)
    {
        if (alpha(m_classes) < C)
        {
            double violation = 0.0;
            for (std::size_t c=0; c<m_classes; c++)
            {
                const double g = (c == y) ? 1.0 - wx(y) : 1.0 + wx(c);
                gradient(c) = g;
                if (g > violation) violation = g;
                else if (-g > violation && alpha(c) > 0.0) violation = -g;
            }
            return violation;
        }
        else
        {
            double kkt_up = 0.0, kkt_down = 1e100;
            for (std::size_t c=0; c<m_classes; c++)
            {
                const double g = (c == y) ? 1.0 - wx(y) : 1.0 + wx(c);
                gradient(c) = g;
                if (g > kkt_up && alpha(c) < C) kkt_up = g;
                if (g < kkt_down && alpha(c) > 0.0) kkt_down = g;
            }
            return std::max(0.0, kkt_up - kkt_down);
        }
    }
 
    /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
    virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index)
    {
        unsigned int y = m_data[index].label;
        double mean = -2.0 * mu(y);
        for (std::size_t c=0; c<m_classes; c++) mean += mu(c);
        mean /= (double)m_classes;
        RealVector step(m_classes);
        for (size_t c=0; c<m_classes; c++) step(c) = (c == y) ? (mu(c) + mean) : (mean - mu(c));
        add_scaled(w, step, m_data[index].input);
    }
 
    /// \brief Solve the sub-problem posed by a single training example.
    virtual double solveSub(double epsilon, RealVector& gradient, double q, double C, unsigned int y, blas::dense_vector_adaptor<double>& alpha, RealVector& mu)
    {
        const double ood = 1.0 / m_classes;
        const double qq = (1.0 - ood) * q;
        double gain = 0.0;
 
        // SMO loop
        size_t iter, maxiter = 10 * m_classes;
        for (iter=0; iter<maxiter; iter++)
        {
            // select working set
            std::size_t idx = 0;
            std::size_t idx_up = 0, idx_down = 0;
            bool size2 = false;
            double kkt = 0.0;
            double grad = 0.0;
            if (alpha(m_classes) == C)
            {
                double kkt_up = -1e100, kkt_down = 1e100;
                for (std::size_t c=0; c<m_classes; c++)
                {
                    const double g = gradient(c);
                    const double a = alpha(c);
                    if (g > kkt_up && a < C) { kkt_up = g; idx_up = c; }
                    if (g < kkt_down && a > 0.0) { kkt_down = g; idx_down = c; }
                }
 
                if (kkt_up <= 0.0)
                {
                    idx = idx_down;
                    grad = kkt_down;
                    kkt = -kkt_down;
                }
                else
                {
                    grad = kkt_up - kkt_down;
                    kkt = grad;
                    size2 = true;
                }
            }
            else
            {
                for (std::size_t c=0; c<m_classes; c++)
                {
                    const double g = gradient(c);
                    const double a = alpha(c);
                    if (g > kkt) { kkt = g; idx = c; }
                    else if (-g > kkt && a > 0.0) { kkt = -g; idx = c; }
                }
                grad = gradient(idx);
            }
 
            // check stopping criterion
            if (kkt < epsilon) return gain;
 
            if (size2)
            {
                // perform step
                const double a_up = alpha(idx_up);
                const double a_down = alpha(idx_down);
                double m = grad / (2.0 * q);
                double a_up_new = a_up + m;
                double a_down_new = a_down - m;
                if (a_down_new <= 0.0)
                {
                    m = a_down;
                    a_up_new = a_up + m;
                    a_down_new = 0.0;
                }
                alpha(idx_up) = a_up_new;
                alpha(idx_down) = a_down_new;
                mu(idx_up) += m;
                mu(idx_down) -= m;
 
                // update gradient and total gain
                const double dg = m * q;
                const double dgc = dg / m_classes;
                if (idx_up == y)
                {
                    for (std::size_t c=0; c<m_classes; c++) gradient(c) -= dgc;
                    gradient(idx_up) -= dg - 2.0 * dgc;
                    gradient(idx_down) += dg;
                }
                else if (idx_down == y)
                {
                    gradient(idx_up) -= dg;
                    gradient(idx_down) += dg - 2.0 * dgc;
                }
                else
                {
                    gradient(idx_up) -= dg;
                    gradient(idx_down) += dg;
                }
                gain += m * (grad - (dg - dgc));
            }
            else
            {
                // perform step
                const double a = alpha(idx);
                const double a_sum = alpha(m_classes);
                double m = grad / qq;
                double a_new = a + m;
                double a_sum_new = a_sum + m;
                if (a_new <= 0.0)
                {
                    m = -a;
                    a_new = 0.0;
                    a_sum_new = a_sum + m;
                }
                else if (a_sum_new >= C)
                {
                    m = C - a_sum;
                    a_sum_new = C;
                    a_new = a + m;
                }
                alpha(idx) = a_new;
                alpha(m_classes) = a_sum_new;
                mu(idx) += m;
 
                // update gradient and total gain
                const double dg = m * q;
                const double dgc = dg / m_classes;
                if (idx == y)
                {
                    for (std::size_t c=0; c<m_classes; c++) gradient(c) -= dgc;
                    gradient(idx) -= dg - 2.0 * dgc;
                }
                else
                {
                    for (std::size_t c=0; c<m_classes; c++) gradient(c) += (c == y) ? -dgc : dgc;
                    gradient(idx) -= dg;
                }
                gain += m * (grad - 0.5 * (dg - dgc));
            }
        }
 
        return gain;
    }
 
protected:
    using QpMcLinear<InputT>::add_scaled;
    using QpMcLinear<InputT>::m_data;
    using QpMcLinear<InputT>::m_classes;
};
 
 
/// \brief Solver for the "reinforced" multi-class SVM.
template <class InputT>
class QpMcLinearReinforced : public QpMcLinear<InputT>
{
public:
    typedef LabeledData<InputT, unsigned int> DatasetType;
 
    /// \brief Constructor
    QpMcLinearReinforced(
            const DatasetType& dataset,
            std::size_t dim,
            std::size_t classes)
    : QpMcLinear<InputT>(dataset, dim, classes)
    { }
 
protected:
    /// \brief Compute the gradient from the inner products of the weight vectors with the current sample.
    virtual double calcGradient(RealVector& gradient, RealVector wx, blas::dense_vector_adaptor<double const> const& alpha, double C, unsigned int y)
    {
        double violation = 0.0;
        for (std::size_t c=0; c<m_classes; c++)
        {
            const double g = (c == y) ? (m_classes - 1.0) - wx(y) : 1.0 + wx(c);
            gradient(c) = g;
            if (g > violation && alpha(c) < C) violation = g;
            else if (-g > violation && alpha(c) > 0.0) violation = -g;
        }
        return violation;
    }
 
    /// \brief Update the weight vectors (primal variables) after a step on the dual variables.
    virtual void updateWeightVectors(RealMatrix& w, RealVector const& mu, std::size_t index)
    {
        unsigned int y = m_data[index].label;
        double mean = -2.0 * mu(y);
        for (std::size_t c=0; c<m_classes; c++) mean += mu(c);
        mean /= (double)m_classes;
        RealVector step(m_classes);
        for (std::size_t c=0; c<m_classes; c++) step(c) = ((c == y) ? (mu(c) + mean) : (mean - mu(c)));
        add_scaled(w, step, m_data[index].input);
    }
 
    /// \brief Solve the sub-problem posed by a single training example.
    virtual double solveSub(double epsilon, RealVector& gradient, double q, double C, unsigned int y, blas::dense_vector_adaptor<double>& alpha, RealVector& mu)
    {
        const double ood = 1.0 / m_classes;
        const double qq = (1.0 - ood) * q;
        double gain = 0.0;
 
        // SMO loop
        size_t iter, maxiter = 10 * m_classes;
        for (iter=0; iter<maxiter; iter++)
        {
            // select working set
            std::size_t idx = 0;
            double kkt = 0.0;
            for (std::size_t c=0; c<m_classes; c++)
            {
                const double g = gradient(c);
                const double a = alpha(c);
                if (g > kkt && a < C) { kkt = g; idx = c; }
                else if (-g > kkt && a > 0.0) { kkt = -g; idx = c; }
            }
 
            // check stopping criterion
            if (kkt < epsilon) break;
 
            // perform step
            const double a = alpha(idx);
            const double g = gradient(idx);
            double m = g / qq;
            double a_new = a + m;
            if (a_new <= 0.0)
            {
                m = -a;
                a_new = 0.0;
            }
            else if (a_new >= C)
            {
                m = C - a;
                a_new = C;
            }
            alpha(idx) = a_new;
            mu(idx) += m;
 
            // update gradient and total gain
            const double dg = m * q;
            const double dgc = dg / m_classes;
            if (idx == y)
            {
                for (std::size_t c=0; c<m_classes; c++) gradient(c) -= dgc;
                gradient(idx) -= dg - 2.0 * dgc;
            }
            else
            {
                for (std::size_t c=0; c<m_classes; c++) gradient(c) += (c == y) ? -dgc : dgc;
                gradient(idx) -= dg;
            }
 
            gain += m * (g - 0.5 * (dg - dgc));
        }
 
        return gain;
    }
 
protected:
    using QpMcLinear<InputT>::add_scaled;
    using QpMcLinear<InputT>::m_data;
    using QpMcLinear<InputT>::m_classes;
};
 
 
}
#endif