MissingFeatureSvmTrainer.h
//===========================================================================
/*!
 *
 *
 * \brief Trainer for binary SVMs natively supporting missing features.
 *
 *
 *
 * \author B. Li
 * \date 2012
 *
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <https://shark-ml.github.io/Shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================

#ifndef SHARK_ALGORITHMS_TRAINERS_MISSING_FEATURE_SVM_H
#define SHARK_ALGORITHMS_TRAINERS_MISSING_FEATURE_SVM_H

#include <shark/Algorithms/Trainers/AbstractSvmTrainer.h>
#include <shark/LinAlg/CachedMatrix.h>
#include <shark/LinAlg/ExampleModifiedKernelMatrix.h>
#include <shark/Models/Kernels/EvalSkipMissingFeatures.h>
#include <shark/Models/Kernels/KernelExpansion.h>
#include <shark/Models/Kernels/MissingFeaturesKernelExpansion.h>

namespace shark {

/// \brief Trainer for binary SVMs natively supporting missing features.
///
/// This is a specialized variant of the standard binary C-SVM which can be used
/// to learn from data with missing features, without the need for prior imputation
/// of missing values. The key idea is that each example is considered as having an
/// instance-specific margin value, which is computed in the lower-dimensional subspace
/// for which all features of that example are present.
///
/// The resulting optimization problem has the drawback that it is no longer convex.
/// However, a local minimum can easily be reached by an iterative wrapper algorithm
/// around a standard C-SVM solver. In detail, example-wise weights \f$ s_i \f$ are
/// incorporated into the quadratic term of the standard SVM optimization problem.
/// These take initial values of 1 and are then iteratively updated according to the
/// instance-specific margin values. After each such update, the SVM solver is called
/// again, and so on. In practice, between 2 and 5 iterations are usually sufficient
/// for an acceptable level of convergence.
///
/// For details, see the paper:<br/>
/// <p>Max-margin Classification of %Data with Absent Features.
/// Gal Chechik, Geremy Heitz, Gal Elidan, Pieter Abbeel and Daphne Koller. JMLR 2008.</p>
///
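/// Usage sketch (the kernel type and all parameter values here are only
/// illustrative, not prescribed by this class):
/// \code
/// GaussianRbfKernel<RealVector> kernel(0.5);                         // gamma = 0.5
/// MissingFeatureSvmTrainer<RealVector> trainer(&kernel, 1.0, true);  // C = 1, with offset
/// trainer.setMaxIterations(4);                                       // outer iterations
/// MissingFeaturesKernelExpansion<RealVector> svm;
/// trainer.train(svm, trainingData);  // trainingData: LabeledData<RealVector, unsigned int>
/// \endcode
///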
/// \note Much of the code in this class is borrowed from the class CSvmTrainer.
/// \ingroup supervised_trainer
template <class InputType, class CacheType = float>
class MissingFeatureSvmTrainer : public AbstractSvmTrainer<InputType, unsigned int, MissingFeaturesKernelExpansion<InputType> >
{
protected:

    typedef CacheType QpFloatType;
    typedef AbstractKernelFunction<InputType> KernelType;
    typedef AbstractSvmTrainer<InputType, unsigned int, MissingFeaturesKernelExpansion<InputType> > base_type;

public:

    /// Constructor
    MissingFeatureSvmTrainer(KernelType* kernel, double C, bool offset, bool unconstrained = false)
    : base_type(kernel, C, offset, unconstrained)
    , m_maxIterations(4u)
    { }

    /// \brief From INameable: return the class name.
    std::string name() const
    { return "MissingFeatureSvmTrainer"; }

    void train(MissingFeaturesKernelExpansion<InputType>& svm, LabeledData<InputType, unsigned int> const& dataset){
        // Check prerequisites
        SHARK_RUNTIME_CHECK(numberOfClasses(dataset) == 2, "Not a binary problem");

        svm.setStructure(base_type::m_kernel, dataset.inputs(), this->m_trainOffset);

        if (svm.hasOffset())
            trainWithOffset(svm, dataset);
        else
            trainWithoutOffset(svm, dataset);
    }

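    /// \brief Set the number of iterations of the alternating optimization scheme; the constructor sets a default of 4.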
    void setMaxIterations(std::size_t newIterations)
    { m_maxIterations = newIterations; }

private:

    /// Number of iterations of the alternating scheme to run
    std::size_t m_maxIterations;

    void trainWithoutOffset(MissingFeaturesKernelExpansion<InputType>& svm, LabeledData<InputType, unsigned int> const& dataset)
    {
        // Initialize the scaling coefficients to 1.0
        RealVector scalingCoefficients(dataset.numberOfElements(), 1.0);

        // What the body of this loop does:
        // *) Solve the QP with a normal solver, treating the s_i as constants
        // *) Calculate the norms w_i and w
        // *) Update each s_i to w_i / w
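        // Here w is the norm of the full weight vector and w_i the norm computed
        // in the feature subspace present in example i, so that s_i = w_i / w
        // rescales each example's instance-specific margin.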
        for(std::size_t iteration = 0; iteration != m_maxIterations; ++iteration){
            // Set up the problem
            typedef ExampleModifiedKernelMatrix<InputType, QpFloatType> MatrixType;
            typedef CachedMatrix<MatrixType> CachedMatrixType;
            typedef CSVMProblem<CachedMatrixType> SVMProblemType;
            typedef SvmShrinkingProblem<SVMProblemType> ProblemType;
            MatrixType kernelMatrix(*base_type::m_kernel, dataset.inputs());
            kernelMatrix.setScalingCoefficients(scalingCoefficients);
            CachedMatrixType matrix(&kernelMatrix);
            SVMProblemType svmProblem(matrix, dataset.labels(), this->C());
            ProblemType problem(svmProblem, base_type::m_shrinking);

            // Solve it
            QpSolver<ProblemType> solver(problem);
            solver.solve(base_type::stoppingCondition(), &base_type::solutionProperties());
            RealVector alpha = problem.getUnpermutedAlpha();

            // Update the s_i using the freshly computed norms
            const double classifierNorm = svm.computeNorm(alpha, scalingCoefficients);
            SHARK_ASSERT(classifierNorm > 0.0);
            for (std::size_t i = 0; i < scalingCoefficients.size(); ++i)
            {
                // Update the scaling coefficient of example i to w_i / w
                scalingCoefficients(i) = svm.computeNorm(
                    alpha,
                    scalingCoefficients,
                    dataset.element(i).input)
                    / classifierNorm;
            }

            // Store the alpha of the last iteration inside the SVM
            if(iteration == m_maxIterations - 1)
                column(svm.alpha(), 0) = alpha;

            // Keep track of the number of kernel evaluations
            base_type::m_accessCount += kernelMatrix.getAccessCount();
        }
        svm.setScalingCoefficients(scalingCoefficients);
    }
    void trainWithOffset(MissingFeaturesKernelExpansion<InputType>& svm, LabeledData<InputType, unsigned int> const& dataset)
    {
        // Initialize the scaling coefficients to 1.0
        std::size_t datasetSize = dataset.numberOfElements();
        RealVector scalingCoefficients(datasetSize, 1.0);

        // What the body of this loop does:
        // *) Solve the QP with a normal solver, treating the s_i as constants
        // *) Calculate the norms w_i and w
        // *) Update each s_i to w_i / w
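        // The loop mirrors trainWithoutOffset(); the only addition is the
        // computation of the offset b in the final iteration.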
        for(std::size_t iteration = 0; iteration != m_maxIterations; ++iteration){
            // Set up the problem
            typedef ExampleModifiedKernelMatrix<InputType, QpFloatType> MatrixType;
            typedef CachedMatrix<MatrixType> CachedMatrixType;
            typedef CSVMProblem<CachedMatrixType> SVMProblemType;
            typedef SvmShrinkingProblem<SVMProblemType> ProblemType;
            MatrixType kernelMatrix(*base_type::m_kernel, dataset.inputs());
            kernelMatrix.setScalingCoefficients(scalingCoefficients);
            CachedMatrixType matrix(&kernelMatrix);
            SVMProblemType svmProblem(matrix, dataset.labels(), this->C());
            ProblemType problem(svmProblem, base_type::m_shrinking);

            // Solve it
            QpSolver<ProblemType> solver(problem);
            solver.solve(base_type::stoppingCondition(), &base_type::solutionProperties());
            RealVector unpermutedAlpha = problem.getUnpermutedAlpha();

            // Update the s_i using the freshly computed norms
            const double classifierNorm = svm.computeNorm(unpermutedAlpha, scalingCoefficients);
            SHARK_ASSERT(classifierNorm > 0.0);
            for (std::size_t i = 0; i < scalingCoefficients.size(); ++i)
            {
                // Update the scaling coefficient of example i to w_i / w
                scalingCoefficients(i) = svm.computeNorm(
                    unpermutedAlpha,
                    scalingCoefficients,
                    dataset.element(i).input)
                    / classifierNorm;
            }

            if(iteration == m_maxIterations - 1){
                // In the last iteration, compute the offset (i.e., b, the bias)
                // and push it along with alpha into the SVM
                column(svm.alpha(), 0) = unpermutedAlpha;
                double lowerBound = -1e100;
                double upperBound = 1e100;
                double sum = 0.0;
                std::size_t freeVars = 0;

                // No particular reason to initialize these, except to avoid compiler warnings
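                // For a free variable, the KKT conditions make its gradient equal
                // the offset b exactly, so b is recovered by averaging over all
                // free variables; if none are free, b is only bracketed between
                // the gradients at the box constraints.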
                for (std::size_t i = 0; i < datasetSize; i++)
                {
                    // In case of no free SVs, we are looking for the largest gradient of all alphas at the lower bound
                    // and the smallest gradient of all alphas at the upper bound
                    const double value = problem.gradient(i);
                    if (problem.alpha(i) == problem.boxMin(i)){
                        lowerBound = std::max(value, lowerBound);
                    }
                    else if (problem.alpha(i) == problem.boxMax(i)){
                        upperBound = std::min(value, upperBound);
                    }
                    else{
                        sum += value;
                        freeVars++;
                    }
                }

                if (freeVars > 0) {
                    // Stabilized (averaged) exact value
                    svm.offset(0) = sum / freeVars;
                } else {
                    // TODO: work out how to calculate the derivative with missing features

                    // Return the best estimate
                    svm.offset(0) = 0.5 * (lowerBound + upperBound);
                }
            }

            // Keep track of the number of kernel evaluations
            base_type::m_accessCount += kernelMatrix.getAccessCount();
        }
        svm.setScalingCoefficients(scalingCoefficients);
    }

};

} // namespace shark

#endif