WeightedSumKernel.h
//===========================================================================
/*!
 *
 *
 * \brief       Weighted sum of base kernels.
 *
 *
 *
 * \author      T. Glasmachers, O. Krause, M. Tuma
 * \date        2010, 2011
 *
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <https://shark-ml.github.io/Shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================

#ifndef SHARK_MODELS_KERNELS_WEIGHTED_SUM_KERNEL_H
#define SHARK_MODELS_KERNELS_WEIGHTED_SUM_KERNEL_H


#include <shark/Models/Kernels/AbstractKernelFunction.h>

#include <boost/utility/enable_if.hpp>
namespace shark {


/// \brief Weighted sum of kernel functions
///
/// For a set of positive definite kernels \f$ k_1, \dots, k_n \f$
/// with positive coefficients \f$ w_1, \dots, w_n \f$ the sum
/// \f[ \tilde k(x_1, x_2) := \sum_{i=1}^{n} w_i \cdot k_i(x_1, x_2) \f]
/// is again a positive definite kernel function.
/// Internally, the weights are represented as
/// \f$ w_i = \exp(\xi_i) \f$
/// to allow for unconstrained optimization.
///
/// This variant of the weighted sum kernel eliminates one redundant
/// degree of freedom by fixing the first kernel weight to 1.0.
///
/// The result of the kernel evaluation is divided by the sum of the
/// kernel weights, so that in total, this amounts to fixing the sum
/// of the weights to one.
///
/// \ingroup kernels
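///
/// \par Example
/// A minimal usage sketch (sub-kernels and parameter values are chosen purely
/// for illustration; GaussianRbfKernel and LinearKernel come from Shark's
/// kernel collection and must be included separately):
/// \code
/// GaussianRbfKernel<> rbf(0.1);
/// LinearKernel<> linear;
/// std::vector<AbstractKernelFunction<RealVector>* > kernels;
/// kernels.push_back(&rbf);
/// kernels.push_back(&linear);
/// WeightedSumKernel<> kernel(kernels);  // both weights start at 1, i.e. the mean of the two kernels
/// kernel.setAdaptiveAll(true);          // also expose the sub-kernels' own parameters to the optimizer
/// \endcode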
template<class InputType=RealVector>
class WeightedSumKernel : public AbstractKernelFunction<InputType>
{
private:
	typedef AbstractKernelFunction<InputType> base_type;

	struct InternalState: public State{
		RealMatrix result;
		std::vector<RealMatrix> kernelResults;
		std::vector<boost::shared_ptr<State> > kernelStates;

		InternalState(std::size_t numSubKernels)
		:kernelResults(numSubKernels),kernelStates(numSubKernels){}

		void resize(std::size_t sizeX1, std::size_t sizeX2){
			result.resize(sizeX1, sizeX2);
			for(std::size_t i = 0; i != kernelResults.size(); ++i){
				kernelResults[i].resize(sizeX1, sizeX2);
			}
		}
	};
public:
	typedef typename base_type::BatchInputType BatchInputType;
	typedef typename base_type::ConstInputReference ConstInputReference;
	typedef typename base_type::ConstBatchInputReference ConstBatchInputReference;

	WeightedSumKernel(std::vector<AbstractKernelFunction<InputType>* > const& base){
		SHARK_RUNTIME_CHECK( base.size() > 0, "[WeightedSumKernel::WeightedSumKernel] There should be at least one sub-kernel.");

		m_base.resize( base.size() );
		m_numParameters = m_base.size()-1;
		m_adaptWeights = true;
		for (std::size_t i=0; i != m_base.size() ; i++) {
			SHARK_ASSERT( base[i] != NULL );
			m_base[i].kernel = base[i];
			m_base[i].weight = 1.0;
			m_base[i].adaptive = false;
		}
		m_weightsum = (double)m_base.size();

		// set base class features
		bool hasFirstParameterDerivative = true;
		for ( unsigned int i=0; i<m_base.size(); i++ ){
			if ( !m_base[i].kernel->hasFirstParameterDerivative() ) {
				hasFirstParameterDerivative = false;
				break;
			}
		}
		bool hasFirstInputDerivative = true;
		for ( unsigned int i=0; i<m_base.size(); i++ ){
			if ( !m_base[i].kernel->hasFirstInputDerivative() ) {
				hasFirstInputDerivative = false;
				break;
			}
		}

		if ( hasFirstParameterDerivative )
			this->m_features |= base_type::HAS_FIRST_PARAMETER_DERIVATIVE;

		if ( hasFirstInputDerivative )
			this->m_features |= base_type::HAS_FIRST_INPUT_DERIVATIVE;
	}

	/// \brief From INameable: return the class name.
	std::string name() const
	{ return "WeightedSumKernel"; }

	/// Check whether the base kernel with the given index is adaptive.
	bool isAdaptive( std::size_t index ) const {
		return m_base[index].adaptive;
	}
	/// Set adaptivity of the base kernel with the given index.
	void setAdaptive( std::size_t index, bool b = true ) {
		m_base[index].adaptive = b;
		updateNumberOfParameters();
	}
	/// Set adaptivity of all base kernels.
	void setAdaptiveAll( bool b = true ) {
		for (std::size_t i=0; i!= m_base.size(); i++)
			m_base[i].adaptive = b;
		updateNumberOfParameters();
	}

	/// Get the weight of a kernel.
	double weight(std::size_t index){
		RANGE_CHECK(index < m_base.size());
		return m_base[index].weight;
	}

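	/// Set whether the kernel weights themselves (the first N-1 parameters) are
	/// adapted, i.e. exposed through the parameter vector.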
	void setAdaptiveWeights(bool b){
		m_adaptWeights = b;
	}

	/// Return the parameter vector. The first N-1 entries are the (log-encoded) kernel
	/// weights; the adaptive sub-kernels' parameters are stacked behind each other after that.
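	/// For instance, with three sub-kernels of which only the third is adaptive
	/// (having parameters \f$ \theta_1, \dots, \theta_p \f$), the layout is
	/// \f$ (\log w_2, \log w_3, \theta_1, \dots, \theta_p) \f$; the first weight
	/// \f$ w_1 \f$ is fixed to 1 and is therefore not a parameter.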
	RealVector parameterVector() const {
		std::size_t num = numberOfParameters();
		RealVector ret(num);

		std::size_t index = 0;
		for (; index != m_base.size()-1; index++){
			ret(index) = std::log(m_base[index+1].weight);
		}
		for (std::size_t i=0; i != m_base.size(); i++){
			if (m_base[i].adaptive){
				std::size_t n = m_base[i].kernel->numberOfParameters();
				subrange(ret,index,index+n) = m_base[i].kernel->parameterVector();
				index += n;
			}
		}
		return ret;
	}

	///\brief Creates the internal state of the kernel.
	boost::shared_ptr<State> createState()const{
		InternalState* state = new InternalState(m_base.size());
		for(std::size_t i = 0; i != m_base.size(); ++i){
			state->kernelStates[i]=m_base[i].kernel->createState();
		}
		return boost::shared_ptr<State>(state);
	}

	/// Set the parameter vector. The first N-1 entries are the (log-encoded) kernel
	/// weights; the adaptive sub-kernels' parameters are stacked behind each other after that.
	void setParameterVector(RealVector const& newParameters) {
		SIZE_CHECK(newParameters.size() == numberOfParameters());

		m_weightsum = 1.0;
		std::size_t index = 0;
		for (; index != m_base.size()-1; index++){
			double w = newParameters(index);
			m_base[index+1].weight = std::exp(w);
			m_weightsum += m_base[index+1].weight;
		}
		for (std::size_t i=0; i != m_base.size(); i++){
			if (m_base[i].adaptive){
				std::size_t n = m_base[i].kernel->numberOfParameters();
				m_base[i].kernel->setParameterVector(subrange(newParameters,index,index+n));
				index += n;
			}
		}
	}

	std::size_t numberOfParameters() const {
		return m_numParameters;
	}

	/// Evaluate the weighted sum kernel according to the equation
	/// \f$ k(x, y) = \frac{\sum_i \exp(w_i) k_i(x, y)}{\sum_i \exp(w_i)} \f$.
	double eval(ConstInputReference x1, ConstInputReference x2) const{
		double numerator = 0.0;
		for (std::size_t i=0; i != m_base.size(); i++){
			double result = m_base[i].kernel->eval(x1, x2);
			numerator += m_base[i].weight*result;
		}
		return numerator / m_weightsum;
	}

	/// Evaluate the kernel according to the equation
	/// \f$ k(x, y) = \frac{\sum_i \exp(w_i) k_i(x, y)}{\sum_i \exp(w_i)} \f$
	/// for two batches of inputs.
	void eval(ConstBatchInputReference batchX1, ConstBatchInputReference batchX2, RealMatrix& result) const{
		std::size_t sizeX1 = batchSize(batchX1);
		std::size_t sizeX2 = batchSize(batchX2);
		ensure_size(result,sizeX1,sizeX2);
		result.clear();

		RealMatrix kernelResult(sizeX1,sizeX2);
		for (std::size_t i = 0; i != m_base.size(); i++){
			m_base[i].kernel->eval(batchX1, batchX2,kernelResult);
			result += m_base[i].weight*kernelResult;
		}
		result /= m_weightsum;
	}

	/// Evaluate the kernel according to the equation
	/// \f$ k(x, y) = \frac{\sum_i \exp(w_i) k_i(x, y)}{\sum_i \exp(w_i)} \f$
	/// for two batches of inputs. The intermediate sub-kernel results are
	/// stored in the state object for use by the derivative functions.
	void eval(ConstBatchInputReference batchX1, ConstBatchInputReference batchX2, RealMatrix& result, State& state) const{
		std::size_t sizeX1 = batchSize(batchX1);
		std::size_t sizeX2 = batchSize(batchX2);
		ensure_size(result,sizeX1,sizeX2);
		result.clear();

		InternalState& s = state.toState<InternalState>();
		s.resize(sizeX1,sizeX2);

		for (std::size_t i=0; i != m_base.size(); i++){
			m_base[i].kernel->eval(batchX1,batchX2,s.kernelResults[i],*s.kernelStates[i]);
			result += m_base[i].weight*s.kernelResults[i];
		}
		//store the un-normalized weighted sum for the derivative computations
		s.result=result;
		result /= m_weightsum;
	}

	void weightedParameterDerivative(
		ConstBatchInputReference batchX1,
		ConstBatchInputReference batchX2,
		RealMatrix const& coefficients,
		State const& state,
		RealVector& gradient
	) const{
		ensure_size(gradient,numberOfParameters());

		std::size_t numKernels = m_base.size(); //how far the loop goes

		InternalState const& s = state.toState<InternalState>();

		double sumSquared = sqr(m_weightsum); //denominator

		//First the derivative with respect to the (log-encoded) weight parameters.
		//The first weight is fixed to 1 and is not a parameter, so it needs no gradient entry
		//(strictly, its un-normalized result would not even need to be stored).
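		//Derivation of the gradient entries computed below: with W = sum_j w_j and
		//w_i = exp(xi_i), the normalized kernel is k = (sum_j w_j k_j) / W, hence
		//  dk/dxi_i = w_i * (k_i * W - sum_j w_j k_j) / W^2 .
		//Below, summedK is the coefficient-weighted batch sum of k_i and numeratorSum
		//the coefficient-weighted batch sum of sum_j w_j k_j (stored in s.result).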
		//calculate the weighted sum over all results
		double numeratorSum = sum(coefficients * s.result);
		for (std::size_t i = 1; i != numKernels && m_adaptWeights; i++) {
			//calculate the weighted sum over all results of this kernel
			double summedK=sum(coefficients * s.kernelResults[i]);
			gradient(i-1) = m_base[i].weight * (summedK * m_weightsum - numeratorSum) / sumSquared;
		}

		std::size_t gradPos = m_adaptWeights ? numKernels-1: 0; //starting position of the sub-kernel gradient
		RealVector kernelGrad;
		for (std::size_t i=0; i != numKernels; i++) {
			if (isAdaptive(i)){
				double coeff = m_base[i].weight / m_weightsum;
				m_base[i].kernel->weightedParameterDerivative(batchX1,batchX2,coefficients,*s.kernelStates[i],kernelGrad);
				std::size_t n = kernelGrad.size();
				noalias(subrange(gradient,gradPos,gradPos+n)) = coeff * kernelGrad;
				gradPos += n;
			}
		}
	}

	/// Input derivative, calculated according to the equation:
	/// <br/>
	/// \f$ \frac{\partial k(x, y)}{\partial x} =
	/// \frac{\sum_i \exp(w_i) \frac{\partial k_i(x, y)}{\partial x}}
	/// {\sum_i \exp(w_i)} \f$
	/// and summed up over all elements of the second batch
	void weightedInputDerivative(
		ConstBatchInputReference batchX1,
		ConstBatchInputReference batchX2,
		RealMatrix const& coefficientsX2,
		State const& state,
		BatchInputType& gradient
	) const{
		SIZE_CHECK(coefficientsX2.size1() == batchSize(batchX1));
		SIZE_CHECK(coefficientsX2.size2() == batchSize(batchX2));
		weightedInputDerivativeImpl<BatchInputType>(batchX1,batchX2,coefficientsX2,state,gradient);
	}

	void read(InArchive& ar){
		for(std::size_t i = 0;i != m_base.size(); ++i ){
			ar >> m_base[i].weight;
			ar >> m_base[i].adaptive;
			ar >> *(m_base[i].kernel);
		}
		ar >> m_weightsum;
		ar >> m_numParameters;
	}
	void write(OutArchive& ar) const{
		for(std::size_t i=0;i!= m_base.size();++i){
			ar << m_base[i].weight;
			ar << m_base[i].adaptive;
			ar << const_cast<AbstractKernelFunction<InputType> const&>(*(m_base[i].kernel));
		}
		ar << m_weightsum;
		ar << m_numParameters;
	}

protected:
	/// structure describing a single base kernel
	struct tBase
	{
		AbstractKernelFunction<InputType>* kernel; ///< pointer to the base kernel object
		double weight; ///< weight in the linear combination
		bool adaptive; ///< whether the kernel's parameters are part of the WeightedSumKernel's parameter vector
	};

	void updateNumberOfParameters(){
		m_numParameters = m_adaptWeights? m_base.size()-1 : 0;
		for (std::size_t i=0; i != m_base.size(); i++)
			if (m_base[i].adaptive)
				m_numParameters += m_base[i].kernel->numberOfParameters();
	}

	//We need to choose the correct implementation at compile time: if InputType does not
	//implement the needed operations, the implementation is replaced by a safe default
	//which throws an exception. For this we use enable_if/disable_if; the method is
	//called with BatchInputType as template argument. Real implementation first.
	template <class T>
	void weightedInputDerivativeImpl(
		ConstBatchInputReference batchX1,
		ConstBatchInputReference batchX2,
		RealMatrix const& coefficientsX2,
		State const& state,
		BatchInputType& gradient,
		typename boost::enable_if<boost::is_same<T,RealMatrix > >::type* dummy = 0
	)const{
		std::size_t numKernels = m_base.size(); //how far the loop goes
		InternalState const& s = state.toState<InternalState>();

		//initialize gradient with the first kernel
		m_base[0].kernel->weightedInputDerivative(batchX1, batchX2, coefficientsX2, *s.kernelStates[0], gradient);
		gradient *= m_base[0].weight / m_weightsum;
		BatchInputType kernelGrad;
		for (std::size_t i=1; i != numKernels; i++){
			m_base[i].kernel->weightedInputDerivative(batchX1, batchX2, coefficientsX2, *s.kernelStates[i], kernelGrad);
			double coeff = m_base[i].weight / m_weightsum;
			gradient += coeff * kernelGrad;
		}
	}
	template <class T>
	void weightedInputDerivativeImpl(
		ConstBatchInputReference batchX1,
		ConstBatchInputReference batchX2,
		RealMatrix const& coefficientsX2,
		State const& state,
		BatchInputType& gradient,
		typename boost::disable_if<boost::is_same<T,RealMatrix > >::type* dummy = 0
	)const{
		throw SHARKEXCEPTION("[WeightedSumKernel::weightedInputDerivative] The used BatchInputType is not a vector type");
	}

	std::vector<tBase> m_base; ///< collection of base kernels
	double m_weightsum; ///< sum of all weights
	std::size_t m_numParameters; ///< total number of parameters
	bool m_adaptWeights; ///< whether the weights should be adapted
};

typedef WeightedSumKernel<> DenseWeightedSumKernel;
typedef WeightedSumKernel<CompressedRealVector> CompressedWeightedSumKernel;

}
#endif