ContrastiveDivergence.h
/*!
 *
 *
 * \brief -
 *
 * \author -
 * \date -
 *
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <https://shark-ml.github.io/Shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
#ifndef SHARK_UNSUPERVISED_RBM_GRADIENTAPPROXIMATIONS_CONTRASTIVEDIVERGENCE_H
#define SHARK_UNSUPERVISED_RBM_GRADIENTAPPROXIMATIONS_CONTRASTIVEDIVERGENCE_H

#include <shark/Data/Dataset.h>
#include <shark/ObjectiveFunctions/AbstractObjectiveFunction.h>
#include <algorithm>

namespace shark{
/// \brief Implements k-step Contrastive Divergence as described by Hinton et al. (2006).
///
/// k-step Contrastive Divergence approximates the gradient by initializing a Gibbs
/// chain with a training example and running it for k steps.
/// The sample obtained after k steps is then used to approximate the mean of the RBM distribution in the gradient.
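///
/// The derivative returned by evalDerivative() is the difference between the
/// statistics gathered from the chain state after k steps (model average) and the
/// statistics gathered from the training data (empirical average).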
template<class Operator>
class ContrastiveDivergence: public SingleObjectiveFunction{
public:
	typedef typename Operator::RBM RBM;

	/// \brief The constructor.
	///
	/// @param rbm pointer to the RBM which shall be trained
	ContrastiveDivergence(RBM* rbm)
	: mpe_rbm(rbm), m_operator(rbm)
	, m_k(1), m_numBatches(0), m_regularizer(0){
		SHARK_ASSERT(rbm != NULL);

		m_features.reset(HAS_VALUE);
		m_features |= HAS_FIRST_DERIVATIVE;
		m_features |= CAN_PROPOSE_STARTING_POINT;
	};

	/// \brief From INameable: return the class name.
	std::string name() const
	{ return "ContrastiveDivergence"; }

	/// \brief Sets the training data.
	///
	/// @param data the training data
	void setData(UnlabeledData<RealVector> const& data){
		m_data = data;
	}

	/// \brief Sets the value of k, the number of steps of the Gibbs chain.
	///
	/// @param k the number of steps
	void setK(unsigned int k){
		m_k = k;
	}

	SearchPointType proposeStartingPoint() const{
		return mpe_rbm->parameterVector();
	}

	/// \brief Returns the number of variables of the RBM.
	///
	/// @return the number of variables of the RBM
	std::size_t numberOfVariables()const{
		return mpe_rbm->numberOfParameters();
	}

	/// \brief Returns the number of batches of the dataset that are used in every iteration.
	///
	/// If it is less than the total number of batches, the batches are chosen at random. If it is 0, all batches are used.
	std::size_t numBatches()const{
		return m_numBatches;
	}

	/// \brief Returns a reference to the number of batches of the dataset that are used in every iteration.
	///
	/// If it is less than the total number of batches, the batches are chosen at random. If it is 0, all batches are used.
	std::size_t& numBatches(){
		return m_numBatches;
	}
103
104 void setRegularizer(double factor, SingleObjectiveFunction* regularizer){
105 m_regularizer = regularizer;
106 m_regularizationStrength = factor;
107 }
108
	/// \brief Computes the CD-k approximation of the log-likelihood gradient.
	///
	/// @param parameter the current parameters of the RBM
	/// @param derivative on return, holds the CD-k approximation of the log-likelihood gradient
	double evalDerivative( SearchPointType const& parameter, FirstOrderDerivative& derivative ) const{
		mpe_rbm->setParameterVector(parameter);
		derivative.resize(mpe_rbm->numberOfParameters());
		derivative.clear();

		std::size_t batchesForTraining = m_numBatches > 0? m_numBatches: m_data.numberOfBatches();
		std::size_t elements = 0;
		//get the batches for this iteration
		std::vector<std::size_t> batchIds(m_data.numberOfBatches());
		{
			for(std::size_t i = 0; i != m_data.numberOfBatches(); ++i){
				batchIds[i] = i;
			}
			std::shuffle(batchIds.begin(),batchIds.end(),mpe_rbm->rng());
			for(std::size_t i = 0; i != batchesForTraining; ++i){
				elements += m_data.batch(batchIds[i]).size1();
			}
		}

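		//split the selected batches as evenly as possible over the available threads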
		std::size_t threads = std::min<std::size_t>(batchesForTraining,SHARK_NUM_THREADS);
		std::size_t numBatches = batchesForTraining/threads;

		SHARK_PARALLEL_FOR(int t = 0; t < (int)threads; ++t){
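			//per-thread accumulators for the empirical (data) statistics and the model statistics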
			typename RBM::GradientType empiricalAverage(mpe_rbm);
			typename RBM::GradientType modelAverage(mpe_rbm);

			std::size_t threadElements = 0;

			std::size_t batchStart = t*numBatches;
			std::size_t batchEnd = (t == (int)threads-1)? batchesForTraining : batchStart+numBatches;
			for(std::size_t i = batchStart; i != batchEnd; ++i){
				RealMatrix const& batch = m_data.batch(batchIds[i]);
				threadElements += batch.size1();

				//create the sample batches for evaluation
				typename Operator::HiddenSampleBatch hiddenBatch(batch.size1(),mpe_rbm->numberOfHN());
				typename Operator::VisibleSampleBatch visibleBatch(batch.size1(),mpe_rbm->numberOfVN());
				visibleBatch.state = batch;
				m_operator.precomputeHidden(hiddenBatch,visibleBatch,blas::repeat(1.0,batch.size1()));
				m_operator.sampleHidden(hiddenBatch);
				empiricalAverage.addVH(hiddenBatch,visibleBatch);

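				//negative phase: run the Gibbs chain for k steps, starting from the training data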
				for(std::size_t step = 0; step != m_k; ++step){
					m_operator.precomputeVisible(hiddenBatch, visibleBatch, blas::repeat(1.0,batch.size1()));
					m_operator.sampleVisible(visibleBatch);
					m_operator.precomputeHidden(hiddenBatch, visibleBatch, blas::repeat(1.0,batch.size1()));
					if(step != m_k-1){
						m_operator.sampleHidden(hiddenBatch);
					}
				}
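				//the chain state after k steps serves as an approximate sample of the model distribution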
				modelAverage.addVH(hiddenBatch,visibleBatch);
			}

			SHARK_CRITICAL_REGION{
				double weight = threadElements/double(elements);
				noalias(derivative) += weight*(modelAverage.result() - empiricalAverage.result());
			}
		}

		if(m_regularizer){
			FirstOrderDerivative regularizerDerivative;
			m_regularizer->evalDerivative(parameter,regularizerDerivative);
			noalias(derivative) += m_regularizationStrength*regularizerDerivative;
		}

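		//only the gradient is approximated; the value of the objective function is not computed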
		return std::numeric_limits<double>::quiet_NaN();
	}

private:
	UnlabeledData<RealVector> m_data;
	RBM* mpe_rbm;
	Operator m_operator;
	unsigned int m_k; ///< number of Gibbs sampling steps
	std::size_t m_numBatches; ///< number of batches used in every iteration. 0 means all.

	SingleObjectiveFunction* m_regularizer;
	double m_regularizationStrength;
};

}

#endif
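
Usage sketch (not part of the header). The snippet below shows how this objective function is typically combined with a gradient-based optimizer. It assumes BinaryRBM, the BinaryCD typedef (ContrastiveDivergence instantiated with the binary Gibbs operator), initRandomUniform, dataDimension, and SteepestDescent from other parts of Shark; exact names and signatures may differ between Shark versions.

#include <shark/Unsupervised/RBM/BinaryRBM.h>
#include <shark/Algorithms/GradientDescent/SteepestDescent.h>

using namespace shark;

//trains a binary RBM on the given data with CD-1 (assumed API, see note above)
void trainRBM(UnlabeledData<RealVector> const& data, std::size_t numHidden){
	//as many visible units as the data has dimensions
	BinaryRBM rbm(random::globalRng);
	rbm.setStructure(dataDimension(data), numHidden);
	initRandomUniform(rbm, -0.1, 0.1);

	//CD-1 objective; numBatches() stays 0, so all batches are used in every iteration
	BinaryCD cd(&rbm);
	cd.setK(1);
	cd.setData(data);

	//plain gradient descent; evalDerivative returns NaN as function value by design
	SteepestDescent<> optimizer;
	optimizer.setLearningRate(0.1);
	cd.init();
	optimizer.init(cd);
	for(std::size_t i = 0; i != 100; ++i){
		optimizer.step(cd);
	}
}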