LinearSAGTrainer.h
//===========================================================================
/*!
 *
 *
 * \brief       Generic Stochastic Average Gradient Descent training for linear models
 *
 *
 *
 *
 * \author      O. Krause
 * \date        2016
 *
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <https://shark-ml.github.io/Shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================


#ifndef SHARK_ALGORITHMS_LinearSAGTrainer_H
#define SHARK_ALGORITHMS_LinearSAGTrainer_H


#include <shark/Algorithms/Trainers/AbstractWeightedTrainer.h>
#include <shark/Core/Random.h>
#include <shark/Models/LinearModel.h>
#include <shark/Models/LinearClassifier.h>
#include <shark/ObjectiveFunctions/Loss/AbstractLoss.h>
#include <shark/Statistics/Distributions/MultiNomialDistribution.h>
#include <shark/Data/WeightedDataset.h>
#include <shark/Data/DataView.h>


namespace shark
{

namespace detail{
	template<class InputType, class LabelType>
	struct LinearSAGTrainerBase{
		typedef AbstractWeightedTrainer< LinearModel<InputType>, LabelType > type;
		typedef AbstractLoss<LabelType,RealVector> LossType;
	};
	template<class InputType>
	struct LinearSAGTrainerBase<InputType, unsigned int>{
		typedef AbstractWeightedTrainer< LinearClassifier<InputType>, unsigned int > type;
		typedef AbstractLoss<unsigned int,RealVector> LossType;
	};
}
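// The trait above picks the trainer base class from the label type: unsigned int labels
// select a LinearClassifier trained via AbstractWeightedTrainer (classification), any
// other label type selects a plain LinearModel (multi-output regression).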


///
/// \brief Stochastic Average Gradient method for training linear models.
///
/// Given a differentiable loss function L(f, y) and a linear model f_j(x) = w_j^T x + b_j,
/// this trainer solves the regularized risk minimization problem
/// \f[
///     \min_w \sum_j \frac{\lambda}{2}\|w_j\|^2 + \frac{1}{\ell} \sum_i L(y_i, f(x_i)),
/// \f]
/// where i runs over the training data, j over the model outputs, and lambda > 0 is the
/// regularization parameter.
///
/// The algorithm maintains an average of previously computed gradients to obtain a good
/// estimate of the full gradient: it sums, over all data points, the gradient value last
/// obtained for that point. At the beginning this estimate is far off because old gradient
/// values are outdated, but as the algorithm converges, this yields linear convergence on
/// strictly convex functions and O(1/T) convergence on non-strictly convex functions.
///
/// The algorithm supports classification and regression, dense and sparse inputs,
/// and weighted and unweighted datasets.
///
/// Reference:
/// Schmidt, Mark, Nicolas Le Roux, and Francis Bach.
/// "Minimizing finite sums with the stochastic average gradient."
/// arXiv preprint arXiv:1309.2388 (2013).
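///
/// \par Example
/// A minimal usage sketch, assuming a weighted regression dataset \c data of type
/// WeightedLabeledData<RealVector, RealVector> and the SquaredLoss loss function:
/// \code
/// SquaredLoss<RealVector> loss;                                 // differentiable loss
/// LinearSAGTrainer<RealVector, RealVector> trainer(&loss, 0.1); // lambda = 0.1, offset enabled
/// trainer.setEpochs(50);                                        // 0 selects the default schedule
/// LinearModel<RealVector> model;
/// trainer.train(model, data);                                   // runs SAG and sets the model parameters
/// \endcode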
/// \ingroup supervised_trainer
template <class InputType, class LabelType>
class LinearSAGTrainer : public detail::LinearSAGTrainerBase<InputType,LabelType>::type, public IParameterizable<>
{
private:
	typedef typename detail::LinearSAGTrainerBase<InputType,LabelType>::type Base;
public:
	typedef typename Base::ModelType ModelType;
	typedef typename Base::WeightedDatasetType WeightedDatasetType;
	typedef typename detail::LinearSAGTrainerBase<InputType,LabelType>::LossType LossType;


	/// \brief Constructor
	///
	/// \param loss (sub-)differentiable loss function
	/// \param lambda regularization parameter for two-norm regularization, 0 by default
	/// \param offset whether to train with an offset/bias parameter or not, default is true
	LinearSAGTrainer(LossType const* loss, double lambda = 0, bool offset = true)
	: mep_loss(loss)
	, m_lambda(lambda)
	, m_offset(offset)
	, m_maxEpochs(0)
	{ }

	/// \brief From INameable: return the class name.
	std::string name() const
	{ return "LinearSAGTrainer"; }

	using Base::train;
	void train(ModelType& model, WeightedDatasetType const& dataset){
		trainImpl(random::globalRng, model, dataset, *mep_loss);
	}


	/// \brief Return the number of training epochs.
	///
	/// A value of 0 indicates that the default of max(10, dimensionOfData) should be used.
	std::size_t epochs() const
	{ return m_maxEpochs; }

	/// \brief Set the number of training epochs.
	///
	/// A value of 0 indicates that the default of max(10, dimensionOfData) should be used.
	void setEpochs(std::size_t value)
	{ m_maxEpochs = value; }


	/// \brief Return the value of the regularization parameter lambda.
	double lambda() const
	{ return m_lambda; }

	/// \brief Set the value of the regularization parameter lambda.
	void setLambda(double lambda)
	{ m_lambda = lambda; }

	/// \brief Check whether the model to be trained should include an offset term.
	bool trainOffset() const
	{ return m_offset; }

	/// \brief Sets whether the model to be trained should include an offset term.
	void setTrainOffset(bool offset)
	{ m_offset = offset; }

	/// \brief Returns the vector of hyper-parameters (same as lambda).
	RealVector parameterVector() const
	{
		return RealVector(1, m_lambda);
	}

	/// \brief Sets the vector of hyper-parameters (same as lambda).
	void setParameterVector(RealVector const& newParameters)
	{
		SIZE_CHECK(newParameters.size() == 1);
		m_lambda = newParameters(0);
	}

	/// \brief Returns the number of hyper-parameters.
	size_t numberOfParameters() const
	{
		return 1;
	}

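	// A tuning sketch (hypothetical trainer instance named sag): since lambda is the only
	// hyper-parameter exposed above, model-selection code can drive it through the
	// IParameterizable interface as well:
	//   sag.setParameterVector(RealVector(1, 1.e-4)); // same effect as sag.setLambda(1.e-4)
	//   double current = sag.parameterVector()(0);    // reads lambda back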
private:
	//initializes the model in the classification case and calls iterate to train it
	void trainImpl(
		random::rng_type& rng,
		LinearClassifier<InputType>& classifier,
		WeightedLabeledData<InputType, unsigned int> const& dataset,
		AbstractLoss<unsigned int,RealVector> const& loss
	){
		//initialize model
		std::size_t classes = numberOfClasses(dataset);
		if(classes == 2) classes = 1;//special case: binary classification is encoded by the sign of a single output
		std::size_t dim = inputDimension(dataset);
		auto& model = classifier.decisionFunction();
		model.setStructure(dim, classes, m_offset);

		iterate(rng, model, dataset, loss);
	}
	//initializes the model in the regression case and calls iterate to train it
	template<class LabelT>
	void trainImpl(
		random::rng_type& rng,
		LinearModel<InputType>& model,
		WeightedLabeledData<InputType, LabelT> const& dataset,
		AbstractLoss<LabelT,RealVector> const& loss
	){
		//initialize model
		std::size_t labelDim = labelDimension(dataset);
		std::size_t dim = inputDimension(dataset);
		model.setStructure(dim, labelDim, m_offset);
		iterate(rng, model, dataset, loss);
	}

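	// Both iterate() overloads below perform the same SAG step. With picking probabilities p_i,
	// the last stored loss derivatives D_i (columns of gradD) and the running gradient estimate
	// g = sum_i p_i * D_i * x_i^T, one iteration on a sampled point b amounts to (sketch, with
	// step size eta = 1/(L + lambda)):
	//   g      += p_b * (dL(y_b, f(x_b)) - D_b) * x_b^T,   then D_b = dL(y_b, f(x_b))
	//   W       = (1 - eta*lambda) * W - eta * g
	//   offset -= eta * sum_i p_i * D_i                    (only if an offset is trained)
	// followed by the line-search update of the Lipschitz estimate L.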
	//dense vector case is easier, mostly implemented for simple exposition of the algorithm
	template<class T>
	void iterate(
		random::rng_type& rng,
		LinearModel<blas::vector<T> >& model,
		WeightedLabeledData<blas::vector<T>, LabelType> const& dataset,
		AbstractLoss<LabelType,RealVector> const& loss
	){

		//get stats of the dataset
		DataView<LabeledData<InputType, LabelType> const> data(dataset.data());
		std::size_t ell = data.size();
		std::size_t labelDim = model.outputShape().numElements();
		std::size_t dim = model.inputShape().numElements();

		//set number of iterations
		std::size_t iterations = m_maxEpochs * ell;
		if(m_maxEpochs == 0)
			iterations = std::max(10 * ell, std::size_t(std::ceil(dim * ell)));

		//picking distribution picks proportional to weight
		RealVector probabilities = createBatch(dataset.weights().elements());
		probabilities /= sum(probabilities);
		MultiNomialDistribution dist(probabilities);

		//variables used for the SAG loop
		RealMatrix gradD(labelDim,ell,0); // gradients of regularized loss minimization with a linear model have the form sum_i D_i*x_i. We store the last acquired estimate
		RealMatrix grad(labelDim,dim);// gradient estimate of the weight matrix.
		RealVector gradOffset(labelDim,0); //sum_i D_i, gradient estimate for the offset
		RealVector pointNorms(ell); //squared norm of each point in the dataset
		for(std::size_t i = 0; i != ell; ++i){
			pointNorms(i) = norm_sqr(data[i].input);
		}
		// preinitialize everything to prevent costly memory allocations in the loop
		RealVector f_b(labelDim, 0.0); // prediction of the model
		RealVector derivative(labelDim, 0.0); //derivative of the loss
		double L = 1; // initial estimate for the Lipschitz constant

		// SAG loop
		for(std::size_t iter = 0; iter < iterations; iter++)
		{
			// choose data point
			std::size_t b = dist(rng);

			// compute prediction
			noalias(f_b) = prod(model.matrix(), data[b].input);
			if(m_offset) noalias(f_b) += model.offset();

			// compute loss gradient
			double currentValue = loss.evalDerivative(data[b].label, f_b, derivative);

			//update the gradient estimate with the new loss derivative of point b
			noalias(grad) += probabilities(b) * outer_prod(derivative-column(gradD,b), data[b].input);
			if(m_offset) noalias(gradOffset) += probabilities(b) *(derivative-column(gradD,b));
			noalias(column(gradD,b)) = derivative; //we got a new estimate for D of element b.

			// update the model: 2-norm shrinkage followed by a gradient step
			double eta = 1.0/(L+m_lambda);
			noalias(model.matrix()) *= 1 - eta * m_lambda;//2-norm regularization
			for(std::size_t i = 0; i != labelDim; ++i){
				for(std::size_t j = 0; j != dim; ++j){
					model.matrix()(i,j) -= eta*grad(i,j);
				}
			}
			//~ noalias(model.matrix()) -= eta * grad;
			if(m_offset) noalias(model.offset()) -= eta * gradOffset;

			//line-search procedure, Section 4.6 in the paper
			noalias(f_b) -= derivative/L*pointNorms(b);
			double newValue = loss.eval(data[b].label, f_b);
			if(norm_sqr(derivative)*pointNorms(b) > 1.e-8 && newValue > currentValue - 1/(2*L)*norm_sqr(derivative)*pointNorms(b)){
				L *= 2;
			}
			L *= std::pow(2.0,-1.0/ell);//allow L to slightly shrink in case our initial estimate was too large
		}
	}

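	// The sparse overload below avoids touching the full weight matrix in every iteration.
	// A sketch of its bookkeeping: the 2-norm shrinkage is accumulated in the scalar kappa,
	// so the effective weights are kappa * model.matrix(); gradient steps are deferred by
	// accumulating them in stepsCumSum, while appliedRates(j) remembers how much of that sum
	// column j has already received. The outstanding difference is applied "just in time"
	// whenever feature j occurs in a sampled point, and for all columns once per epoch.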
	template<class T>
	void iterate(
		random::rng_type& rng,
		LinearModel<blas::compressed_vector<T> >& model,
		WeightedLabeledData<blas::compressed_vector<T>, LabelType> const& dataset,
		AbstractLoss<LabelType,RealVector> const& loss
	){

		//get stats of the dataset
		DataView<LabeledData<InputType, LabelType> const> data(dataset.data());
		std::size_t ell = data.size();
		std::size_t labelDim = model.outputSize();
		std::size_t dim = model.inputSize();

		//set number of iterations
		std::size_t iterations = m_maxEpochs * ell;
		if(m_maxEpochs == 0)
			iterations = std::max(10 * ell, std::size_t(std::ceil(dim * ell)));

		//picking distribution picks proportional to weight
		RealVector probabilities = createBatch(dataset.weights().elements());
		probabilities /= sum(probabilities);
		MultiNomialDistribution dist(probabilities);

		//variables used for the SAG loop
		blas::matrix<double,blas::column_major> gradD(labelDim,ell,0); // gradients of regularized loss minimization with a linear model have the form sum_i D_i*x_i. We store the last acquired estimate
		RealMatrix grad(labelDim,dim);// gradient estimate of the weight matrix.
		RealVector gradOffset(labelDim,0); //sum_i D_i, gradient estimate for the offset
		RealVector pointNorms(ell); //squared norm of each point in the dataset
		for(std::size_t i = 0; i != ell; ++i){
			pointNorms(i) = norm_sqr(data[i].input);
		}
		// preinitialize everything to prevent costly memory allocations in the loop
		RealVector f_b(labelDim, 0.0); // prediction of the model
		RealVector derivative(labelDim, 0.0); //derivative of the loss
		double kappa = 1; //we store the weight matrix as kappa*model.matrix() where kappa stores the effect of the 2-norm regularisation
		double L = 1; // initial estimate for the Lipschitz constant

		//just in time updating for sparse inputs.
		//we need to store a running sum of step lengths which we can then apply whenever an index gets updated
		RealVector appliedRates(dim,0.0);//applied steps since the last reset for each dimension
		double stepsCumSum = 0.0;// cumulative update steps


		// SAG loop
		for(std::size_t iter = 0; iter < iterations; iter++)
		{
			// choose data point
			std::size_t b = dist(rng);
			auto const& point = data[b];

			//just in time update of the previous steps for every nonzero element of point b
			auto end = point.input.end();
			for(auto pos = point.input.begin(); pos != end; ++pos){
				std::size_t index = pos.index();
				noalias(column(model.matrix(),index)) -= (stepsCumSum - blas::repeat(appliedRates(index),labelDim))*column(grad,index);
				appliedRates(index) = stepsCumSum;
			}

			// compute prediction
			noalias(f_b) = kappa * prod(model.matrix(), point.input);
			if(m_offset) noalias(f_b) += model.offset();

			// compute loss gradient
			double currentValue = loss.evalDerivative(point.label, f_b, derivative);

			//update gradient (needs to be multiplied with kappa)
			//~ noalias(grad) += probabilities(b) * outer_prod(derivative-column(gradD,b), point.input); //todo: this is slow for some reason.
			for(std::size_t l = 0; l != derivative.size(); ++l){
				double val = probabilities(b) * (derivative(l) - gradD(l,b));
				noalias(row(grad,l)) += val * point.input;
			}

			if(m_offset) noalias(gradOffset) += probabilities(b) *(derivative-column(gradD,b));
			noalias(column(gradD,b)) = derivative; //we got a new estimate for D of element b.

			// update the model; the weight matrix step is deferred via stepsCumSum
			double eta = 1.0/(L+m_lambda);
			stepsCumSum += kappa * eta;//we delay the update of the matrix
			if(m_offset) noalias(model.offset()) -= eta * gradOffset;
			kappa *= 1 - eta * m_lambda;//2-norm regularization

			//line-search procedure, Section 4.6 in the paper
			noalias(f_b) -= derivative/L*pointNorms(b);
			double newValue = loss.eval(point.label, f_b);
			if(norm_sqr(derivative)*pointNorms(b) > 1.e-8 && newValue > currentValue - 1/(2*L)*norm_sqr(derivative)*pointNorms(b)){
				L *= 2;
			}
			L *= std::pow(2.0,-1.0/ell);//allow L to slightly shrink in case our initial estimate was too large


			//every epoch we reset the internal variables for numeric stability and apply all outstanding updates
			//this also ensures that in the last iteration all updates are applied (as iterations is a multiple of ell)
			if((iter +1) % ell == 0){
				noalias(model.matrix()) -= (stepsCumSum - blas::repeat(appliedRates,labelDim))*grad;
				model.matrix() *= kappa;
				kappa = 1;
				stepsCumSum = 0.0;
				appliedRates.clear();
			}
		}
	}

	LossType const* mep_loss;   ///< pointer to loss function
	double m_lambda;            ///< regularization parameter
	bool m_offset;              ///< should the resulting model have an offset term?
	std::size_t m_maxEpochs;    ///< number of training epochs (sweeps over the data), or 0 for the default of max(10, dimensionOfData)
};

}
#endif