/// \brief From INameable: return the class name.
std::string name() const
{ return "LinearSAGTrainer"; }
/// \brief Return the maximum number of training epochs; 0 selects a default budget.
std::size_t maxEpochs() const
{ return m_maxEpochs; }

/// \brief Set the maximum number of training epochs.
void setMaxEpochs(std::size_t value)
{ m_maxEpochs = value; }
/// \brief Enable or disable training of the offset (bias) term.
void setTrainOffset(bool offset)
{ m_offset = offset; }

/// \brief Return the regularization strength lambda as the model's single parameter.
RealVector parameterVector() const{
    return RealVector(1, m_lambda);
}

/// \brief Set the regularization strength lambda from a one-element parameter vector.
void setParameterVector(RealVector const& newParameters){
    SIZE_CHECK(newParameters.size() == 1);
    m_lambda = newParameters(0);
}
// Classification case: binary problems are encoded by the sign of a single
// output, so two classes need only one output dimension.
void trainImpl(
    random::rng_type& rng,
    LinearClassifier<InputType>& classifier,
    WeightedLabeledData<InputType, unsigned int> const& dataset,
    AbstractLoss<unsigned int, RealVector> const& loss
){
    std::size_t classes = numberOfClasses(dataset);
    if(classes == 2) classes = 1;
    std::size_t dim = inputDimension(dataset);
    auto& model = classifier.decisionFunction();
    model.setStructure(dim, classes, m_offset);
    iterate(rng, model, dataset, loss);
}
// General case: labels are vectors, e.g. for regression.
template<class LabelT>
void trainImpl(
    random::rng_type& rng,
    LinearModel<InputType>& model,
    WeightedLabeledData<InputType, LabelT> const& dataset,
    AbstractLoss<LabelT, RealVector> const& loss
){
    std::size_t labelDim = labelDimension(dataset);
    std::size_t dim = inputDimension(dataset);
    model.setStructure(dim, labelDim, m_offset);
    iterate(rng, model, dataset, loss);
}
// Dense-input version of the SAG loop; the straightforward implementation.
template<class T, class LabelType>
void iterate(
    random::rng_type& rng,
    LinearModel<blas::vector<T> >& model,
    WeightedLabeledData<blas::vector<T>, LabelType> const& dataset,
    AbstractLoss<LabelType, RealVector> const& loss
){
    // Gather dataset statistics.
    DataView<LabeledData<InputType, LabelType> const> data(dataset.data());
    std::size_t ell = data.size();
    std::size_t labelDim = model.outputShape().numElements();
    std::size_t dim = model.inputShape().numElements();
    // Set the iteration budget; maxEpochs == 0 selects a heuristic default.
    std::size_t iterations = m_maxEpochs * ell;
    if(m_maxEpochs == 0)
        iterations = std::max(10 * ell, std::size_t(std::ceil(dim * ell)));
    // The sampling distribution draws each point with probability
    // proportional to its weight.
    RealVector probabilities = createBatch(dataset.weights().elements());
    probabilities /= sum(probabilities);
    MultiNomialDistribution dist(probabilities);
    // Variables used by the SAG loop. The gradient of the loss of point b
    // w.r.t. the weight matrix has the form D_b * x_b^T; gradD stores the
    // current D_b of every point, grad the running average gradient.
    RealMatrix gradD(labelDim, ell, 0);
    RealMatrix grad(labelDim, dim);
    RealVector gradOffset(labelDim, 0); // running average of the D_b, gradient of the offset
    RealVector pointNorms(ell); // squared norm of every input point
    for(std::size_t i = 0; i != ell; ++i){
        pointNorms(i) = norm_sqr(data[i].input);
    }
    // Preallocate buffers to avoid repeated allocations inside the loop.
    RealVector f_b(labelDim, 0.0); // prediction of the model
    RealVector derivative(labelDim, 0.0); // derivative of the loss
    double L = 1; // current estimate of the Lipschitz constant of the loss gradient
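
    // Overview of the update, as a sketch derived from the loop below: the
    // trainer minimizes the weighted, 2-norm regularized empirical risk
    //     J(W) = sum_b p_b * loss(y_b, W * x_b + offset) + (lambda/2) * |W|^2,
    // with p_b the normalized weight of point b. SAG keeps the running average
    //     grad = sum_b p_b * D_b * x_b^T
    // up to date by replacing only the term of the sampled point in each
    // iteration; the regularizer is handled separately by a multiplicative
    // shrink of the weight matrix.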
    // SAG loop
    for(std::size_t iter = 0; iter < iterations; iter++)
    {
        // Draw a point proportional to its weight.
        std::size_t b = dist(rng);

        // Compute the model's prediction for point b.
        noalias(f_b) = prod(model.matrix(), data[b].input);
        if(m_offset) noalias(f_b) += model.offset();

        // Evaluate the loss and its derivative at the prediction.
        double currentValue = loss.evalDerivative(data[b].label, f_b, derivative);
        // Replace the stored gradient of point b inside the running average;
        // this is a rank-one correction of grad.
        noalias(grad) += probabilities(b) * outer_prod(derivative - column(gradD, b), data[b].input);
        if(m_offset) noalias(gradOffset) += probabilities(b) * (derivative - column(gradD, b));
        noalias(column(gradD, b)) = derivative;
        // Gradient step with step size derived from the Lipschitz estimate;
        // the multiplicative shrink implements the 2-norm regularization.
        double eta = 1.0 / (L + m_lambda);
        noalias(model.matrix()) *= 1 - eta * m_lambda;
        for(std::size_t i = 0; i != labelDim; ++i){
            for(std::size_t j = 0; j != dim; ++j){
                model.matrix()(i, j) -= eta * grad(i, j);
            }
        }
        if(m_offset) noalias(model.offset()) -= eta * gradOffset;
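
        // Rationale for the test below: a single-point gradient step of size
        // 1/L changes the weight matrix by -(1/L) * derivative * x_b^T, so the
        // prediction for x_b changes by -(1/L) * derivative * |x_b|^2. If the
        // loss then fails to decrease by at least
        // (1/(2L)) * |derivative|^2 * |x_b|^2, L underestimates the local
        // Lipschitz constant and is doubled; this mirrors the line search in
        // section 4.6 of the SAG paper by Schmidt, Le Roux, and Bach.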
        // Line-search procedure for the Lipschitz estimate: f_b becomes the
        // prediction after a step of size 1/L on the sampled point alone.
        noalias(f_b) -= derivative / L * pointNorms(b);
        double newValue = loss.eval(data[b].label, f_b);
        if(norm_sqr(derivative) * pointNorms(b) > 1.e-8 && newValue > currentValue - 1 / (2 * L) * norm_sqr(derivative) * pointNorms(b)){
            L *= 2;
        }
        L *= std::pow(2.0, -1.0 / ell); // let L shrink slowly in case the estimate became too large
    }
}
// Sparse-input version of the SAG loop. Updating the full weight matrix in
// every iteration would cost O(labelDim * dim); this version applies updates
// lazily so each iteration only touches the nonzero coordinates of the
// sampled point.
template<class T, class LabelType>
void iterate(
    random::rng_type& rng,
    LinearModel<blas::compressed_vector<T> >& model,
    WeightedLabeledData<blas::compressed_vector<T>, LabelType> const& dataset,
    AbstractLoss<LabelType, RealVector> const& loss
){
    // Gather dataset statistics.
    DataView<LabeledData<InputType, LabelType> const> data(dataset.data());
    std::size_t ell = data.size();
    std::size_t labelDim = model.outputSize();
    std::size_t dim = model.inputSize();
    // Set the iteration budget; maxEpochs == 0 selects a heuristic default.
    std::size_t iterations = m_maxEpochs * ell;
    if(m_maxEpochs == 0)
        iterations = std::max(10 * ell, std::size_t(std::ceil(dim * ell)));
    RealVector probabilities = createBatch(dataset.weights().elements());
    probabilities /= sum(probabilities);
    MultiNomialDistribution dist(probabilities);
    // Variables used by the SAG loop; gradD is column major because its
    // columns (one D_b per point) are accessed individually.
    blas::matrix<double, blas::column_major> gradD(labelDim, ell, 0);
    RealMatrix grad(labelDim, dim);
    RealVector gradOffset(labelDim, 0);
    RealVector pointNorms(ell);
    for(std::size_t i = 0; i != ell; ++i){
        pointNorms(i) = norm_sqr(data[i].input);
    }
    // Preallocate buffers to avoid repeated allocations inside the loop.
    RealVector f_b(labelDim, 0.0); // prediction of the model
    RealVector derivative(labelDim, 0.0); // derivative of the loss
    double L = 1; // current estimate of the Lipschitz constant
    double kappa = 1; // scaling of the weight matrix; the true matrix is kappa * model.matrix()
    // Bookkeeping for the just-in-time updates of sparse inputs: a running sum
    // of step lengths, and the value of that sum at the last time each
    // coordinate was brought up to date.
    RealVector appliedRates(dim, 0.0);
    double stepsCumSum = 0.0;
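
    // How the lazy scheme works (a summary of the bookkeeping below): the
    // 2-norm shrink only rescales kappa instead of the whole matrix, and each
    // gradient step only adds its step length to stepsCumSum. A column j of
    // the matrix owes the correction
    //     (stepsCumSum - appliedRates(j)) * column(grad, j),
    // which is paid either when a sampled point has a nonzero in coordinate j
    // or at the epoch-boundary flush at the bottom of the loop. This is exact
    // because column j of grad can only change in an iteration whose sampled
    // point has a nonzero at j, and such an iteration flushes the column first.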
    // SAG loop
    for(std::size_t iter = 0; iter < iterations; iter++)
    {
        // Draw a point proportional to its weight.
        std::size_t b = dist(rng);
        auto const& point = data[b];

        // Just-in-time update: apply all outstanding steps to the columns
        // touched by the nonzero coordinates of point b.
        auto end = point.input.end();
        for(auto pos = point.input.begin(); pos != end; ++pos){
            std::size_t index = pos.index();
            noalias(column(model.matrix(), index)) -= (stepsCumSum - blas::repeat(appliedRates(index), labelDim)) * column(grad, index);
            appliedRates(index) = stepsCumSum;
        }
        // Compute the prediction; the stored matrix must be rescaled by kappa.
        noalias(f_b) = kappa * prod(model.matrix(), point.input);
        if(m_offset) noalias(f_b) += model.offset();

        // Evaluate the loss and its derivative at the prediction.
        double currentValue = loss.evalDerivative(point.label, f_b, derivative);
        // Replace the stored gradient of point b inside the running average,
        // row by row to exploit the sparsity of the input.
        for(std::size_t l = 0; l != derivative.size(); ++l){
            double val = probabilities(b) * (derivative(l) - gradD(l, b));
            noalias(row(grad, l)) += val * point.input;
        }
        if(m_offset) noalias(gradOffset) += probabilities(b) * (derivative - column(gradD, b));
        noalias(column(gradD, b)) = derivative;
        // Update the model lazily: record the step instead of applying it, and
        // let kappa absorb the 2-norm regularization shrink.
        double eta = 1.0 / (L + m_lambda);
        stepsCumSum += kappa * eta;
        if(m_offset) noalias(model.offset()) -= eta * gradOffset;
        kappa *= 1 - eta * m_lambda;
        // Line-search procedure for L, identical to the dense case.
        noalias(f_b) -= derivative / L * pointNorms(b);
        double newValue = loss.eval(point.label, f_b);
        if(norm_sqr(derivative) * pointNorms(b) > 1.e-8 && newValue > currentValue - 1 / (2 * L) * norm_sqr(derivative) * pointNorms(b)){
            L *= 2;
        }
        L *= std::pow(2.0, -1.0 / ell); // let L shrink slowly in case the estimate became too large
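
        // The flush below runs once per epoch. It serves two purposes: it
        // keeps kappa from underflowing after many multiplicative shrinks,
        // and, since the iteration budget is a multiple of ell, it guarantees
        // that every pending update has been applied when the loop finishes.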
        if((iter + 1) % ell == 0){
            noalias(model.matrix()) -= (stepsCumSum - blas::repeat(appliedRates, labelDim)) * grad;
            model.matrix() *= kappa;
            kappa = 1;
            stepsCumSum = 0.0;
            appliedRates.clear();
        }
    }
}

double m_lambda;         // regularization strength lambda
bool m_offset;           // whether the offset (bias) term is trained
std::size_t m_maxEpochs; // maximum number of epochs; 0 selects the default
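
// A usage sketch (illustrative only; the loss class, dataset variable, and
// constructor signature are assumptions, so check them against the installed
// Shark headers): training an L2-regularized logistic regression could look
// roughly like
//
//     CrossEntropy loss;
//     LinearSAGTrainer<RealVector, unsigned int> trainer(&loss, 0.1); // lambda = 0.1
//     trainer.setMaxEpochs(50);
//     LinearClassifier<RealVector> model;
//     trainer.train(model, trainingData); // trainingData: LabeledData<RealVector, unsigned int>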