//===========================================================================
/*!
 *
 *
 * \brief       LASSO Regression
 *
 *
 *
 * \author      T. Glasmachers
 * \date        2013
 *
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <https://shark-ml.github.io/Shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================


#ifndef SHARK_ALGORITHMS_TRAINERS_LASSOREGRESSION_H
#define SHARK_ALGORITHMS_TRAINERS_LASSOREGRESSION_H

#include <shark/Algorithms/Trainers/AbstractTrainer.h>
#include <shark/Models/LinearModel.h>
#include <cmath>
#include <iostream>


namespace shark {


/// \brief LASSO Regression
///
/// LASSO Regression extracts a sparse vector of regression
/// coefficients. The original method amounts to L1-constrained
/// least squares regression, while this implementation uses an
/// L1 penalty instead of a constraint (which is equivalent).
///
/// For data vectors \f$ x_i \f$ with real-valued labels \f$ y_i \f$
/// the trainer solves the problem
/// \f$ \min_w \quad \frac{1}{2} \sum_i (w^T x_i - y_i)^2 + \lambda \|w\|_1 \f$.
/// The target accuracy of the solution is measured in terms of the
/// largest component of the gradient of the objective function,
/// i.e., the maximal violation of the optimality conditions.
///
/// The trainer has one template parameter, namely the type of
/// the input vectors \f$ x_i \f$. These need to be vector valued,
/// typically either RealVector or CompressedRealVector. The
/// resulting weight vector w is represented by a LinearModel
/// object. Currently model outputs and labels are restricted to a
/// single dimension.
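///
/// A minimal usage sketch (illustrative only; the dataset is assumed to be a
/// RegressionDataset with one-dimensional labels):
/// \code
/// RegressionDataset data = ...;               // training data with 1-d labels
/// LinearModel<RealVector> model;              // will hold the sparse weight vector
/// LassoRegression<RealVector> trainer(0.1);   // lambda = 0.1
/// trainer.train(model, data);
/// RealMatrix weights = model.matrix();        // 1 x dim matrix of coefficients
/// \endcode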
/// \ingroup supervised_trainer
template <class InputVectorType = RealVector>
class LassoRegression : public AbstractTrainer<LinearModel<InputVectorType> >, public IParameterizable<>
{
public:
    typedef LinearModel<InputVectorType> ModelType;
    typedef LabeledData<InputVectorType, RealVector> DataType;

    /// \brief Constructor.
    ///
    /// \param  lambda    value of the regularization parameter (see class description)
    /// \param  accuracy  stopping criterion for the iterative solver, maximal gradient component of the objective function (see class description)
    LassoRegression(double lambda, double accuracy = 0.01)
    : m_lambda(lambda)
    , m_accuracy(accuracy)
    {
        RANGE_CHECK(m_lambda >= 0.0);
        RANGE_CHECK(m_accuracy > 0.0);
    }

    /// \brief From INameable: return the class name.
    std::string name() const
    { return "LASSO regression"; }


    /// \brief Return the current setting of the regularization parameter.
    double lambda() const
    {
        return m_lambda;
    }

    /// \brief Set the regularization parameter.
    void setLambda(double lambda)
    {
        RANGE_CHECK(lambda >= 0.0);
        m_lambda = lambda;
    }

    /// \brief Return the current setting of the accuracy (maximal gradient component of the optimization problem).
    double accuracy() const
    {
        return m_accuracy;
    }

    /// \brief Set the accuracy (maximal gradient component of the optimization problem).
    void setAccuracy(double accuracy)
    {
        RANGE_CHECK(accuracy > 0.0);
        m_accuracy = accuracy;
    }

    /// \brief Get the regularization parameter lambda through the IParameterizable interface.
    RealVector parameterVector() const
    {
        return RealVector(1, m_lambda);
    }

    /// \brief Set the regularization parameter lambda through the IParameterizable interface.
    void setParameterVector(const RealVector& param)
    {
        SIZE_CHECK(param.size() == 1);
        RANGE_CHECK(param(0) >= 0.0);
        m_lambda = param(0);
    }

    /// \brief Return the number of parameters (one in this case).
    size_t numberOfParameters() const
    {
        return 1;
    }

    /// \brief Train a linear model with LASSO regression.
    void train(ModelType& model, DataType const& dataset){

        // strategy constants
        const double CHANGE_RATE = 0.2;
        const double PREF_MIN = 0.05;
        const double PREF_MAX = 20.0;
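        // The solver is an adaptive coordinate descent method: coordinates are
        // scheduled with probabilities proportional to per-coordinate "preferences",
        // which are adapted online from the gain achieved by each update.
        // CHANGE_RATE is the multiplicative adaptation rate of the preferences;
        // PREF_MIN and PREF_MAX clip them so the schedule cannot degenerate.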

        // console output
        const bool verbose = false;

        std::size_t dim = inputDimension(dataset);
        RealVector w(dim, 0.0);

        // transpose the dataset and push it inside a single matrix
        auto data = createBatch(dataset.inputs().elements());
        data = trans(data);
        auto label_mat = createBatch(dataset.labels().elements());
        RealVector label = column(label_mat,0);
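        // After transposition, row(data, i) holds the values of feature i across
        // all training points, so feature norms and coordinate gradients below
        // reduce to operations on a single row.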

        RealVector diag(dim);
        RealVector difference = -label;
        UIntVector index(dim);
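        // "difference" maintains the residual w^T x_j - y_j for every training
        // point; since w starts at zero it is initialized to -label and is
        // updated incrementally after each coordinate step.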

        // pre-calculate diagonal matrix entries (feature-wise squared norms)
        for (size_t i=0; i<dim; i++){
            diag[i] = norm_sqr(row(data,i));
        }
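        // diag[i] is the curvature (second derivative) of the quadratic loss
        // along coordinate i; it serves as the denominator of the exact
        // coordinate-wise Newton step computed in the main loop.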

        // prepare preferences for scheduling
        RealVector pref(dim, 1.0);
        double prefsum = (double)dim;

        // prepare performance monitoring for self-adaptation
        const double gain_learning_rate = 1.0 / dim;
        double average_gain = 0.0;
        bool canstop = true;
        const double lambda = m_lambda;

        // main optimization loop
        std::size_t iter = 0;
        std::size_t steps = 0;
        while (true)
        {
            double maxvio = 0.0;

            // define schedule
            double psum = prefsum;
            prefsum = 0.0;
            int pos = 0;

            for (std::size_t i=0; i<dim; i++)
            {
                double p = pref[i];
                double n;
                if (psum >= 1e-6 && p < psum)
                    n = (dim - pos) * p / psum;
                else
                    n = (double)(dim - pos);   // for numerical stability

                unsigned int m = (unsigned int)floor(n);
                double prob = n - m;
                if ((double)rand() / (double)RAND_MAX < prob) m++;
                for (std::size_t j=0; j<m; j++)
                {
                    index[pos] = (unsigned int)i;
                    pos++;
                }
                psum -= p;
                prefsum += p;
            }
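            // each coordinate received roughly dim * pref[i] / (sum of prefs)
            // slots; randomly permute the schedule so that the updates are not
            // grouped by coordinate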
            for (std::size_t i=0; i<dim; i++)
            {
                std::size_t r = rand() % dim;
                std::swap(index[r], index[i]);
            }

            steps += dim;
            for (size_t s=0; s<dim; s++)
            {
                std::size_t i = index[s];
                double a = w[i];
                double d = diag[i];

                // compute gradient
                double grad = inner_prod(difference, row(data,i));

                // compute optimal coordinate descent step and corresponding gain
                double vio = 0.0;
                double gain = 0.0;
                double delta = 0.0;
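                // Closed-form coordinate update: grad is the derivative of the
                // squared error term, d its curvature. Minimizing the objective
                // along coordinate i gives a Newton/soft-thresholding step
                // delta = -(grad +/- lambda) / d, clipped so that w[i] does not
                // step across zero in a single move. "vio" measures how strongly
                // the optimality conditions are violated at the current point and
                // feeds the stopping criterion; "gain" estimates the progress made
                // by the step and drives the preference adaptation.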
                if (a == 0.0)
                {
                    if (grad > lambda)
                    {
                        vio = grad - lambda;
                        delta = -vio / d;
                        gain = 0.5 * d * delta * delta;
                    }
                    else if (grad < -lambda)
                    {
                        vio = -grad - lambda;
                        delta = vio / d;
                        gain = 0.5 * d * delta * delta;
                    }
                }
                else if (a > 0.0)
                {
                    grad += lambda;
                    vio = std::fabs(grad);
                    delta = -grad / d;
                    if (delta < -a)
                    {
                        delta = -a;
                        gain = delta * (grad - 0.5 * d * delta);
                        double g0 = grad - a * d - 2.0 * lambda;
                        if (g0 > 0.0)
                        {
                            double dd = -g0 / d;
                            gain = dd * (grad - 0.5 * d * dd);
                            delta += dd;
                        }
                    }
                    else gain = 0.5 * d * delta * delta;
                }
                else
                {
                    grad -= lambda;
                    vio = std::fabs(grad);
                    delta = -grad / d;
                    if (delta > -a)
                    {
                        delta = -a;
                        gain = delta * (grad - 0.5 * d * delta);
                        double g0 = grad - a * d + 2.0 * lambda;
                        if (g0 < 0.0)
                        {
                            double dd = -g0 / d;
                            gain = dd * (grad - 0.5 * d * dd);
                            delta += dd;
                        }
                    }
                    else gain = 0.5 * d * delta * delta;
                }

                // update state
                if (vio > maxvio) maxvio = vio;
                if (delta != 0.0)
                {
                    w[i] += delta;
                    noalias(difference) += delta*row(data,i);
                }

                // update gain-based preferences
                {
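                    // Preferences are adapted multiplicatively: coordinates whose
                    // last update achieved more than the running average gain are
                    // scheduled more often in subsequent sweeps, others less often.
                    // The first sweep only collects the average gain.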
                    if (iter == 0)
                        average_gain += gain / (double)dim;
                    else
                    {
                        double change = CHANGE_RATE * (gain / average_gain - 1.0);
                        double newpref = pref[i] * std::exp(change);
                        newpref = std::min(std::max(newpref, PREF_MIN), PREF_MAX);
                        prefsum += newpref - pref[i];
                        pref[i] = newpref;
                        average_gain = (1.0 - gain_learning_rate) * average_gain + gain_learning_rate * gain;
                    }
                }
            }
            iter++;

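            // Stopping criterion: the maximal optimality violation of the sweep
            // must fall below the target accuracy, and this is only trusted if
            // the sweep was a full, uniformly scheduled one (otherwise such a
            // verification sweep is prepared first).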
            if (maxvio <= m_accuracy)
            {
                if (canstop)
                    break;
                else
                {
                    // prepare full sweep for a reliable check of the stopping criterion
                    canstop = true;
                    noalias(pref) = blas::repeat(1.0, dim);
                    prefsum = (double)dim;
                    if (verbose) std::cout << "*" << std::flush;
                }
            }
            else
            {
                canstop = false;
                if (verbose) std::cout << "." << std::flush;
            }
        }

        // write the weight vector into the model
        RealMatrix mat(1, w.size());
        row(mat, 0) = w;
        model.setStructure(mat);
    }

protected:
    double m_lambda;               ///< regularization parameter
    double m_accuracy;             ///< gradient accuracy
};


}
#endif