//===========================================================================
/*!
 *
 *
 * \brief Error measure for classification tasks that can be used
 *        as the objective function for training.
 *
 *
 *
 *
 * \author -
 * \date -
 *
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <https://shark-ml.github.io/Shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */

#ifndef SHARK_OBJECTIVEFUNCTIONS_LOSS_CROSS_ENTROPY_H
#define SHARK_OBJECTIVEFUNCTIONS_LOSS_CROSS_ENTROPY_H

#include <shark/ObjectiveFunctions/Loss/AbstractLoss.h>

namespace shark{

/// \brief Error measure for classification tasks that can be used
/// as the objective function for training.
///
/// If your model returns a vector whose components reflect the
/// logarithmic conditional probabilities of class membership given the input vector,
/// 'CrossEntropy' is the adequate error measure for model training.
/// For \em C > 1 classes the loss function is defined as
/// \f[
/// E = - \ln \frac{\exp(x_c)}{\sum_{c^{\prime}=1}^{C} \exp(x_{c^{\prime}})} = - x_c + \ln \sum_{c^{\prime}=1}^{C} \exp(x_{c^{\prime}})
/// \f]
/// where \em x is the prediction vector of the model and \em c is the class label. In the case of only one
/// model output and binary classification, a numerically more stable formulation is used:
/// \f[
/// E = \ln(1 + e^{-yx})
/// \f]
/// Here, \em y is the class label mapped to -1 or 1 via \f$ y = 2c - 1 \f$. This formulation is numerically more stable
/// because, when \f$ e^{-yx} \f$ is large, the error function is well approximated by the linear function \f$ -yx \f$, and
/// when the exponential is very small, the problematic case \f$ \ln(0) \f$ is avoided.
///
/// If the class labels are integers, they must start at 0. If the class labels are vectors, they must be proper
/// probability vectors, i.e. all values must be greater than or equal to zero and sum to one. This includes one-hot encoding of labels.
/// For theoretical reasons, the output neurons of a neural network trained with this loss should be linear.
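///
/// A minimal usage sketch (the numeric values are only illustrative); the batch interface
/// defined below takes a matrix of model outputs and a vector of integer labels:
/// \code
/// CrossEntropy<unsigned int, RealVector> loss;
/// RealMatrix predictions(1, 3);                      // one sample, 3 class outputs ("logits")
/// predictions(0,0) = 1.5; predictions(0,1) = -0.3; predictions(0,2) = 0.2;
/// UIntVector labels(1); labels(0) = 0;               // true class index of the sample
/// RealMatrix gradients;
/// double error = loss.evalDerivative(labels, predictions, gradients);
/// // error          = ln(exp(1.5) + exp(-0.3) + exp(0.2)) - 1.5
/// // row(gradients,0) = softmax of the prediction row minus the one-hot vector of the label
/// \endcode
/// In a typical training setup the loss is not called directly but plugged into an
/// ErrorFunction together with a model whose outputs are linear.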
/// \ingroup lossfunctions
template<class LabelType, class OutputType>
class CrossEntropy;

template<class OutputType>
class CrossEntropy<unsigned int, OutputType> : public AbstractLoss<unsigned int,OutputType>
{
private:
	typedef AbstractLoss<unsigned int,OutputType> base_type;
	typedef typename base_type::ConstLabelReference ConstLabelReference;
	typedef typename base_type::ConstOutputReference ConstOutputReference;
	typedef typename base_type::BatchOutputType BatchOutputType;
	typedef typename base_type::MatrixType MatrixType;

	//uses a different formula to compute the binary case for 1 output.
	//should be numerically more stable
	//formula: ln(1+exp(-yx)) with y = -1/1
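	//e.g. for y*x = -750 the naive term exp(750) overflows to inf, while ln(1+exp(750))
	//equals 750 up to machine precision, so returning -y*x directly is both safe and accurate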
	double evalError(double label, double exponential, double value) const {
		if(value*label < -200){
			//below this, we might get numeric instabilities
			//but we know that ln(1+exp(x)) converges to x for big arguments
			return -value * label;
		}
		return std::log(1+exponential);
	}
public:
	CrossEntropy()
	{ this->m_features |= base_type::HAS_FIRST_DERIVATIVE;}

	/// \brief From INameable: return the class name.
	std::string name() const
	{ return "CrossEntropy"; }

	// annoyingness of C++ templates
	using base_type::eval;

	double eval(UIntVector const& target, BatchOutputType const& prediction) const {
		double error = 0;
		for(std::size_t i = 0; i != prediction.size1(); ++i){
			error += eval(target(i), row(prediction,i));
		}
		return error;
	}

	double eval(ConstLabelReference target, ConstOutputReference prediction) const{
		if ( prediction.size() == 1 )
		{
			RANGE_CHECK ( target < 2 );
			double label = 2.0 * target - 1; //converts labels from 0/1 to -1/1
			double exponential = std::exp( -label * prediction(0) );
			return evalError(label,exponential,prediction(0));
		}else{
			RANGE_CHECK ( target < prediction.size() );

			//calculate the log norm in a numerically stable way
			//we subtract the maximum prior to exponentiation to
			//ensure that the exponentiation result will still fit in double
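			//identity used: ln(sum_c exp(x_c)) = m + ln(sum_c exp(x_c - m)) for any m;
			//with m = max(x) every exponent is <= 0, e.g. for x = (1000, 1001) this
			//evaluates 1001 + ln(exp(-1) + exp(0)) instead of the overflowing exp(1000)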
			double maximum = max(prediction);
			double logNorm = sum(exp(prediction-maximum));
			logNorm = std::log(logNorm) + maximum;
			return logNorm - prediction(target);
		}
	}

	double evalDerivative(UIntVector const& target, BatchOutputType const& prediction, BatchOutputType& gradient) const {
		gradient.resize(prediction.size1(),prediction.size2());
		if ( prediction.size2() == 1 )
		{
			double error = 0;
			for(std::size_t i = 0; i != prediction.size1(); ++i){
				RANGE_CHECK ( target(i) < 2 );
				double label = 2 * static_cast<double>(target(i)) - 1; //converts labels from 0/1 to -1/1
				double exponential = std::exp( -label * prediction(i,0) );
				double sigmoid = 1.0/(1.0+exponential);
				gradient(i,0) = -label * (1.0 - sigmoid);
				error += evalError(label,exponential,prediction(i,0));
			}
			return error;
		}else{
			double error = 0;
			for(std::size_t i = 0; i != prediction.size1(); ++i){
				RANGE_CHECK ( target(i) < prediction.size2() );
				auto gradRow = row(gradient,i);

				//calculate the log norm in a numerically stable way
				//we subtract the maximum prior to exponentiation to
				//ensure that the exponentiation result will still fit in double
				//this does not change the result as the values get normalized by
				//their sum and thus the correction term cancels out.
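				//the derivative of ln(sum_c exp(x_c)) w.r.t. x_j is the softmax value exp(x_j)/sum_c exp(x_c),
				//so the per-sample gradient assembled below is softmax(prediction) minus the one-hot vector of the target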
				double maximum = max(row(prediction,i));
				noalias(gradRow) = exp(row(prediction,i) - maximum);
				double norm = sum(gradRow);
				gradRow /= norm;
				gradient(i,target(i)) -= 1;
				error += std::log(norm) - prediction(i,target(i)) + maximum;
			}
			return error;
		}
	}
	double evalDerivative(ConstLabelReference target, ConstOutputReference prediction, OutputType& gradient) const {
		gradient.resize(prediction.size());
		if ( prediction.size() == 1 ){
			RANGE_CHECK ( target < 2 );
			double label = 2.0 * target - 1; //converts labels from 0/1 to -1/1
			double exponential = std::exp( -label * prediction(0) );
			double sigmoid = 1.0/(1.0+exponential);
			gradient(0) = -label * (1.0 - sigmoid);
			return evalError(label,exponential,prediction(0));
		}else{
			RANGE_CHECK ( target < prediction.size() );

			//calculate the log norm in a numerically stable way
			//we subtract the maximum prior to exponentiation to
			//ensure that the exponentiation result will still fit in double
			//this does not change the result as the values get normalized by
			//their sum and thus the correction term cancels out.
			double maximum = max(prediction);
			noalias(gradient) = exp(prediction - maximum);
			double norm = sum(gradient);
			gradient /= norm;
			gradient(target) -= 1;
			return std::log(norm) - prediction(target) + maximum;
		}
	}

	double evalDerivative(
		ConstLabelReference target, ConstOutputReference prediction,
		BatchOutputType& gradient, MatrixType& hessian
	) const {
		gradient.resize(prediction.size());
		hessian.resize(prediction.size(),prediction.size());
		if ( prediction.size() == 1 )
		{
			RANGE_CHECK ( target < 2 );
			double label = 2 * static_cast<double>(target) - 1; //converts labels from 0/1 to -1/1
			double exponential = std::exp( -label * prediction(0) );
			double sigmoid = 1.0/(1.0+exponential);
			gradient(0) = -label * (1.0-sigmoid);
			hessian(0,0) = sigmoid * (1-sigmoid);
			return evalError(label,exponential,prediction(0));
		}
		else
		{
			RANGE_CHECK ( target < prediction.size() );
			//calculate the log norm in a numerically stable way
			//we subtract the maximum prior to exponentiation to
			//ensure that the exponentiation result will still fit in double
			//this does not change the result as the values get normalized by
			//their sum and thus the correction term cancels out.
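			//the gradient of the log-sum-exp term is again the softmax vector p,
			//and its Hessian is diag(p) - p p^T, which is assembled below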
			double maximum = max(prediction);
			noalias(gradient) = exp(prediction-maximum);
			double norm = sum(gradient);
			gradient /= norm;

			noalias(hessian) = -outer_prod(gradient,gradient);
			noalias(diag(hessian)) += gradient;
			gradient(target) -= 1;

			return std::log(norm) - prediction(target) + maximum;
		}
	}
};

template<class T, class Device>
class CrossEntropy<blas::vector<T, Device>, blas::vector<T, Device> >
: public AbstractLoss<blas::vector<T, Device>, blas::vector<T, Device> >
{
private:
	typedef blas::vector<T, Device> OutputType;
	typedef AbstractLoss<blas::vector<T, Device>, blas::vector<T, Device> > base_type;
	typedef typename base_type::ConstLabelReference ConstLabelReference;
	typedef typename base_type::ConstOutputReference ConstOutputReference;
	typedef typename base_type::BatchOutputType BatchOutputType;
	typedef typename base_type::MatrixType MatrixType;
public:
	CrossEntropy()
	{ this->m_features |= base_type::HAS_FIRST_DERIVATIVE;}

	/// \brief From INameable: return the class name.
	std::string name() const
	{ return "CrossEntropy"; }

	// annoyingness of C++ templates
	using base_type::eval;

	double eval(BatchOutputType const& target, BatchOutputType const& prediction) const {
		SIZE_CHECK(target.size1() == prediction.size1());
		SIZE_CHECK(target.size2() == prediction.size2());
		std::size_t m = target.size2();

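		//for targets t that are proper probability vectors (sum_c t_c = 1), the per-sample loss
		//-sum_c t_c ln softmax(x)_c simplifies to ln(sum_c exp(x_c)) - sum_c t_c x_c,
		//which is computed below with the usual max-subtraction for numerical stability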
		OutputType maximum = max(as_rows(prediction));
		auto safeExp = exp(prediction - trans(blas::repeat(maximum, m)));
		OutputType norm = sum(as_rows(safeExp));
		double error = sum(log(norm)) - sum(target * prediction) + sum(maximum);
		return error;
	}

	double evalDerivative(BatchOutputType const& target, BatchOutputType const& prediction, BatchOutputType& gradient) const {
		gradient.resize(prediction.size1(),prediction.size2());
		std::size_t m = target.size2();

		OutputType maximum = max(as_rows(prediction));
		noalias(gradient) = exp(prediction - trans(blas::repeat(maximum, m)));
		OutputType norm = sum(as_rows(gradient));
		noalias(gradient) /= trans(blas::repeat(norm, m));
		noalias(gradient) -= target;
		double error = sum(log(norm)) - sum(target * prediction) + sum(maximum);
		return error;
	}
};

}
#endif