SvmLogisticInterpretation.h
/*!
 *
 *
 * \brief Maximum-likelihood model selection for binary support vector machines.
 *
 *
 *
 * \author M. Tuma, T. Glasmachers
 * \date 2009-2012
 *
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <https://shark-ml.github.io/Shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
#ifndef SHARK_ML_SVMLOGISTICINTERPRETATION_H
#define SHARK_ML_SVMLOGISTICINTERPRETATION_H

#include <shark/Data/CVDatasetTools.h>
#include <shark/Models/Kernels/CSvmDerivative.h>
#include <shark/Algorithms/Trainers/CSvmTrainer.h>
#include <shark/ObjectiveFunctions/AbstractObjectiveFunction.h>
#include <shark/Algorithms/GradientDescent/BFGS.h>
#include <shark/ObjectiveFunctions/ErrorFunction.h>
#include <shark/ObjectiveFunctions/Loss/CrossEntropy.h>

namespace shark {

/// \brief Maximum-likelihood model selection score for binary support vector machines
///
/// \par
/// This class implements the maximum-likelihood based SVM model selection
/// procedure presented in the article "T. Glasmachers and C. Igel. Maximum
/// Likelihood Model Selection for 1-Norm Soft Margin SVMs with Multiple
/// Parameters. IEEE Transactions on Pattern Analysis and Machine Intelligence, 2010."
/// At this point, only binary C-SVMs are supported.
/// \par
/// This class implements an AbstractObjectiveFunction. In detail, it provides
/// a differentiable measure of how well a C-SVM with given hyperparameters fulfills
/// the maximum-likelihood score presented in the paper. This error measure can then
/// be optimized externally via gradient-based optimizers. In other words, this
/// class provides a score, not an optimization method or a training algorithm. The
/// C-SVM parameters have to be optimized with regard to this measure.
/// \ingroup kerneloptimization
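///
/// \par
/// A minimal usage sketch (illustrative only; the dataset, kernel choice, fold
/// count, optimizer and iteration budget are assumptions, not part of this header):
/// \code
/// ClassificationDataset data = ...; // some binary classification dataset
/// GaussianRbfKernel<> kernel(0.5); // kernel whose parameters are to be tuned
/// CVFolds<ClassificationDataset> folds = createCVSameSizeBalanced(data, 5);
/// SvmLogisticInterpretation<> mlms(folds, &kernel); // unconstrained mode by default
/// IRpropPlus optimizer;
/// RealVector start(mlms.numberOfVariables(), 0.0); // kernel parameters first, then log(C)
/// optimizer.init(mlms, start);
/// for (std::size_t t = 0; t != 50; ++t)
///     optimizer.step(mlms); // gradient-based descent on the score
/// // optimizer.solution().point now holds the selected hyperparameters
/// \endcode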
template<class InputType = RealVector>
class SvmLogisticInterpretation : public AbstractObjectiveFunction< RealVector, double > {
public:
    typedef CVFolds< LabeledData<InputType, unsigned int> > FoldsType;
    typedef AbstractKernelFunction<InputType> KernelType;
protected:
    FoldsType m_folds; ///< the underlying partitioned dataset.
    KernelType *mep_kernel; ///< the kernel with which to run the SVM
    std::size_t m_nhp; ///< for convenience, the Number of Hyper Parameters
    std::size_t m_nkp; ///< for convenience, the Number of Kernel Parameters
    std::size_t m_numFolds; ///< the number of folds to be used in cross-validation
    std::size_t m_numSamples; ///< overall number of samples in the dataset
    std::size_t m_inputDims; ///< input dimensionality
    bool m_svmCIsUnconstrained; ///< the SVM regularization parameter C is passed for unconstrained optimization, and the derivative should compensate for that
    QpStoppingCondition *mep_svmStoppingCondition; ///< the stopping criterion that is to be passed to the SVM trainer.
public:

    //! constructor.
    //! \param folds an already partitioned dataset (i.e., a CVFolds object)
    //! \param kernel pointer to the kernel to be used within the SVMs.
    //! \param unconstrained whether or not the C-parameter of the C-SVM is passed in unconstrained optimization mode.
    //! \param stop_cond the stopping conditions which are to be passed to the
    //!        SVM trainer.
    SvmLogisticInterpretation(
        FoldsType const &folds, KernelType *kernel,
        bool unconstrained = true, QpStoppingCondition *stop_cond = NULL
    )
    : mep_kernel(kernel)
    , m_nhp(kernel->parameterVector().size()+1)
    , m_nkp(kernel->parameterVector().size())
    , m_numFolds(folds.size()) //gets number of folds!
    , m_numSamples(folds.dataset().numberOfElements())
    , m_inputDims(inputDimension(folds.dataset()))
    , m_svmCIsUnconstrained(unconstrained)
    , mep_svmStoppingCondition(stop_cond)
    {
        SHARK_RUNTIME_CHECK(kernel != NULL, "[SvmLogisticInterpretation::SvmLogisticInterpretation] kernel is not allowed to be NULL"); //mtq: necessary despite indirect check via call in initialization list?
        SHARK_RUNTIME_CHECK(m_numFolds > 1, "[SvmLogisticInterpretation::SvmLogisticInterpretation] please provide a meaningful number of folds for cross validation");
        if (!m_svmCIsUnconstrained) //mtq: important: we additionally need to deal with kernel feasibility indicators! important!
            m_features |= IS_CONSTRAINED_FEATURE; //in constrained mode, C itself is a parameter and must stay positive
        m_features |= HAS_VALUE;
        m_features |= HAS_FIRST_DERIVATIVE;
        m_folds = folds;
    }

    /// \brief From INameable: return the class name.
    std::string name() const
    { return "SvmLogisticInterpretation"; }

    //! checks whether the search point provided is feasible
    //! \param input the point to test for feasibility
    bool isFeasible(const SearchPointType &input) const {
        SHARK_ASSERT(input.size() == m_nhp);
        if (input(m_nkp) <= 0.0 && !m_svmCIsUnconstrained) {
            return false; //in constrained mode, the regularization parameter C must be positive
        }
        return true;
    }
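    // Note on the encoding (an editorial summary of eval() below): in unconstrained mode
    // the last component of the search point is log(C), so every real value is feasible;
    // in constrained mode it is C itself and must stay positive.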

    std::size_t numberOfVariables()const{
        return m_nhp;
    }

    //! train a number of SVMs in a cross-validation setting using the hyperparameters passed to this method.
    //! the output scores from all validation sets are then concatenated. together with the true labels, these
    //! scores can then be used to fit a sigmoid such that it becomes as good a model as possible for the
    //! class membership probabilities given the SVM output scores. This method returns the negative log-likelihood
    //! of the best fitting sigmoid, given a set of SVM hyperparameters.
    //! \param parameters the SVM hyperparameters to use for all C-SVMs
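    //! \par
    //! In symbols (an editorial paraphrase, not a quote from the paper): with \f$ f_i \f$
    //! the stacked validation scores and \f$ y_i \in \{0,1\} \f$ the true labels, the
    //! returned score is
    //! \f[ \min_{a,b} \; -\sum_i \Big[ y_i \log \sigma(a f_i + b) + (1-y_i) \log\big(1 - \sigma(a f_i + b)\big) \Big], \qquad \sigma(t) = \frac{1}{1+e^{-t}}, \f]
    //! i.e., the cross entropy of the best-fitting sigmoid on the stacked scores.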
    double eval(SearchPointType const &parameters) const {
        SHARK_RUNTIME_CHECK(m_nhp == parameters.size(), "[SvmLogisticInterpretation::eval] wrong number of parameters");
        // initialize, copy parameters
        double C_reg = (m_svmCIsUnconstrained ? std::exp(parameters(m_nkp)) : parameters(m_nkp)); //set up regularization parameter
        mep_kernel->setParameterVector(subrange(parameters, 0, m_nkp)); //set up kernel parameters
        // Stores the stacked CV predictions for every fold.
        ClassificationDataset validation_dataset;
        // for each fold, train an SVM and get predictions on the validation data
        for (std::size_t i=0; i<m_numFolds; i++) {
            // init SVM
            KernelClassifier<InputType> svm; //the SVM
            CSvmTrainer<InputType, double> csvm_trainer(mep_kernel, C_reg, true, m_svmCIsUnconstrained); //the trainer
            csvm_trainer.sparsify() = false;
            if (mep_svmStoppingCondition != NULL) {
                csvm_trainer.stoppingCondition() = *mep_svmStoppingCondition;
            }

            // train SVM on current training fold
            csvm_trainer.train(svm, m_folds.training(i));

            //append validation predictions
            validation_dataset.append(transformInputs(m_folds.validation(i), svm.decisionFunction()));
        }

        // Fit a logistic regression to the predictions
        LinearModel<> logistic_model = fitLogistic(validation_dataset);

        //to evaluate, we use the cross entropy loss on the fitted model
        CrossEntropy<unsigned int, RealVector> logistic_loss;
        return logistic_loss(validation_dataset.labels(), logistic_model(validation_dataset.inputs()));
    }

    //! the derivative of the eval() function above w.r.t. the parameters.
    //! \param parameters the SVM hyperparameters to use for all C-SVMs
    //! \param derivative will store the computed derivative w.r.t. the current hyperparameters
    // mtq: should this also follow the first-call-eval()-then-call-evalDerivative() paradigm?
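    // In outline (an editorial summary of the code below): by the chain rule, holding the
    // fitted sigmoid parameters (a,b) fixed,
    //     d(loss)/d(theta) = (1/n) * sum_i dL/dz_i * a * df_i/d(theta),  with z_i = a*f_i + b,
    // where the df_i/d(theta) are the rows of all_validation_predict_derivs obtained from
    // CSvmDerivative, and a is logistic_model.parameterVector()(0).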
    double evalDerivative(SearchPointType const &parameters, FirstOrderDerivative &derivative) const {
        SHARK_RUNTIME_CHECK(m_nhp == parameters.size(), "[SvmLogisticInterpretation::evalDerivative] wrong number of parameters");
        // initialize, copy parameters
        double C_reg = (m_svmCIsUnconstrained ? std::exp(parameters(m_nkp)) : parameters(m_nkp)); //set up regularization parameter
        mep_kernel->setParameterVector(subrange(parameters, 0, m_nkp)); //set up kernel parameters
        // these two will be filled in order corresponding to all CV validation partitions stacked
        // behind one another, and then used to create datasets with
        std::vector< unsigned int > tmp_helper_labels(m_numSamples);
        std::vector< RealVector > tmp_helper_preds(m_numSamples);

        unsigned int next_label = 0; //helper index counter to monitor the next position to be filled in the above vectors
        // init variables especially for derivative
        RealMatrix all_validation_predict_derivs(m_numSamples, m_nhp); //will hold derivatives of all output scores w.r.t. all hyperparameters
        RealVector der; //temporary helper for derivative calls

        // for each fold, train an SVM and get predictions on the validation data
        for (std::size_t i=0; i<m_numFolds; i++) {
            // get current train/validation partitions as well as corresponding labels
            ClassificationDataset cur_train_data = m_folds.training(i);
            ClassificationDataset cur_valid_data = m_folds.validation(i);
            std::size_t cur_vsize = cur_valid_data.numberOfElements();
            Data< unsigned int > cur_vlabels = cur_valid_data.labels(); //validation labels of this fold
            Data< RealVector > cur_vinputs = cur_valid_data.inputs(); //validation inputs of this fold
            Data< RealVector > cur_vscores; //will hold SVM output scores for current validation partition
            // init SVM
            KernelClassifier<InputType> svm; //the SVM
            CSvmTrainer<InputType, double> csvm_trainer(mep_kernel, C_reg, true, m_svmCIsUnconstrained); //the trainer
            csvm_trainer.sparsify() = false;
            csvm_trainer.setComputeBinaryDerivative(true);
            if (mep_svmStoppingCondition != NULL) {
                csvm_trainer.stoppingCondition() = *mep_svmStoppingCondition;
            }
            // train SVM on current fold
            csvm_trainer.train(svm, cur_train_data);
            CSvmDerivative<InputType> svm_deriv(&svm, &csvm_trainer);
            cur_vscores = svm.decisionFunction()(cur_valid_data.inputs()); //will result in a dataset of RealVector as output
            // copy the scores and corresponding labels to the dataset-wide storage
            for (std::size_t j=0; j<cur_vsize; j++) {
                // copy label and prediction score
                tmp_helper_labels[next_label] = cur_vlabels.element(j);
                tmp_helper_preds[next_label] = cur_vscores.element(j);
                // get and store the derivative of the score w.r.t. the hyperparameters
                svm_deriv.modelCSvmParameterDerivative(cur_vinputs.element(j), der);
                noalias(row(all_validation_predict_derivs, next_label)) = der; //fast assignment of the derivative to the correct matrix row
                ++next_label;
            }
        }

        // now we have it all: the predictions across the validation folds, plus the corresponding
        // true labels. so we go ahead and fit a logistic regression
        ClassificationDataset validation_dataset = createLabeledDataFromRange(tmp_helper_preds, tmp_helper_labels);
        LinearModel<> logistic_model = fitLogistic(validation_dataset);

        // to evaluate, we use the cross entropy loss on the fitted model and compute
        // the derivative w.r.t. the SVM hyperparameters.
        derivative.resize(m_nhp);
        derivative.clear();
        CrossEntropy<unsigned int, RealVector> logistic_loss;
        double error = 0;
        std::size_t start = 0;
        for(auto const& batch: validation_dataset.batches()){
            std::size_t end = start + batch.size();
            RealMatrix lossGradient;
            error += logistic_loss.evalDerivative(batch.label, logistic_model(batch.input), lossGradient);
            noalias(derivative) += column(lossGradient,0) % rows(all_validation_predict_derivs, start, end);
            start = end;
        }
        derivative *= logistic_model.parameterVector()(0); //chain rule: multiply by the slope of the fitted sigmoid
        derivative /= m_numSamples;
        return error / m_numSamples;
    }
private:
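    /// \brief Fits a one-dimensional logistic regression (a sigmoid with slope and bias)
    /// to the stacked SVM validation scores by minimizing the cross entropy with BFGS;
    /// used by eval() and evalDerivative() above.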
    LinearModel<> fitLogistic(ClassificationDataset const& data)const{
        LinearModel<> logistic_model;
        logistic_model.setStructure(1, 1, true); //1 input, 1 output, and a bias: 2 parameters in total
        CrossEntropy<unsigned int, RealVector> logistic_loss;
        ErrorFunction<> error(data, &logistic_model, &logistic_loss);
        BFGS<> optimizer;
        optimizer.init(error);
        //this converges after very few iterations (typically 20 function evaluations)
        while(norm_2(optimizer.derivative()) > 1.e-8){
            double lastValue = optimizer.solution().value;
            optimizer.step(error);
            if(lastValue == optimizer.solution().value) break; //we are done due to numerical precision
        }
        logistic_model.setParameterVector(optimizer.solution().point);
        return logistic_model;
    }
};


}
#endif