ExportKernelMatrix.h
//===========================================================================
/*!
 *
 *
 * \brief       export precomputed kernel matrices (using libsvm format)
 *
 *
 *
 * \author      M. Tuma
 * \date        2012
 *
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <https://shark-ml.github.io/Shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================

#ifndef SHARK_DATA_PRECOMPUTEDMATRIX_H
#define SHARK_DATA_PRECOMPUTEDMATRIX_H


#include <fstream>
#include <shark/Data/Dataset.h>
#include <shark/Data/DataView.h>
#include <shark/Models/Kernels/ScaledKernel.h>
#include <shark/Algorithms/Trainers/NormalizeKernelUnitVariance.h>

namespace shark
{

/**
 * \ingroup shark_globals
 *
 * @{
 */

/// Normalization mode applied to the exported kernel Gram matrix.
enum KernelMatrixNormalizationType
{
	NONE,                                // no normalization; output the plain Gram matrix
	MULTIPLICATIVE_TRACE_ONE,            // determine the trace, and divide each entry by it
	MULTIPLICATIVE_TRACE_N,              // determine the trace, divide each entry by it, then multiply by the number of samples
	MULTIPLICATIVE_VARIANCE_ONE,         // normalize to unit variance in feature space; see Kloft et al., JMLR 2012
	CENTER_ONLY,                         // center the kernel in feature space; see Cortes et al., JMLR 2012 and ICML 2010
	CENTER_AND_MULTIPLICATIVE_TRACE_ONE  // first center the kernel in feature space, then divide each entry by the centered kernel's trace
};
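
// For the two centering modes, the Gram matrix K is centered in feature space via
//
//   K_c(i,j) = K(i,j) - rowmean(i) - rowmean(j) + mean,
//
// where rowmean(i) is the mean of row i of K and mean is the mean over all of its
// entries; this is exactly the computation carried out in exportKernelMatrix below.
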
/// \brief Write a kernel Gram matrix to stream.
///
/// \param dataset     data basis for the Gram matrix
/// \param kernel      kernel function to be used (passed by non-const reference)
/// \param out         the stream to be written to
/// \param normalizer  what kind of normalization to apply; see the enum declaration for details
/// \param scientific  should the output be in scientific notation?
/// \param fieldwidth  field width for pretty-printing
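///
/// \par Example
/// A minimal usage sketch; the kernel choice and the dataset here are illustrative
/// assumptions, not part of this header (the caller must include the kernel's header):
/// \code
/// GaussianRbfKernel<RealVector> kernel(0.5); // hypothetical kernel with bandwidth parameter 0.5
/// ClassificationDataset data;                // assumed to be loaded elsewhere
/// exportKernelMatrix(data, kernel, std::cout, MULTIPLICATIVE_TRACE_ONE);
/// \endcode
/// Each output row follows the libsvm precomputed-kernel convention: the label
/// first, then <tt>0:i</tt> with the 1-based row index, then one
/// <tt>j:K(i,j)</tt> entry per column.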
template<typename InputType, typename LabelType>
void exportKernelMatrix(
	LabeledData<InputType, LabelType> const &dataset, // data basis for the Gram matrix
	AbstractKernelFunction<InputType> &kernel, // kernel function (can't be const because a ScaledKernel may wrap it below)
	std::ostream &out, // the stream to be written to
	KernelMatrixNormalizationType normalizer = NONE, // what kind of normalization to apply; see enum declaration for details
	bool scientific = false, // scientific notation?
	unsigned int fieldwidth = 0 // for pretty-printing
)
{
	//get access to the range of elements
	DataView<LabeledData<InputType, LabelType> const> points(dataset);
	std::size_t size = points.size();

	SIZE_CHECK(size != 0);
	// check outstream status
	if(!out)
	{
		throw(std::invalid_argument("[exportKernelMatrix] Can't write to stream."));
	}

	// COMPUTE MODIFIERS

	// if multiplicative trace normalization: determine trace
	double trace = 0.0;
	double trace_factor = 1.0;
	if(normalizer == MULTIPLICATIVE_TRACE_ONE || normalizer == MULTIPLICATIVE_TRACE_N)
	{
		for(auto point: points)
		{
			trace += kernel.eval(point.input, point.input);
		}
		SHARK_ASSERT(trace > 0);
		trace_factor = 1.0 / trace;
		if(normalizer == MULTIPLICATIVE_TRACE_N)
		{
			trace_factor *= size;
		}
	}
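
	// Note on the trace modes: scaling K by 1/trace(K) yields unit trace, and
	// MULTIPLICATIVE_TRACE_N multiplies by n on top of that, so the diagonal
	// entries average to one (as they would for unit-norm features).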

	// if multiplicative variance normalization: determine factor
	double variance_factor = 0.0;
	if(normalizer == MULTIPLICATIVE_VARIANCE_ONE)
	{
		ScaledKernel<InputType> scaled(&kernel);
		NormalizeKernelUnitVariance<InputType> unitVarianceTrainer; // trainer that computes the scaling factor
		unitVarianceTrainer.train(scaled, dataset.inputs());
		variance_factor = scaled.factor();
	}

	// if centering: determine matrix- and row-wise means
	double mean = 0;
	RealVector rowmeans(size, 0.0);
	if(normalizer == CENTER_ONLY || normalizer == CENTER_AND_MULTIPLICATIVE_TRACE_ONE)
	{
		// initialization: calculate mean and rowmeans
		for(std::size_t i = 0; i < size; i++)
		{
			double k = kernel.eval(points[i].input, points[i].input);
			mean += k; //add diagonal value to mean once
			rowmeans(i) += k; //add diagonal to its rowmean
			for(std::size_t j = 0; j < i; j++)
			{
				double k = kernel.eval(points[i].input, points[j].input);
				mean += 2.0 * k; //add off-diagonals to mean twice
				rowmeans(i) += k; //add to mean of row
				rowmeans(j) += k; //add to mean of transposed row
			}
		}
		mean = mean / (double) size / (double) size;
		rowmeans /= size;
		// get the trace of the centered matrix if necessary
		if(normalizer == CENTER_AND_MULTIPLICATIVE_TRACE_ONE)
		{
			trace = 0.0;
			for(std::size_t i = 0; i < size; i++)
			{
				trace += kernel.eval(points[i].input, points[i].input) - 2 * rowmeans(i) + mean;
			}
			SHARK_ASSERT(trace > 0);
			trace_factor = 1.0 / trace;
		}
	}
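
	// Note: the Gram matrix is never stored. The centering pass above and the
	// export loop below both evaluate kernel entries on the fly, trading repeated
	// O(n^2) kernel evaluations for O(n) memory.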

	// FIX OUTPUT FORMAT

	// set output format
	if(scientific)
		out.setf(std::ios_base::scientific);
	std::streamsize ss = out.precision();
	out.precision(10);

	// determine dataset type
	double max_label = -1e100;
	double min_label = -max_label;
	bool binary = false;
	bool regression = false;
	for(double cur_label: dataset.labels().elements())
	{
		if(cur_label > max_label)
			max_label = cur_label;
		if(cur_label < min_label)
			min_label = cur_label;
		if((cur_label != (int)cur_label) || cur_label < 0)
			regression = true;
	}
	if(!regression && (min_label == 0) && (max_label == 1))
		binary = true;

	// WRITE OUT

	// write to file:
	// loop through examples (rows)
	for(std::size_t i = 0; i < size; i++)
	{

		// write label: raw value for regression, {0,1} mapped to {-1,+1} for
		// binary classification, and 1-based indices for multi-class labels
		if(regression)
		{
			out << std::setw(fieldwidth) << std::left << points[i].label << " ";
		}
		else if(binary)
		{
			out << std::setw(fieldwidth) << std::left << (int)(points[i].label * 2 - 1) << " ";
		}
		else
		{
			out << std::setw(fieldwidth) << std::left << (unsigned int)(points[i].label + 1) << " ";
		}

		out << "0:" << std::setw(fieldwidth) << std::left << i + 1; //write 1-based row index

		// loop through examples (columns)
		// CASE DISTINCTION:
		if(normalizer == NONE)
		{
			for(std::size_t j = 0; j < size; j++)
			{
				out << " " << j + 1 << ":" << std::setw(fieldwidth) << std::left << kernel.eval(points[i].input, points[j].input);
			}
			out << "\n";
		}
		else if(normalizer == MULTIPLICATIVE_TRACE_ONE || normalizer == MULTIPLICATIVE_TRACE_N)
		{
			for(std::size_t j = 0; j < size; j++)
			{
				out << " " << j + 1 << ":" << std::setw(fieldwidth) << std::left << trace_factor * kernel.eval(points[i].input, points[j].input);
			}
			out << "\n";
		}
		else if(normalizer == MULTIPLICATIVE_VARIANCE_ONE)
		{
			for(std::size_t j = 0; j < size; j++)
			{
				out << " " << j + 1 << ":" << std::setw(fieldwidth) << std::left << variance_factor * kernel.eval(points[i].input, points[j].input);
			}
			out << "\n";
		}
		else if(normalizer == CENTER_ONLY)
		{
			for(std::size_t j = 0; j < size; j++)
			{
				double tmp = kernel.eval(points[i].input, points[j].input) - rowmeans(i) - rowmeans(j) + mean;
				out << " " << j + 1 << ":" << std::setw(fieldwidth) << std::left << tmp;
			}
			out << "\n";
		}
		else if(normalizer == CENTER_AND_MULTIPLICATIVE_TRACE_ONE)
		{
			for(std::size_t j = 0; j < size; j++)
			{
				double tmp = kernel.eval(points[i].input, points[j].input) - rowmeans(i) - rowmeans(j) + mean;
				out << " " << j + 1 << ":" << std::setw(fieldwidth) << std::left << trace_factor * tmp;
			}
			out << "\n";
		}
		else
		{
			throw SHARKEXCEPTION("[exportKernelMatrix] Unknown normalization type.");
		}

	}

	// clean up: restore the stream's previous precision
	out.precision(ss);
}


/// \brief Write a kernel Gram matrix to file.
///
/// \param dataset     data basis for the Gram matrix
/// \param kernel      kernel function to be used (passed by non-const reference)
/// \param fn          the name of the file to be written to
/// \param normalizer  what kind of normalization to apply; see the enum declaration for details
/// \param sci         should the output be in scientific notation?
/// \param width       field width for pretty-printing
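///
/// \par Example
/// A minimal sketch, reusing the illustrative kernel and dataset from the
/// example above:
/// \code
/// exportKernelMatrix(data, kernel, "gram_matrix.libsvm", CENTER_ONLY);
/// \endcode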
template<typename InputType, typename LabelType>
void exportKernelMatrix(
	LabeledData<InputType, LabelType> const &dataset,
	AbstractKernelFunction<InputType> &kernel,
	std::string fn,
	KernelMatrixNormalizationType normalizer = NONE,
	bool sci = false,
	unsigned int width = 0
)
{
	std::ofstream ofs(fn.c_str());
	if(ofs)
	{
		exportKernelMatrix(dataset, kernel, ofs, normalizer, sci, width);
	}
	else
	{
		throw(std::invalid_argument("[exportKernelMatrix] Stream cannot be opened for writing."));
	}
}


// deprecated wrapper for the stream-based variant
template<typename InputType, typename LabelType>
void export_kernel_matrix(
	LabeledData<InputType, LabelType> const &dataset,
	AbstractKernelFunction<InputType> &kernel,
	std::ostream &out,
	KernelMatrixNormalizationType normalizer = NONE,
	bool scientific = false,
	unsigned int fieldwidth = 0
)
{
	exportKernelMatrix(dataset, kernel, out, normalizer, scientific, fieldwidth);
}


// deprecated wrapper for the file-based variant
template<typename InputType, typename LabelType>
void export_kernel_matrix(
	LabeledData<InputType, LabelType> const &dataset,
	AbstractKernelFunction<InputType> &kernel,
	std::string fn,
	KernelMatrixNormalizationType normalizer = NONE,
	bool sci = false,
	unsigned int width = 0
)
{
	exportKernelMatrix(dataset, kernel, fn, normalizer, sci, width);
}


// TODO: import functionality is still missing.
// when that is done, add a tutorial


/** @}*/

} // namespace shark


#endif // SHARK_DATA_PRECOMPUTEDMATRIX_H