SparseData.h
Go to the documentation of this file.
1//===========================================================================
2/*!
3 *
4 *
5 * \brief Support for importing and exporting data from and to sparse data (libSVM) formatted data files
6 *
7 *
8 * \par
9 * The most important application of the methods provided in this
10 * file is the import of data from LIBSVM files to Shark Data containers.
11 *
12 *
13 *
14 *
15 * \author M. Tuma, T. Glasmachers, C. Igel
16 * \date 2010-2016
17 *
18 *
19 * \par Copyright 1995-2017 Shark Development Team
20 *
21 * <BR><HR>
22 * This file is part of Shark.
23 * <https://shark-ml.github.io/Shark/>
24 *
25 * Shark is free software: you can redistribute it and/or modify
26 * it under the terms of the GNU Lesser General Public License as published
27 * by the Free Software Foundation, either version 3 of the License, or
28 * (at your option) any later version.
29 *
30 * Shark is distributed in the hope that it will be useful,
31 * but WITHOUT ANY WARRANTY; without even the implied warranty of
32 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
33 * GNU Lesser General Public License for more details.
34 *
35 * You should have received a copy of the GNU Lesser General Public License
36 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
37 *
38 */
39//===========================================================================
40
41#ifndef SHARK_DATA_SPARSEDATA_H
42#define SHARK_DATA_SPARSEDATA_H
43
#include <shark/Data/Dataset.h>

#include <algorithm>
#include <cstddef>
#include <fstream>
#include <string>
#include <utility>
#include <vector>
48
49namespace shark {
50
51/**
52 * \ingroup shark_globals
53 *
54 * @{
55 */
56
57
58
59/// \brief Import classification data from a sparse data (libSVM) file.
60///
61/// \param dataset container storing the loaded data
62/// \param stream stream to be read from
63/// \param highestIndex highest feature index, or 0 for auto-detection
64/// \param batchSize size of batch
67 std::istream& stream,
68 unsigned int highestIndex = 0,
70);
71
74 std::istream& stream,
75 unsigned int highestIndex = 0,
77);
78
79/// \brief Import regression data from a sparse data (libSVM) file.
80///
81/// \param dataset container storing the loaded data
82/// \param stream stream to be read from
83/// \param highestIndex highest feature index, or 0 for auto-detection
84/// \param batchSize size of batch
87 std::istream& stream,
88 unsigned int highestIndex = 0,
90);
91
94 std::istream& stream,
95 unsigned int highestIndex = 0,
97);
98
99/// \brief Import classification data from a sparse data (libSVM) file.
100///
101/// \param dataset container storing the loaded data
102/// \param stream stream to be read from
103/// \param highestIndex highest feature index, or 0 for auto-detection
104/// \param batchSize size of batch
107 std::istream& stream,
108 unsigned int highestIndex = 0,
110);
113 std::istream& stream,
114 unsigned int highestIndex = 0,
116);
117
118/// \brief Import regression data from a sparse data (libSVM) file.
119///
120/// \param dataset container storing the loaded data
121/// \param stream stream to be read from
122/// \param highestIndex highest feature index, or 0 for auto-detection
123/// \param batchSize size of batch
126 std::istream& stream,
127 unsigned int highestIndex = 0,
129);
132 std::istream& stream,
133 unsigned int highestIndex = 0,
135);
136
137
138/// \brief Import classification data from a sparse data (libSVM) file.
139///
140/// \param dataset container storing the loaded data
141/// \param fn the file to be read from
142/// \param highestIndex highest feature index, or 0 for auto-detection
143/// \param batchSize size of batch
146 std::string fn,
147 unsigned int highestIndex = 0,
149);
152 std::string fn,
153 unsigned int highestIndex = 0,
155);
156
157/// \brief Import regression data from a sparse data (libSVM) file.
158///
159/// \param dataset container storing the loaded data
160/// \param fn the file to be read from
161/// \param highestIndex highest feature index, or 0 for auto-detection
162/// \param batchSize size of batch
165 std::string fn,
166 unsigned int highestIndex = 0,
168);
171 std::string fn,
172 unsigned int highestIndex = 0,
174);
175
176/// \brief Import classification data from a sparse data (libSVM) file.
177///
178/// \param dataset container storing the loaded data
179/// \param fn the file to be read from
180/// \param highestIndex highest feature index, or 0 for auto-detection
181/// \param batchSize size of batch
184 std::string fn,
185 unsigned int highestIndex = 0,
187);
190 std::string fn,
191 unsigned int highestIndex = 0,
193);
194
195/// \brief Import regression data from a sparse data (libSVM) file.
196///
197/// \param dataset container storing the loaded data
198/// \param fn the file to be read from
199/// \param highestIndex highest feature index, or 0 for auto-detection
200/// \param batchSize size of batch
203 std::string fn,
204 unsigned int highestIndex = 0,
206);
209 std::string fn,
210 unsigned int highestIndex = 0,
212);
213
214
215/// \brief Export classification data to sparse data (libSVM) format.
216///
217/// \param dataset Container storing the data
218/// \param stream Output stream
219/// \param oneMinusOne Flag for applying the transformation y<-2y-1 to binary labels
220/// \param sortLabels Flag for sorting data points according to labels
221template<typename InputType>
222void exportSparseData(LabeledData<InputType, unsigned int> const& dataset, std::ostream& stream, bool oneMinusOne = true, bool sortLabels = false)
223{
224 if (numberOfClasses(dataset) != 2) oneMinusOne = false;
225
226 std::vector< KeyValuePair<unsigned int, std::pair<std::size_t, std::size_t> > > order;
227 for (std::size_t b=0; b<dataset.numberOfBatches(); b++)
228 {
229 auto batch = dataset.batch(b);
230 for (std::size_t i=0; i<batchSize(batch); i++)
231 {
232 order.emplace_back(getBatchElement(batch, i).label, std::make_pair(b, i));
233 }
234 }
235 if (sortLabels)
236 {
237 std::sort(order.begin(), order.end());
238 }
239
240 for (auto const& p : order)
241 {
242 auto element = getBatchElement(dataset.batch(p.value.first), p.value.second);
243 // apply transformation to label and write it to file
244 if (oneMinusOne) stream << 2*int(element.label)-1 << " ";
245 //libsvm file format documentation is scarce, but by convention the first class seems to be 1..
246 else stream << element.label+1 << " ";
247 // write input data to file
248 for (auto it = element.input.begin(); it != element.input.end(); ++it)
249 {
250 stream << " " << it.index()+1 << ":" << *it;
251 }
252 stream << std::endl;
253 }
254}
255
256/// \brief Export classification data to sparse data (libSVM) format.
257///
258/// \param dataset Container storing the data
259/// \param fn Output file name
260/// \param oneMinusOne Flag for applying the transformation y<-2y-1 to binary labels
261/// \param sortLabels Flag for sorting data points according to labels
262/// \param append Flag for appending to the output file instead of overwriting it
263template<typename InputType>
264void exportSparseData(LabeledData<InputType, unsigned int> const& dataset, const std::string &fn, bool oneMinusOne = true, bool sortLabels = false, bool append = false)
265{
266 std::ofstream ofs;
267
268 // shall we append only or overwrite?
269 if (append == true) {
270 ofs.open (fn.c_str(), std::fstream::out | std::fstream::app );
271 } else {
272 ofs.open (fn.c_str());
273 }
274 SHARK_RUNTIME_CHECK(ofs, "File can not be opened for writing");
275
276 exportSparseData(dataset, ofs, oneMinusOne, sortLabels);
277}
278
279/// \brief Export regression data to sparse data (libSVM) format.
280///
281/// \param dataset Container storing the data
282/// \param stream Output stream
283template<typename InputType>
284void exportSparseData(LabeledData<InputType, RealVector> const& dataset, std::ostream& stream)
285{
286 for (std::size_t b=0; b<dataset.numberOfBatches(); b++)
287 {
288 auto batch = dataset.batch(b);
289 for (std::size_t i=0; i<batchSize(batch); i++)
290 {
291 auto element = getBatchElement(batch, i);
292 SHARK_ASSERT(element.label.size() == 1);
293 stream << element.label(0);
294 for (auto it = element.input.begin(); it != element.input.end(); ++it)
295 {
296 stream << " " << it.index()+1 << ":" << *it;
297 }
298 stream << std::endl;
299 }
300 }
301}
302
303/// \brief Export regression data to sparse data (libSVM) format.
304///
305/// \param dataset Container storing the data
306/// \param fn Output file
307/// \param append Flag for appending to the output file instead of overwriting it
308template<typename InputType>
309void exportSparseData(LabeledData<InputType, RealVector> const& dataset, const std::string &fn, bool append = false)
310{
311 std::ofstream ofs;
312
313 // shall we append only or overwrite?
314 if (append == true) {
315 ofs.open (fn.c_str(), std::fstream::out | std::fstream::app );
316 } else {
317 ofs.open (fn.c_str());
318 }
319
320 SHARK_RUNTIME_CHECK(ofs, "File can not be opened for writing");
321
322 exportSparseData(dataset, ofs);
323}
324
325/** @}*/
326
327}
328#endif