Csv.h
Go to the documentation of this file.
1//===========================================================================
2/*!
3 *
4 *
5 * \brief Support for importing and exporting data from and to character separated value (CSV) files
6 *
7 *
8 * \par
9 * The most important application of the methods provided in this
10 * file is the import of data from CSV files into Shark data
11 * containers.
12 *
13 *
14 *
15 *
16 * \author T. Voss, M. Tuma
17 * \date 2010
18 *
19 *
20 * \par Copyright 1995-2017 Shark Development Team
21 *
22 * <BR><HR>
23 * This file is part of Shark.
24 * <https://shark-ml.github.io/Shark/>
25 *
26 * Shark is free software: you can redistribute it and/or modify
27 * it under the terms of the GNU Lesser General Public License as published
28 * by the Free Software Foundation, either version 3 of the License, or
29 * (at your option) any later version.
30 *
31 * Shark is distributed in the hope that it will be useful,
32 * but WITHOUT ANY WARRANTY; without even the implied warranty of
33 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
34 * GNU Lesser General Public License for more details.
35 *
36 * You should have received a copy of the GNU Lesser General Public License
37 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
38 *
39 */
40//===========================================================================
41
42#ifndef SHARK_DATA_CSV_H
43#define SHARK_DATA_CSV_H
44
46#include <shark/Data/Dataset.h>
47
48#include <fstream>
49#include <string>
50
51namespace shark {
52
53/**
54 * \ingroup shark_globals
55 *
56 * @{
57 */
58
59
60/// \brief Position of the label in a CSV file
61///
62/// \par
63/// This type describes the position of the label in a record of a CSV file.
64/// The label can be positioned either in the first or the last column, or
65/// there can be no label present at all.
70
71namespace detail {
72
/// \brief Writes unlabeled records to a stream as character-separated values.
///
/// Every element of \a data becomes one output line whose entries are
/// separated by \a separator. Values are written with a precision of 10.
/// The format flags and precision of \a out are put back to their previous
/// values when the function is left, including an early exit (e.g. should
/// SHARK_RUNTIME_CHECK raise an error).
///
/// \param data        Container that holds the samples
/// \param out         The stream to be written to
/// \param separator   The separator between elements
/// \param scientific  Write floating-point values in scientific notation?
/// \param fieldwidth  Minimum field width (std::setw) applied to each value
template<typename T, typename Stream>
void exportCSV(const T &data,
	Stream &out,
	char separator,
	bool scientific = true,
	unsigned int fieldwidth = 0
) {
	SHARK_RUNTIME_CHECK(out, "Stream cannot be opened for writing.");

	// Restores the caller's formatting state on every exit path.
	struct FormatGuard {
		explicit FormatGuard(Stream &s)
		: stream(s), flags(s.flags()), precision(s.precision()) {}
		~FormatGuard() {
			stream.flags(flags);
			stream.precision(precision);
		}
		Stream &stream;
		std::ios_base::fmtflags flags;
		std::streamsize precision;
	};
	FormatGuard guard(out);

	// Select scientific notation through the floatfield mask: OR-ing the bit
	// into a stream that already has `fixed` set would select hexfloat.
	if (scientific)
		out.setf(std::ios_base::scientific, std::ios_base::floatfield);
	out.precision(10);

	int const width = static_cast<int>(fieldwidth);
	for (typename T::const_iterator it = data.begin(); it != data.end(); ++it) {
		SHARK_RUNTIME_CHECK(it->begin() != it->end(), "Record must not be empty");
		std::size_t const size = it->size();
		for (std::size_t i = 0; i != size; ++i) {
			if (i != 0)
				out << separator;
			out << std::setw(width) << (*it)(i);
		}
		// '\n' instead of std::endl: no flush per record
		out << '\n';
	}

	// one flush for the whole export
	out << std::flush;
}
102
103 // export function for labeled data
104
105template<typename T, typename U, typename Stream>
106void exportCSV_labeled(const T &input, // Container that holds the samples
107 const U &labels, // Container that holds the labels
108 Stream &out, // The file to be read from
109 LabelPosition lp, // The position of the label
110 char separator, // The separator between elements
111 bool scientific = true, //scientific notation?
112 unsigned int fieldwidth = 0, //column-align using this field width
113 typename boost::enable_if<
114 std::is_arithmetic<typename boost::range_value<U>::type>
115 >::type* dummy = 0//enable this only for arithmetic types
116) {
117 SHARK_RUNTIME_CHECK(out, "Stream cannot be opened for writing.");
118
119 if (scientific)
120 out.setf(std::ios_base::scientific);
121 std::streamsize ss = out.precision();
122 out.precision(10);
123
124 typename T::const_iterator iti = input.begin();
125 typename U::const_iterator itl = labels.begin();
126
127
128 for (; iti != input.end(); ++iti, ++itl) {
129 SHARK_RUNTIME_CHECK(iti->begin() != iti->end(), "Record must not be empty");
130 if (lp == FIRST_COLUMN)
131 out << *itl << separator;
132 for (std::size_t i=0; i<(*iti).size()-1; i++) {
133 out << std::setw(fieldwidth) << (*iti)(i) << separator;
134 }
135 if (lp == FIRST_COLUMN) {
136 out << std::setw(fieldwidth) << (*iti)((*iti).size()-1) << std::endl;
137 } else {
138 out << std::setw(fieldwidth) << (*iti)((*iti).size()-1) << separator << *itl << std::endl;
139 }
140 }
141 out.precision(ss);
142}
143
144// export function for data with vector labels
145template<typename T, typename U, typename Stream>
146void exportCSV_labeled(
147 const T &input, // Container that holds the samples
148 const U &labels, // Container that holds the labels
149 Stream &out, // The file to be read from
150 LabelPosition lp, // The position of the label
151 char separator, // The separator between elements
152 bool scientific = true, //scientific notation?
153 unsigned int fieldwidth = 0, //column-align using this field width
154 typename boost::disable_if<
155 std::is_arithmetic<typename boost::range_value<U>::type>
156 >::type* dummy = 0//enable this only for complex types
157) {
158 SHARK_RUNTIME_CHECK(out, "Stream cannot be opened for writing.");
159
160
161 if (scientific)
162 out.setf(std::ios_base::scientific);
163 std::streamsize ss = out.precision();
164 out.precision(10);
165
166 typename T::const_iterator iti = input.begin();
167 typename U::const_iterator itl = labels.begin();
168
169 for (; iti != input.end(); ++iti, ++itl) {
170 SHARK_RUNTIME_CHECK(iti->begin() != iti->end(), "[exportCSV (2)] record must not be empty");
171 if (lp == FIRST_COLUMN) {
172 for (std::size_t j = 0; j < itl->size(); j++)
173 out << std::setw(fieldwidth) << (*itl)(j) << separator;
174 }
175 for (std::size_t i=0; i<(*iti).size()-1; i++) {
176 out << std::setw(fieldwidth) << (*iti)(i) << separator;
177 }
178 if (lp == FIRST_COLUMN) {
179 out << std::setw(fieldwidth) << (*iti)((*iti).size()-1) << std::endl;
180 } else {
181 out << std::setw(fieldwidth) << (*iti)((*iti).size()-1);
182 for (std::size_t j = 0; j < itl->size(); j++)
183 out << std::setw(fieldwidth) << separator << (*itl)(j);
184 out << std::endl;
185 }
186 }
187 out.precision(ss);
188}
189} // namespace detail
190
191
192
193// ACTUAL READ IN ROUTINES BELOW
194
195/// \brief Import unlabeled vectors from a read-in character-separated value file.
196///
197/// \param data Container storing the loaded data
198/// \param contents The read in csv-file
199/// \param separator Optional separator between entries, typically a comma, spaces are automatically ignored
200/// \param comment Trailing character indicating comment line. By default it is '#'
201/// \param maximumBatchSize Size of batches in the dataset
203 Data<FloatVector> &data,
204 std::string const& contents,
205 char separator = ',',
206 char comment = '#',
207 std::size_t maximumBatchSize = Data<RealVector>::DefaultBatchSize
208);
209
210/// \brief Import unlabeled vectors from a read-in character-separated value file.
211///
212/// \param data Container storing the loaded data
213/// \param contents The read in csv-file
214/// \param separator Optional separator between entries, typically a comma, spaces are automatically ignored
215/// \param comment Trailing character indicating comment line. By default it is '#'
216/// \param maximumBatchSize Size of batches in the dataset
218 Data<RealVector> &data,
219 std::string const& contents,
220 char separator = ',',
221 char comment = '#',
222 std::size_t maximumBatchSize = Data<RealVector>::DefaultBatchSize
223);
224
225/// \brief Import "csv" from string consisting only of a single unsigned int per row
226///
227/// \param data Container storing the loaded data
228/// \param contents The read in csv-file
229/// \param separator Optional separator between entries, typically a comma, spaces are automatically ignored
230/// \param comment Trailing characters indicating comment line. By default it is "#"
231/// \param maximumBatchSize Size of batches in the dataset
233 Data<unsigned int> &data,
234 std::string const& contents,
235 char separator = ',',
236 char comment = '#',
237 std::size_t maximumBatchSize = Data<unsigned int>::DefaultBatchSize
238);
239
240/// \brief Import "csv" from string consisting only of a single int per row
241///
242/// \param data Container storing the loaded data
243/// \param contents The read in csv-file
244/// \param separator Optional separator between entries, typically a comma, spaces are automatically ignored
245/// \param comment Trailing characters indicating comment line. By default it is "#"
246/// \param maximumBatchSize Size of batches in the dataset
248 Data<int> &data,
249 std::string const& contents,
250 char separator = ',',
251 char comment = '#',
252 std::size_t maximumBatchSize = Data<int>::DefaultBatchSize
253);
254
255/// \brief Import "csv" from string consisting only of a single float per row
256///
257/// \param data Container storing the loaded data
258/// \param contents The read in csv-file
259/// \param separator Optional separator between entries, typically a comma, spaces are automatically ignored
260/// \param comment Trailing characters indicating comment line. By default it is "#"
261/// \param maximumBatchSize Size of batches in the dataset
263 Data<float> &data,
264 std::string const& contents,
265 char separator = ',',
266 char comment = '#',
267 std::size_t maximumBatchSize = Data<double>::DefaultBatchSize
268);
269
270/// \brief Import "csv" from string consisting only of a single double per row
271///
272/// \param data Container storing the loaded data
273/// \param contents The read in csv-file
274/// \param separator Optional separator between entries, typically a comma, spaces are automatically ignored
275/// \param comment Trailing characters indicating comment line. By default it is "#"
276/// \param maximumBatchSize Size of batches in the dataset
278 Data<double> &data,
279 std::string const& contents,
280 char separator = ',',
281 char comment = '#',
282 std::size_t maximumBatchSize = Data<double>::DefaultBatchSize
283);
284
285/// \brief Import labeled data from a character-separated value file.
286///
287/// \param dataset Container storing the loaded data
288/// \param contents the read-in file contents.
289/// \param lp Position of the label in the record, either first or last column
290/// \param separator Optional separator between entries, typically a comma, spaces are automatically ignored
291/// \param comment Character for indicating a comment, by default '#'
292/// \param maximumBatchSize maximum size of a batch in the dataset after import
295 std::string const& contents,
296 LabelPosition lp,
297 char separator = ',',
298 char comment = '#',
300);
301
302/// \brief Import labeled data from a character-separated value file.
303///
304/// \param dataset Container storing the loaded data
305/// \param contents the read-in file contents.
306/// \param lp Position of the label in the record, either first or last column
307/// \param separator Optional separator between entries, typically a comma, spaces are automatically ignored
308/// \param comment Character for indicating a comment, by default '#'
309/// \param maximumBatchSize maximum size of a batch in the dataset after import
312 std::string const& contents,
313 LabelPosition lp,
314 char separator = ',',
315 char comment = '#',
317);
318
319
320/// \brief Import regression data from a read-in character-separated value file.
321///
322/// \param dataset Container storing the loaded data
323/// \param contents The read in csv-file
324/// \param lp Position of the label in the record, either first or last column
325/// \param separator Separator between entries, typically a comma or a space
326/// \param comment Character for indicating a comment, by default '#'
327/// \param numberOfOutputs Dimensionality of label/output
328/// \param maximumBatchSize maximum size of a batch in the dataset after import
331 std::string const& contents,
332 LabelPosition lp,
333 std::size_t numberOfOutputs = 1,
334 char separator = ',',
335 char comment = '#',
336 std::size_t maximumBatchSize = LabeledData<RealVector, RealVector>::DefaultBatchSize
337);
338
339/// \brief Import regression data from a read-in character-separated value file.
340///
341/// \param dataset Container storing the loaded data
342/// \param contents The read in csv-file
343/// \param lp Position of the label in the record, either first or last column
344/// \param separator Separator between entries, typically a comma or a space
345/// \param comment Character for indicating a comment, by default '#'
346/// \param numberOfOutputs Dimensionality of label/output
347/// \param maximumBatchSize maximum size of a batch in the dataset after import
350 std::string const& contents,
351 LabelPosition lp,
352 std::size_t numberOfOutputs = 1,
353 char separator = ',',
354 char comment = '#',
355 std::size_t maximumBatchSize = LabeledData<RealVector, RealVector>::DefaultBatchSize
356);
357
358
359
360/// \brief Import a Dataset from a csv file
361///
/// Opens the file \a fn, skips the first \a titleLines lines, reads the
/// remaining characters into a string and hands that string to
/// csvStringToData together with \a separator, \a comment and
/// \a maximumBatchSize.
///
362/// \param data Container storing the loaded data
363/// \param fn The file to be read from
364/// \param separator Optional separator between entries, typically a comma, spaces are automatically ignored
365/// \param comment Trailing character indicating comment line. By default it is '#'
366/// \param maximumBatchSize Size of batches in the dataset
367/// \param titleLines Specifies a number of lines to be skipped in the beginning of the file
368template<class T>
370 Data<T>& data,
371 std::string fn,
372 char separator = ',',
373 char comment = '#',
374 std::size_t maximumBatchSize = Data<T>::DefaultBatchSize,
375 std::size_t titleLines = 0
376){
377 std::ifstream stream(fn.c_str());
    // presumably reports an error when the file could not be opened; the macro is defined elsewhere
378 SHARK_RUNTIME_CHECK(stream, "Stream cannot be opened for reading.");
379
    // keep whitespace: formatted extraction of char would otherwise skip blanks and newlines
380 stream.unsetf(std::ios::skipws);
381
382 for(std::size_t i=0; i < titleLines; ++i) // ignoring the first lines
383 stream.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
384
    // iterate from the current stream position up to end-of-stream
385 std::istream_iterator<char> streamBegin(stream);
386 std::string contents(//read contents of file in string
387 streamBegin,
388 std::istream_iterator<char>()
389 );
390 //call the actual parser
391 csvStringToData(data,contents,separator,comment,maximumBatchSize);
392}
393
394/// \brief Import a labeled Dataset from a csv file
395///
/// Opens the file \a fn, reads all of its characters into a string and hands
/// that string to csvStringToData together with \a lp, \a separator,
/// \a comment and \a maximumBatchSize.
///
396/// \param data Container storing the loaded data
397/// \param fn The file to be read from
398/// \param lp Position of the label in the record, either first or last column
399/// \param separator Optional separator between entries, typically a comma, spaces are automatically ignored
400/// \param comment Trailing character indicating comment line. By default it is '#'
401/// \param maximumBatchSize Size of batches in the dataset
402template<class T>
404 LabeledData<blas::vector<T>, unsigned int>& data,
405 std::string fn,
406 LabelPosition lp,
407 char separator = ',',
408 char comment = '#',
410){
411 std::ifstream stream(fn.c_str());
    // presumably reports an error when the file could not be opened; the macro is defined elsewhere
412 SHARK_RUNTIME_CHECK(stream, "Stream cannot be opened for reading.");
413
    // keep whitespace: formatted extraction of char would otherwise skip blanks and newlines
414 stream.unsetf(std::ios::skipws);
415 std::istream_iterator<char> streamBegin(stream);
416 std::string contents(//read contents of file in string
417 streamBegin,
418 std::istream_iterator<char>()
419 );
420 //call the actual parser
421 csvStringToData(data,contents,lp,separator,comment,maximumBatchSize);
422}
423
424/// \brief Import a labeled Dataset from a csv file
425///
/// Opens the file \a fn, reads all of its characters into a string and hands
/// that string to csvStringToData together with \a lp, \a numberOfOutputs,
/// \a separator, \a comment and \a maximumBatchSize.
///
426/// \param data Container storing the loaded data
427/// \param fn The file to be read from
428/// \param lp Position of the label in the record, either first or last column
429/// \param numberOfOutputs dimensionality of the labels
430/// \param separator Optional separator between entries, typically a comma, spaces are automatically ignored
431/// \param comment Trailing character indicating comment line. By default it is '#'
432/// \param maximumBatchSize Size of batches in the dataset
433template<class T>
435 LabeledData<blas::vector<T>, blas::vector<T> >& data,
436 std::string fn,
437 LabelPosition lp,
438 std::size_t numberOfOutputs = 1,
439 char separator = ',',
440 char comment = '#',
441 std::size_t maximumBatchSize = LabeledData<RealVector, RealVector>::DefaultBatchSize
442){
443 std::ifstream stream(fn.c_str());
    // presumably reports an error when the file could not be opened; the macro is defined elsewhere
444 SHARK_RUNTIME_CHECK(stream, "Stream cannot be opened for reading.");
445
    // keep whitespace: formatted extraction of char would otherwise skip blanks and newlines
446 stream.unsetf(std::ios::skipws);
447 std::istream_iterator<char> streamBegin(stream);
448 std::string contents(//read contents of file in string
449 streamBegin,
450 std::istream_iterator<char>()
451 );
452 //call the actual parser
453 csvStringToData(data,contents,lp, numberOfOutputs, separator,comment,maximumBatchSize);
454}
455
456/// \brief Format unlabeled data into a character-separated value file.
457///
/// Opens the file \a fn for writing and forwards the elements of \a set,
/// together with \a separator, \a sci and \a width, to detail::exportCSV.
///
458/// \param set Container to be exported
459/// \param fn The file to be written to
460/// \param separator Separator between entries, typically a comma or a space
461/// \param sci should the output be in scientific notation?
462/// \param width argument to std::setw when writing the output
463template<typename Type>
465 Data<Type> const& set,
466 std::string fn,
467 char separator = ',',
468 bool sci = true,
469 unsigned int width = 0
470) {
471 std::ofstream ofs(fn.c_str());
    // presumably reports an error when the file could not be opened; the macro is defined elsewhere
472 SHARK_RUNTIME_CHECK(ofs, "Stream cannot be opened for writing.");
    // the element-wise view of the data set is what gets written, one element per line
473 detail::exportCSV(set.elements(), ofs, separator, sci, width);
474}
475
476
477/// \brief Format labeled data into a character-separated value file.
478///
/// Opens the file \a fn for writing and forwards the input elements and the
/// label elements of \a dataset, together with \a lp, \a separator, \a sci
/// and \a width, to detail::exportCSV_labeled.
///
479/// \param dataset Container to be exported
480/// \param fn The file to be written to
481/// \param lp Position of the label in the record, either first or last column
482/// \param separator Separator between entries, typically a comma or a space
483/// \param sci should the output be in scientific notation?
484/// \param width argument to std::setw when writing the output
485template<typename InputType, typename LabelType>
488 std::string fn,
489 LabelPosition lp,
490 char separator = ',',
491 bool sci = true,
492 unsigned int width = 0
493) {
494 std::ofstream ofs(fn.c_str());
    // presumably reports an error when the file could not be opened; the macro is defined elsewhere
495 SHARK_RUNTIME_CHECK(ofs, "Stream cannot be opened for writing.");
    // inputs and labels are passed as two separate element ranges
496 detail::exportCSV_labeled(dataset.inputs().elements(), dataset.labels().elements(), ofs, lp, separator, sci, width);
497}
498
499
500/** @}*/
501
502} // namespace shark
503#endif // SHARK_DATA_CSV_H