CVDatasetTools.h
Go to the documentation of this file.
1//===========================================================================
2/*!
3 *
4 *
5 * \brief Tools for cross-validation
6 *
7 *
8 *
9 * \author O.Krause
10 * \date 2010-2012
11 *
12 *
13 * \par Copyright 1995-2017 Shark Development Team
14 *
15 * <BR><HR>
16 * This file is part of Shark.
17 * <https://shark-ml.github.io/Shark/>
18 *
19 * Shark is free software: you can redistribute it and/or modify
20 * it under the terms of the GNU Lesser General Public License as published
21 * by the Free Software Foundation, either version 3 of the License, or
22 * (at your option) any later version.
23 *
24 * Shark is distributed in the hope that it will be useful,
25 * but WITHOUT ANY WARRANTY; without even the implied warranty of
26 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 * GNU Lesser General Public License for more details.
28 *
29 * You should have received a copy of the GNU Lesser General Public License
30 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
31 *
32 */
33//===========================================================================
34
35#ifndef SHARK_DATA_CVDATASETTOOLS_H
36#define SHARK_DATA_CVDATASETTOOLS_H
37
38#include <shark/Data/Dataset.h>
39#include <shark/Core/Random.h>
40#include <algorithm>
41#include <shark/Data/DataView.h>
42
43#include <utility> //for std::pair
44
45namespace shark {
46
47
48template<class DatasetTypeT>
49class CVFolds {
50public:
51 typedef DatasetTypeT DatasetType;
52 typedef typename DatasetType::IndexSet IndexSet;
53
54 /// \brief Creates an empty set of folds.
56 ///\brief partitions set in validation folds indicated by the second argument.
57 ///
58 /// The folds are given as the batch indices of the validation sets
60 DatasetType const &set,
61 std::vector<IndexSet> const &validationIndizes
62 ) : m_dataset(set),m_validationFolds(validationIndizes) {}
63
65 DatasetType const &set,
66 std::vector<std::size_t> const &foldStart
67 ) : m_dataset(set){
68 for (std::size_t partition = 0; partition != foldStart.size(); partition++) {
69 std::size_t partitionSize = (partition+1 == foldStart.size()) ? set.numberOfBatches() : foldStart[partition+1];
70 partitionSize -= foldStart[partition];
71 //create the set with the indices of the validation set of the current partition
72 //also update the starting element
73 IndexSet validationIndizes(partitionSize);
74 for (std::size_t batch = 0; batch != partitionSize; ++batch) {
75 validationIndizes[batch]=batch+foldStart[partition];
76 }
77 m_validationFolds.push_back(validationIndizes);
78 }
79 }
80
81 DatasetType training(std::size_t i) const {
82 SIZE_CHECK(i < size());
83 return m_dataset.indexedSubset(trainingFoldIndices(i));
84 }
85 DatasetType validation(std::size_t i) const {
86 SIZE_CHECK(i < size());
87 return m_dataset.indexedSubset(validationFoldIndices(i));
88 }
89
90 ///\brief returns the indices that make up the i-th validation fold
91 IndexSet const &validationFoldIndices(std::size_t i)const {
92 SIZE_CHECK(i < size());
93 return m_validationFolds[i];
94 }
95
96 IndexSet trainingFoldIndices(std::size_t i)const {
97 SIZE_CHECK(i < size());
98 IndexSet trainingFold;
99 detail::complement(m_validationFolds[i], m_dataset.numberOfBatches(), trainingFold);
100 return trainingFold;
101 }
102
103 ///\brief Returns the number of folds of the dataset.
104 std::size_t size()const {
105 return m_validationFolds.size();
106 }
107
108 /// \brief Returns the dataset underying the folds
109 DatasetType const& dataset()const{
110 return m_dataset;
111 }
112
113 /// \brief Returns the dataset underying the folds
115 return m_dataset;
116 }
117
118private:
119 DatasetType m_dataset;
120 std::vector<IndexSet> m_validationFolds;
121 std::size_t m_datasetSize;
122 std::vector<std::size_t> m_validationFoldSizes;
123};
124
125
126/// auxiliary typedef for createCVSameSizeBalanced and createCVFullyIndexed, stores location index in the first and partition index in the second
127typedef std::pair< std::vector<std::size_t> , std::vector<std::size_t> > RecreationIndices;
128
129namespace detail {
130
131///\brief Version of createCVSameSizeBalanced which works regardless of the label type
132///
133/// Instead of a class label to interpret, this class uses a membership vector for every
134/// class which members[k][i] returns the positon of the i-th member of class k in the set.
135template<class I, class L>
136CVFolds<LabeledData<I,L> > createCVSameSizeBalanced(
137 LabeledData<I,L> &set,
138 std::size_t numberOfPartitions,
139 std::vector< std::vector<std::size_t> > members,
140 std::size_t batchSize,
141 RecreationIndices * cv_indices = NULL //if not NULL: the first vector stores location information, and
142 // the second the partition information. The i-th value of the
143 // first vector shows what the original position of the now i-th
144 // sample was. The i-th value of the second vector shows what
145 // partition that sample now belongs to.
146) {
147 std::size_t numInputs = set.numberOfElements();
148 std::size_t numClasses = members.size();
149
150 //shuffle elements in members
151 for (std::size_t c = 0; c != numClasses; c++) {
152 std::shuffle(members[c].begin(), members[c].end(), random::globalRng);
153 }
154
155 //calculate number of elements per validation subset in the new to construct container
156 std::size_t nn = numInputs / numberOfPartitions;
157 std::size_t leftOver = numInputs % numberOfPartitions;
158 std::vector<std::size_t> validationSize(numberOfPartitions,nn);
159 for (std::size_t partition = 0; partition != leftOver; partition++) {
160 validationSize[partition]++;
161 }
162
163 //calculate the size of the batches for every validation part
164 std::vector<std::size_t> partitionStart;
165 std::vector<std::size_t> batchSizes;
166 std::size_t numBatches = batchPartitioning(validationSize,partitionStart,batchSizes,batchSize);
167
168
169 LabeledData<I,L> newSet(numBatches);//set of empty batches
170 DataView<LabeledData<I,L> > setView(set);//fast access to single elements of the original set
171 std::vector<std::size_t> validationSetStart = partitionStart;//current index for the batch of every fold
172 //partition classes into the validation subsets of newSet
173 std::size_t fold = 0;//current fold
174 std::vector<std::vector<std::size_t> > batchElements(numberOfPartitions);
175
176 //initialize the list of position indices which can later be used to re-create the fold (via createCV(Fully)Indexed)
177 if ( cv_indices != NULL ) {
178 cv_indices->first.clear();
179 cv_indices->first.resize( numInputs );
180 cv_indices->second.clear();
181 cv_indices->second.resize( numInputs );
182 }
183
184 size_t j = 0; //for recreation indices
185 for (std::size_t c = 0; c != numClasses; c++) {
186 for (std::size_t i = 0; i != members[c].size(); i++) {
187 std::size_t oldPos = members[c][i];
188 std::size_t batchNumber = validationSetStart[fold];
189
190 batchElements[fold].push_back(oldPos);
191
192 if ( cv_indices != NULL ) {
193 cv_indices->first[ j ] = oldPos; //store the position in which the (now) i-th sample previously resided
194 cv_indices->second[ j ] = fold; //store the partition to which the (now) i-th sample gets assigned
195 // old: //(*cv_indices)[ oldPos ] = fold; //store in vector to recreate partition if desired
196 }
197
198 //if all elements for the current batch are found, create it
199 if (batchElements[fold].size() == batchSizes[batchNumber]) {
200 newSet.batch(validationSetStart[fold]) = subBatch(setView,batchElements[fold]);
201 batchElements[fold].clear();
202 ++validationSetStart[fold];
203 }
204
205 fold = (fold+1) % numberOfPartitions;
206
207 j++;
208 }
209 }
210 SHARK_ASSERT( j == numInputs );
211
212 //swap old and new set
213 swap(set, newSet);
214
215 //create folds
216 return CVFolds<LabeledData<I,L> >(set,partitionStart);
217
218}
219}//namespace detail
220
221/**
222 * \ingroup shark_globals
223 *
224 * @{
225 */
226
227//! \brief Create a partition for cross validation
228//!
229//! The subset each training examples belongs to
230//! is drawn independently and uniformly distributed.
231//! For every partition, all but one subset form the
232//! training set, while the remaining one is used for
233//! validation. The partitions can be accessed using
234//! getCVPartitionName
235//!
236//! \param set the input data for which the new partitions are created
237//! \param numberOfPartitions number of partitions to create
238//! \param batchSize maximum batch size
239template<class I,class L>
241 std::size_t numberOfPartitions,
243 std::vector<std::size_t> indices(set.numberOfElements());
244 for (std::size_t i=0; i != set.numberOfElements(); i++)
245 indices[i] = random::discrete(random::globalRng, std::size_t(0), numberOfPartitions - 1);
246 return createCVIndexed(set,numberOfPartitions,indices,batchSize);
247}
248
249//! \brief Create a partition for cross validation
250//!
251//! Every subset contains (approximately) the same
252//! number of elements. For every partition, all
253//! but one subset form the training set, while the
254//! remaining one is used for validation. The partitions
255//! can be accessed using getCVPartitionName
256//!
257//! \param numberOfPartitions number of partitions to create
258//! \param set the input data from which to draw the partitions
259//! \param batchSize maximum batch size
260template<class I,class L>
262 std::size_t numInputs = set.numberOfElements();
263
264 //calculate the number of validation examples for every partition
265 std::vector<std::size_t> validationSize(numberOfPartitions);
266 std::size_t inputsForValidation = numInputs / numberOfPartitions;
267 std::size_t leftOver = numInputs - inputsForValidation * numberOfPartitions;
268 for (std::size_t i = 0; i != numberOfPartitions; i++) {
269 std::size_t vs=inputsForValidation+(i<leftOver);
270 validationSize[i] =vs;
271 }
272
273 //calculate the size of batches for every validation part and their total number
274 std::vector<std::size_t> partitionStart;
275 std::vector<std::size_t> batchSizes;
276 detail::batchPartitioning(validationSize,partitionStart,batchSizes,batchSize);
277
278 set.repartition(batchSizes);
279 set.shuffle();
280
281 CVFolds<LabeledData<I,L> > folds(set,partitionStart);
282 return folds;//set;
283}
284
285
286//! \brief Create a partition for cross validation
287//!
288//! Every subset contains (approximately) the same
289//! number of elements. For every partition, all
290//! but one subset form the training set, while the
291//! remaining one is used for validation.
292//!
293//! \param numberOfPartitions number of partitions to create
294//! \param set the input data from which to draw the partitions
295//! \param batchSize maximum batch size
296//! \param cv_indices if not NULL [default]: for each element, store the fold it is assigned to; this can be used to later/externally recreate the fold via createCVIndexed
297template<class I>
300 std::size_t numberOfPartitions,
302 RecreationIndices * cv_indices = NULL //if not NULL: for each element, store the fold it is assigned to; this can be used to later/externally recreate the fold via createCVIndexed
303){
305 std::size_t numInputs = setView.size();
306 std::size_t numClasses = numberOfClasses(set);
307
308
309 //find members of each class
310 std::vector< std::vector<std::size_t> > members(numClasses);
311 for (std::size_t i = 0; i != numInputs; i++) {
312 members[setView[i].label].push_back(i);
313 }
314 return detail::createCVSameSizeBalanced(set, numberOfPartitions, members, batchSize, cv_indices);
315
316}
317
318//! \brief Create a partition for cross validation without changing the dataset
319//!
320//! This method behaves similar to createCVIID
321//! with the difference that batches are not reordered. Thus the batches
322//! are only rearranged randomly in folds, but the dataset itself is not changed.
323//!
324//! \param numberOfPartitions number of partitions to create
325//! \param set the input data from which to draw the partitions
326template<class I, class L>
328 LabeledData<I,L> const& set,
329 std::size_t numberOfPartitions
330){
331 std::vector<std::size_t> indizes(set.numberOfBatches());
332 for(std::size_t i= 0; i != set.numberOfBatches(); ++i)
333 indizes[i] = i;
334 shark::shuffle(indizes.begin(),indizes.end(), random::globalRng);
335
336 typedef typename LabeledData<I,L>::IndexSet IndexSet;
337
338 std::vector<IndexSet> folds;
339 std::size_t partitionSize = set.numberOfBatches()/numberOfPartitions;
340 std::size_t remainder = set.numberOfBatches() - partitionSize*numberOfPartitions;
341 std::vector<std::size_t>::iterator pos = indizes.begin();
342 for(std::size_t i = 0; i!= numberOfPartitions; ++i){
343 std::size_t size = partitionSize;
344 if(remainder> 0){
345 ++size;
346 --remainder;
347 }
348 folds.push_back(IndexSet(pos,pos+size));
349 pos+=size;
350 }
351 return CVFolds<LabeledData<I,L> >(set,folds);
352}
353
354//! \brief Create a partition for cross validation from indices
355//!
356//! Create a partition from indices. The indices vector for each sample states of what
357//! validation partition that sample should become a member. In other words, the index
358//! maps a sample to a validation partition, meaning that it will become a part of the
359//! training partition for all other folds.
360//!
361//! \param set partitions will be subsets of this set
362//! \param numberOfPartitions number of partitions to create
363//! \param indices partition indices of the examples in [0, ..., numberOfPartitions[.
364//! \param batchSize maximum batch size
365template<class I,class L>
367 LabeledData<I,L> &set,
368 std::size_t numberOfPartitions,
369 std::vector<std::size_t> indices,
371) {
372 std::size_t numInputs = set.numberOfElements();
373 SIZE_CHECK(indices.size() == numInputs);
374 SIZE_CHECK(numberOfPartitions == *std::max_element(indices.begin(),indices.end())+1);
375
376 //calculate the size of validation partitions
377 std::vector<std::size_t> validationSize(numberOfPartitions,0);
378 for (std::size_t input = 0; input != numInputs; input++) {
379 validationSize[indices[input]]++;
380 }
381
382 //calculate the size of batches for every validation part and their total number
383 std::vector<std::size_t> partitionStart;
384 std::vector<std::size_t> batchSizes;
385 std::size_t numBatches = detail::batchPartitioning(validationSize,partitionStart,batchSizes,batchSize);
386
387 //construct a new set with the correct batch format from the old set
388 LabeledData<I,L> newSet(numBatches);
389 DataView<LabeledData<I,L> > setView(set); //fast access to single elements of the original set
390 std::vector<std::size_t> validationSetStart = partitionStart; //current index for the batch of every partition
391 std::vector<std::vector<std::size_t> > batchElements(numberOfPartitions);
392 for (std::size_t input = 0; input != numInputs; input++) {
393 std::size_t partition = indices[input];
394 batchElements[partition].push_back(input);
395
396 //if all elements for the current batch are found, create it
397 std::size_t batchNumber = validationSetStart[partition];
398 if (batchElements[partition].size() == batchSizes[batchNumber]) {
399 newSet.batch(validationSetStart[partition]) = subBatch(setView,batchElements[partition]);
400 batchElements[partition].clear();
401 ++validationSetStart[partition];
402 }
403 }
404 swap(set, newSet);
405 //now we only need to create the subset itself
406 return CVFolds<LabeledData<I,L> >(set,partitionStart);
407}
408
409
410
411//! \brief Create a partition for cross validation from indices for both ordering and partitioning.
412//!
413//! Create a partition from indices. There is one index vector assigning an order
414//! to the samples, and another one assigning each sample to a validation partition.
415//! That is, given a dataset set, and at the i-th processing step, this function puts
416//! the order_indices[i]-th sample into the partition_indices[i]-th partition. The
417//! order_indices part of the above procedure matters if both an inner and
418//! outer partition are to be recreated: for the inner partition to be recreated, too,
419//! the outer partition must be recreated in the same order, not just partitioned into
420//! the same splits.
421//!
422//! \param set partitions will be subsets of this set
423//! \param numberOfPartitions number of partitions to create
424//! \param indices stores location index in the first and partition index in the second vector
425//! \param batchSize maximum batch size
426template<class I,class L>
428 LabeledData<I,L> &set,
429 std::size_t numberOfPartitions,
430 RecreationIndices indices,
432) {
433 std::size_t numInputs = set.numberOfElements();
434 SIZE_CHECK(indices.first.size() == numInputs);
435 SIZE_CHECK(indices.second.size() == numInputs);
436 SIZE_CHECK(numberOfPartitions == *std::max_element(indices.second.begin(),indices.second.end())+1);
437
438 //calculate the size of validation partitions
439 std::vector<std::size_t> validationSize(numberOfPartitions,0);
440 for (std::size_t input = 0; input != numInputs; input++) {
441 validationSize[indices.second[input]]++;
442 }
443
444 //calculate the size of batches for every validation part and their total number
445 std::vector<std::size_t> partitionStart;
446 std::vector<std::size_t> batchSizes;
447 std::size_t numBatches = detail::batchPartitioning(validationSize,partitionStart,batchSizes,batchSize);
448
449 //construct a new set with the correct batch format from the old set
450 LabeledData<I,L> newSet(numBatches);
451 DataView<LabeledData<I,L> > setView(set); //fast access to single elements of the original set
452 std::vector<std::size_t> validationSetStart = partitionStart; //current index for the batch of every partition
453 std::vector<std::vector<std::size_t> > batchElements(numberOfPartitions);
454 for (std::size_t input = 0; input != numInputs; input++) {
455 std::size_t partition = indices.second[input]; //the second vector's contents indicate the partition to assign each sample to.
456 batchElements[partition].push_back( indices.first[input] ); //the first vector's contents indicate from what original position to get the next sample.
457
458 //if all elements for the current batch are found, create it
459 std::size_t batchNumber = validationSetStart[partition];
460 if (batchElements[partition].size() == batchSizes[batchNumber]) {
461 newSet.batch(validationSetStart[partition]) = subBatch(setView,batchElements[partition]);
462 batchElements[partition].clear();
463 ++validationSetStart[partition];
464 }
465 }
466 swap(set, newSet);
467 //now we only need to create the subset itself
468 return CVFolds<LabeledData<I,L> >(set,partitionStart);
469}
470
471
472// much more to come...
473
474/** @}*/
475}
476#endif