WeightedDataset.h
Go to the documentation of this file.
1//===========================================================================
2/*!
3 *
4 *
5 * \brief Weighted data sets for (un-)supervised learning.
6 *
7 *
8 * \par
9 * This file provides containers for data used by the models, loss
10 * functions, and learning algorithms (trainers). The reason for
11 * dedicated containers of this type is that data often need to be
12 * split into subsets, such as training and test data, or folds in
13 * cross-validation. The containers in this file provide memory
14 * efficient mechanisms for managing and providing such subsets.
15 * The speciality of these containers is that they are weighted.
16 *
17 *
18 *
19 * \author O. Krause
20 * \date 2014
21 *
22 *
23 * \par Copyright 1995-2017 Shark Development Team
24 *
25 * <BR><HR>
26 * This file is part of Shark.
27 * <https://shark-ml.github.io/Shark/>
28 *
29 * Shark is free software: you can redistribute it and/or modify
30 * it under the terms of the GNU Lesser General Public License as published
31 * by the Free Software Foundation, either version 3 of the License, or
32 * (at your option) any later version.
33 *
34 * Shark is distributed in the hope that it will be useful,
35 * but WITHOUT ANY WARRANTY; without even the implied warranty of
36 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
37 * GNU Lesser General Public License for more details.
38 *
39 * You should have received a copy of the GNU Lesser General Public License
40 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
41 *
42 */
43//===========================================================================
44
45#ifndef SHARK_DATA_WEIGHTED_DATASET_H
46#define SHARK_DATA_WEIGHTED_DATASET_H
47
48#include <shark/Data/Dataset.h>
49namespace shark {
50
51///\brief Data-weight pair: a data element together with its weight
52template<class DataType, class WeightType>
54 DataType data;
55 WeightType weight;
56
58
59 template<class DataT, class WeightT>
61 DataT&& data,
62 WeightT&& weight
64
65 template<class DataT, class WeightT>
69
70 template<class DataT, class WeightT>
72 data = batch.data;
73 weight = batch.weight;
74 return *this;
75 }
77 data = batch.data;
78 weight = batch.weight;
79 return *this;
80 }
81};
82
83template<class D1, class W1, class D2, class W2>
85 using std::swap;
86 swap(std::forward<D1>(p1.data),std::forward<D2>(p2.data));
87 swap(std::forward<W1>(p1.weight),std::forward<W2>(p2.weight));
88}
89
90template<class DataBatchType,class WeightBatchType>
92private:
95public:
96 DataBatchType data;
97 WeightBatchType weight;
98
99 typedef WeightedDataPair<
100 typename DataBatchTraits::value_type,
101 typename WeightBatchTraits::value_type
102 > value_type;
103 typedef WeightedDataPair<
104 decltype(getBatchElement(std::declval<DataBatchType&>(),0)),
105 decltype(getBatchElement(std::declval<WeightBatchType&>(),0))
107 typedef WeightedDataPair<
108 decltype(getBatchElement(std::declval<typename std::add_const<DataBatchType>::type&>(),0)),
109 decltype(getBatchElement(std::declval<typename std::add_const<WeightBatchType>::type&>(),0))
113
114 template<class D, class W>
116 D&& data,
117 W&& weight
118 ):data(data),weight(weight){}
119
120 template<class Pair>
124
125 template<class I, class L>
127 data = batch.data;
128 weight = batch.weight;
129 return *this;
130 }
131
132 std::size_t size()const{
133 return DataBatchTraits::size(data);
134 }
135
137 return iterator(*this,0);
138 }
140 return const_iterator(*this,0);
141 }
142
144 return iterator(*this,size());
145 }
147 return const_iterator(*this,size());
148 }
149
150 reference operator[](std::size_t i){
152 }
156};
157
158template<class D1, class W1, class D2, class W2>
160 using std::swap;
161 swap(p1.data,p2.data);
162 swap(p1.weight,p2.weight);
163}
164
165template<class DataType, class WeightType>
166struct Batch<WeightedDataPair<DataType, WeightType> >
167: public detail::SimpleBatch<
168 WeightedDataBatch<typename detail::element_to_batch<DataType>::type, typename detail::element_to_batch<WeightType>::type>
169>{};
170
171template<class DataType, class WeightType>
172struct BatchTraits<WeightedDataBatch<DataType, WeightType> >{
173 typedef typename detail::batch_to_element<DataType>::type DataElem;
174 typedef typename detail::batch_to_element<WeightType>::type WeightElem;
176};
177
178
179namespace detail{
180template <class DataContainerT>
181class BaseWeightedDataset : public ISerializable
182{
183public:
184 typedef typename DataContainerT::element_type DataType;
185 typedef double WeightType;
186 typedef DataContainerT DataContainer;
187 typedef Data<WeightType> WeightContainer;
188 typedef typename DataContainer::IndexSet IndexSet;
189
190 // TYPEDEFS FOR PAIRS
191 typedef WeightedDataPair<
192 DataType,
193 WeightType
194 > element_type;
195
196 typedef WeightedDataBatch<
197 typename DataContainer::batch_type,
198 typename WeightContainer::batch_type
199 > batch_type;
200
201 // TYPEDEFS FOR BATCH REFERENCES
202 typedef WeightedDataBatch<
203 typename DataContainer::batch_reference,
205 > batch_reference;
206 typedef WeightedDataBatch<
207 typename DataContainer::const_batch_reference,
209 > const_batch_reference;
210
211 typedef typename Batch<element_type>::reference element_reference;
212 typedef typename Batch<element_type>::const_reference const_element_reference;
213
214 typedef boost::iterator_range< detail::DataElementIterator<BaseWeightedDataset<DataContainer> > > element_range;
215 typedef boost::iterator_range< detail::DataElementIterator<BaseWeightedDataset<DataContainer> const> > const_element_range;
216 typedef detail::BatchRange<BaseWeightedDataset<DataContainer> > batch_range;
217 typedef detail::BatchRange<BaseWeightedDataset<DataContainer> const> const_batch_range;
218
219
220 ///\brief Returns the range of elements.
221 ///
222 ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
223 ///element access via begin()/end() in which case data.elements() provides the correct interface
224 const_element_range elements()const{
225 return const_element_range(
226 detail::DataElementIterator<BaseWeightedDataset<DataContainer> const>(this,0,0,0),
227 detail::DataElementIterator<BaseWeightedDataset<DataContainer> const>(this,numberOfBatches(),0,numberOfElements())
228 );
229 }
230 ///\brief Returns therange of elements.
231 ///
232 ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
233 ///element access via begin()/end() in which case data.elements() provides the correct interface
234 element_range elements(){
235 return element_range(
236 detail::DataElementIterator<BaseWeightedDataset<DataContainer> >(this,0,0,0),
237 detail::DataElementIterator<BaseWeightedDataset<DataContainer> >(this,numberOfBatches(),0,numberOfElements())
238 );
239 }
240
241 ///\brief Returns the range of batches.
242 ///
243 ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
244 ///element access via begin()/end() in which case data.elements() provides the correct interface
245 const_batch_range batches()const{
246 return const_batch_range(this);
247 }
248 ///\brief Returns the range of batches.
249 ///
250 ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
251 ///element access via begin()/end() in which case data.elements() provides the correct interface
252 batch_range batches(){
253 return batch_range(this);
254 }
255
256 ///\brief Returns the number of batches of the set.
257 std::size_t numberOfBatches() const{
258 return m_data.numberOfBatches();
259 }
260 ///\brief Returns the total number of elements.
261 std::size_t numberOfElements() const{
262 return m_data.numberOfElements();
263 }
264
265 ///\brief Check whether the set is empty.
266 bool empty() const{
267 return m_data.empty();
268 }
269
270 ///\brief Access to the stored data points as a separate container.
271 DataContainer const& data() const{
272 return m_data;
273 }
274 ///\brief Access to the stored data points as a separate container.
275 DataContainer& data(){
276 return m_data;
277 }
278
279 ///\brief Access to weights as a separate container.
280 WeightContainer const& weights() const{
281 return m_weights;
282 }
283 ///\brief Access to weights as a separate container.
284 WeightContainer& weights(){
285 return m_weights;
286 }
287
288 // CONSTRUCTORS
289
290 ///\brief Constructs an Empty data set.
291 BaseWeightedDataset()
292 {}
293
294 ///\brief Create an empty set with just the correct number of batches.
295 ///
296 /// The user must initialize the dataset after that by himself.
297 BaseWeightedDataset(std::size_t numBatches)
298 : m_data(numBatches),m_weights(numBatches)
299 {}
300
301 /// \brief Construtor using a single element as blueprint to create a dataset with a specified number of elements.
302 ///
303 /// Optionally the desired batch Size can be set
304 ///
305 ///@param size the new size of the container
306 ///@param element the blueprint element from which to create the Container
307 ///@param batchSize the size of the batches. if this is 0, the size is unlimited
308 BaseWeightedDataset(std::size_t size, element_type const& element, std::size_t batchSize)
309 : m_data(size,element.data,batchSize)
310 , m_weights(size,element.weight,batchSize)
311 {}
312
313 ///\brief Construction from data and a dataset rpresnting the weights
314 ///
315 /// Beware that when calling this constructor the organization of batches must be equal in both
316 /// containers. This Constructor will not reorganize the data!
317 BaseWeightedDataset(DataContainer const& data, Data<WeightType> const& weights)
318 : m_data(data), m_weights(weights)
319 {
320 SHARK_RUNTIME_CHECK(data.numberOfElements() == weights.numberOfElements(), "[ BaseWeightedDataset::WeightedUnlabeledData] number of data and number of weights must agree");
321#ifndef DNDEBUG
322 for(std::size_t i = 0; i != data.numberOfBatches(); ++i){
323 SIZE_CHECK(batchSize(data.batch(i)) == batchSize(weights.batch(i)));
324 }
325#endif
326 }
327
328 ///\brief Construction from data. All points get the same weight assigned
329 BaseWeightedDataset(DataContainer const& data, double weight)
330 : m_data(data), m_weights(data.numberOfBatches())
331 {
332 for(std::size_t i = 0; i != numberOfBatches(); ++i){
333 m_weights.batch(i) = Batch<WeightType>::type(batchSize(m_data.batch(i)),weight);
334 }
335 }
336
337
338 // ELEMENT ACCESS
339 element_reference element(std::size_t i){
340 return *(detail::DataElementIterator<BaseWeightedDataset<DataContainer> >(this,0,0,0)+i);
341 }
342 const_element_reference element(std::size_t i) const{
343 return *(detail::DataElementIterator<BaseWeightedDataset<DataContainer> const>(this,0,0,0)+i);
344 }
345
346 // BATCH ACCESS
347 batch_reference batch(std::size_t i){
348 return batch_reference(m_data.batch(i),m_weights.batch(i));
349 }
350 const_batch_reference batch(std::size_t i) const{
351 return const_batch_reference(m_data.batch(i),m_weights.batch(i));
352 }
353
354 // MISC
355
356 /// from ISerializable
357 void read(InArchive& archive){
358 archive & m_data;
359 archive & m_weights;
360 }
361
362 /// from ISerializable
363 void write(OutArchive& archive) const{
364 archive & m_data;
365 archive & m_weights;
366 }
367
368 ///\brief This method makes the vector independent of all siblings and parents.
369 virtual void makeIndependent(){
370 m_weights.makeIndependent();
371 m_data.makeIndependent();
372 }
373
374 ///\brief shuffles all elements in the entire dataset (that is, also across the batches)
375 virtual void shuffle(){
376 shark::shuffle(this->elements().begin(),this->elements().end(), random::globalRng);
377 }
378
379 void splitBatch(std::size_t batch, std::size_t elementIndex){
380 m_data.splitBatch(batch,elementIndex);
381 m_weights.splitBatch(batch,elementIndex);
382 }
383
384 /// \brief Appends the contents of another data object to the end
385 ///
386 /// The batches are not copied but now referenced from both datasets. Thus changing the appended
387 /// dataset might change this one as well.
388 void append(BaseWeightedDataset const& other){
389 m_data.append(other.m_data);
390 m_weights.append(other.m_weights);
391 }
392
393
394 ///\brief Reorders the batch structure in the container to that indicated by the batchSizes vector
395 ///
396 ///After the operation the container will contain batchSizes.size() batches with the i-th batch having size batchSize[i].
397 ///However the sum of all batch sizes must be equal to the current number of elements
398 template<class Range>
399 void repartition(Range const& batchSizes){
400 m_data.repartition(batchSizes);
401 m_weights.repartition(batchSizes);
402 }
403
404 /// \brief Creates a vector with the batch sizes of every batch.
405 ///
406 /// This method can be used together with repartition to ensure
407 /// that two datasets have the same batch structure.
408 std::vector<std::size_t> getPartitioning()const{
409 return m_data.getPartitioning();
410 }
411
412 friend void swap( BaseWeightedDataset& a, BaseWeightedDataset& b){
413 swap(a.m_data,b.m_data);
414 swap(a.m_weights,b.m_weights);
415 }
416
417
418 // SUBSETS
419
420 ///\brief Fill in the subset defined by the list of indices.
421 BaseWeightedDataset indexedSubset(IndexSet const& indices) const{
422 BaseWeightedDataset subset;
423 subset.m_data = m_data.indexedSubset(indices);
424 subset.m_weights = m_weights.indexedSubset(indices);
425 return subset;
426 }
427private:
428 DataContainer m_data; /// point data
429 WeightContainer m_weights; /// weight data
430};
431}
432
433///
434/// \brief Weighted data set for unsupervised learning
435///
436/// The WeightedUnlabeledData class extends UnlabeledData for the
437/// representation of data. In addition it holds and provides access to the corresponding weights.
438///
439/// WeightedUnlabeledData tries to mimic the underlying data as pairs of data points and weights.
440/// this means that when accessing a batch by calling batch(i) or choosing one of the iterators
441/// one access the input batch by batch(i).data and the weights by batch(i).weight
442///
443///this also holds true for single element access using operator(). Be aware, that direct access to element is
444///a linear time operation. So it is not advisable to iterate over the elements, but instead iterate over the batches.
445template <class DataT>
446class WeightedUnlabeledData : public detail::BaseWeightedDataset <UnlabeledData<DataT> >
447{
448private:
449 typedef detail::BaseWeightedDataset <UnlabeledData<DataT> > base_type;
450public:
451 using base_type::data;
452 using base_type::weights;
453 typedef typename base_type::DataType DataType;
454 typedef typename base_type::WeightType WeightType;
455 typedef typename base_type::element_type element_type;
456 typedef DataT InputType;
457
459
460 // CONSTRUCTORS
461
462 ///\brief Empty data set.
465
466 ///\brief Create an empty set with just the correct number of batches.
467 ///
468 /// The user must initialize the dataset after that by himself.
469 WeightedUnlabeledData(std::size_t numBatches)
470 : base_type(numBatches)
471 {}
472
473 /// \brief Construtor using a single element as blueprint to create a dataset with a specified number of elements.
474 ///
475 /// Optionally the desired batch Size can be set
476 ///
477 ///@param size the new size of the container
478 ///@param element the blueprint element from which to create the Container
479 ///@param batchSize the size of the batches. if this is 0, the size is unlimited
480 WeightedUnlabeledData(std::size_t size, element_type const& element, std::size_t batchSize = DefaultBatchSize)
481 : base_type(size,element,batchSize){}
482
483 ///\brief Construction from data.
484 ///
485 /// Beware that when calling this constructor the organization of batches must be equal in both
486 /// containers. This Constructor will not reorganize the data!
488 : base_type(data,weights)
489 {}
490
491 ///\brief Construction from data and a constant weight for all elements
493 : base_type(data,weight)
494 {}
495
496 //we additionally add the two below for compatibility with UnlabeledData
497
498 ///\brief Access to the inputs as a separate container.
500 return data();
501 }
502 ///\brief Access to the inputs as a separate container.
504 return data();
505 }
506
507 ///\brief Returns the Shape of the data.
508 Shape const& shape() const{
509 return data().shape();
510 }
511
512 ///\brief Returns the Shape of the data.
514 return data().shape();
515 }
516 ///\brief Splits the container into two independent parts. The left part remains in the container, the right is stored as return type
517 ///
518 ///Order of elements remain unchanged. The SharedVector is not allowed to be shared for
519 ///this to work.
520 WeightedUnlabeledData splice(std::size_t batch){
521 return WeightedUnlabeledData(data().splice(batch),weights().splice(batch));
522 }
523
525 swap(static_cast<base_type&>(a),static_cast<base_type&>(b));
526 }
527};
528
529///brief Outstream of elements for weighted data.
530template<class T>
531std::ostream &operator << (std::ostream &stream, const WeightedUnlabeledData<T>& d) {
532 for(auto elem: d.elements())
533 stream << elem.weight << " [" << elem.data<<"]"<< "\n";
534 return stream;
535}
536
537/// \brief Creates a weighted unlabeled data object from two ranges, representing data and weights
538template<class DataRange, class WeightRange>
539typename boost::disable_if<
540 boost::is_arithmetic<WeightRange>,
541 WeightedUnlabeledData<
542 typename boost::range_value<DataRange>::type
543 >
544>::type createUnlabeledDataFromRange(DataRange const& data, WeightRange const& weights, std::size_t batchSize = 0){
545
546 SHARK_RUNTIME_CHECK(batchSize(data) == batchSize(weights),"Number of datapoints and number of weights must agree");
547
548 typedef typename boost::range_value<DataRange>::type Data;
549
550 if (batchSize == 0)
552
556 );
557}
558
559
560///
561/// \brief Weighted data set for supervised learning
562///
563/// The WeightedLabeledData class extends LabeledData for the
564/// representation of data. In addition it holds and provides access to the corresponding weights.
565///
566/// WeightedLabeledData tries to mimic the underlying data as pairs of data tuples(input,label) and weights.
567/// this means that when accessing a batch by calling batch(i) or choosing one of the iterators
568/// one access the databatch by batch(i).data and the weights by batch(i).weight. to access the points and labels
569/// use batch(i).data.input and batch(i).data.label
570///
571///this also holds true for single element access using operator(). Be aware, that direct access to element is
572///a linear time operation. So it is not advisable to iterate over the elements, but instead iterate over the batches.
573///
574/// It is possible to gain several views on the set. One can either get access to inputs, labels and weights separately
575/// or gain access to the unweighted dataset of inputs and labels. Additionally the sets support on-the-fly creation
576/// of the (inputs,weights) subset for unsupervised weighted learning
577template <class InputT, class LabelT>
578class WeightedLabeledData : public detail::BaseWeightedDataset <LabeledData<InputT,LabelT> >
579{
580private:
581 typedef detail::BaseWeightedDataset <LabeledData<InputT,LabelT> > base_type;
582public:
583 typedef typename base_type::DataType DataType;
584 typedef typename base_type::WeightType WeightType;
585 typedef InputT InputType;
586 typedef LabelT LabelType;
587 typedef typename base_type::element_type element_type;
588
589 using base_type::data;
590 using base_type::weights;
591
593
594 // CONSTRUCTORS
595
596 ///\brief Empty data set.
599
600 ///\brief Create an empty set with just the correct number of batches.
601 ///
602 /// The user must initialize the dataset after that by himself.
603 WeightedLabeledData(std::size_t numBatches)
604 : base_type(numBatches)
605 {}
606
607 /// \brief Construtor using a single element as blueprint to create a dataset with a specified number of elements.
608 ///
609 /// Optionally the desired batch Size can be set
610 ///
611 ///@param size the new size of the container
612 ///@param element the blueprint element from which to create the Container
613 ///@param batchSize the size of the batches. if this is 0, the size is unlimited
614 WeightedLabeledData(std::size_t size, element_type const& element, std::size_t batchSize = DefaultBatchSize)
615 : base_type(size,element,batchSize){}
616
617 ///\brief Construction from data.
618 ///
619 /// Beware that when calling this constructor the organization of batches must be equal in both
620 /// containers. This Constructor will not reorganize the data!
622 : base_type(data,weights)
623 {}
624
625 ///\brief Construction from data and a constant weight for all elements
627 : base_type(data,weight)
628 {}
629
630 ///\brief Access to the inputs as a separate container.
632 return data().inputs();
633 }
634 ///\brief Access to the inputs as a separate container.
636 return data().inputs();
637 }
638
639 ///\brief Access to the labels as a separate container.
640 Data<LabelType> const& labels() const{
641 return data().labels();
642 }
643 ///\brief Access to the labels as a separate container.
645 return data().labels();
646 }
647
648 ///\brief Returns the Shape of the inputs.
649 Shape const& inputShape() const{
650 return inputs().shape();
651 }
652
653 ///\brief Returns the Shape of the inputs.
655 return inputs().shape();
656 }
657
658 ///\brief Returns the Shape of the labels.
659 Shape const& labelShape() const{
660 return labels().shape();
661 }
662
663 ///\brief Returns the Shape of the labels.
665 return labels().shape();
666 }
667
668 /// \brief Constructs an WeightedUnlabeledData object for the inputs.
672
673 ///\brief Splits the container into two independent parts. The left part remains in the container, the right is stored as return type
674 ///
675 ///Order of elements remain unchanged. The SharedVector is not allowed to be shared for
676 ///this to work.
677 WeightedLabeledData splice(std::size_t batch){
678 return WeightedLabeledData(data().splice(batch),weights().splice(batch));
679 }
680
682 swap(static_cast<base_type&>(a),static_cast<base_type&>(b));
683 }
684};
685
686///brief Outstream of elements for weighted labeled data.
687template<class T, class U>
688std::ostream &operator << (std::ostream &stream, const WeightedLabeledData<T, U>& d) {
689 for(auto elem: d.elements())
690 stream << elem.weight <<" ("<< elem.data.label << " [" << elem.data.input<<"] )"<< "\n";
691 return stream;
692}
693
694//Stuff for Dimensionality and querying of basic information
695
696inline std::size_t numberOfClasses(WeightedUnlabeledData<unsigned int> const& labels){
697 return numberOfClasses(labels.data());
698}
699
700///\brief Returns the number of members of each class in the dataset.
701inline std::vector<std::size_t> classSizes(WeightedUnlabeledData<unsigned int> const& labels){
702 return classSizes(labels.data());
703}
704
705///\brief Return the dimensionality of points of a weighted dataset
706template <class InputType>
708 return dataDimension(dataset.data());
709}
710
711///\brief Return the input dimensionality of a weighted labeled dataset.
712template <class InputType, class LabelType>
714 return dataDimension(dataset.inputs());
715}
716
717///\brief Return the label/output dimensionality of a labeled dataset.
718template <class InputType, class LabelType>
720 return dataDimension(dataset.labels());
721}
722///\brief Return the number of classes (highest label value +1) of a classification dataset with unsigned int label encoding
723template <class InputType>
725 return numberOfClasses(dataset.labels());
726}
727
728///\brief Returns the number of members of each class in the dataset.
729template<class InputType, class LabelType>
730inline std::vector<std::size_t> classSizes(WeightedLabeledData<InputType, LabelType> const& dataset){
731 return classSizes(dataset.labels());
732}
733
734///\brief Returns the total sum of weights.
735template<class InputType>
737 double weightSum = 0;
738 for(std::size_t i = 0; i != dataset.numberOfBatches(); ++i){
739 weightSum += sum(dataset.batch(i).weight);
740 }
741 return weightSum;
742}
743///\brief Returns the total sum of weights.
744template<class InputType, class LabelType>
746 double weightSum = 0;
747 for(std::size_t i = 0; i != dataset.numberOfBatches(); ++i){
748 weightSum += sum(dataset.batch(i).weight);
749 }
750 return weightSum;
751}
752
753/// \brief Computes the cumulative weight of every class.
754template<class InputType>
756 RealVector weights(numberOfClasses(dataset),0.0);
757 for(auto const& elem: dataset.elements()){
758 weights(elem.data.label) += elem.weight;
759 }
760 return weights;
761}
762
763//creation of weighted datasets
764
765/// \brief Creates a weighted labeled data object from three ranges, representing inputs, labels and weights
766template<class InputRange,class LabelRange, class WeightRange>
767typename boost::disable_if<
768 boost::is_arithmetic<WeightRange>,
769 WeightedLabeledData<
770 typename boost::range_value<InputRange>::type,
771 typename boost::range_value<LabelRange>::type
772 >
773>::type createLabeledDataFromRange(InputRange const& inputs, LabelRange const& labels, WeightRange const& weights, std::size_t batchSize = 0){
774
775 SHARK_RUNTIME_CHECK(batchSize(inputs) == batchSize(labels),
776 "number of inputs and number of labels must agree");
777 SHARK_RUNTIME_CHECK(batchSize(inputs) == batchSize(weights),
778 "number of data points and number of weights must agree");
779 typedef typename boost::range_value<InputRange>::type InputType;
780 typedef typename boost::range_value<LabelRange>::type LabelType;
781
782 if (batchSize == 0)
784
788 );
789}
790
791/// \brief Creates a bootstrap partition of a labeled dataset and returns it using weighting.
792///
793/// Bootstrapping resamples the dataset by drawing a set of points with
794/// replacement. Thus the sampled set will contain some points multiple times
795/// and some points not at all. Bootstrapping is useful to obtain unbiased
796/// measurements of the mean and variance of an estimator.
797///
798/// Optionally the size of the bootstrap (that is, the number of sampled points)
799/// can be set. By default it is 0, which indicates that it is the same size as the original dataset.
800template<class InputType, class LabelType>
803 std::size_t bootStrapSize = 0
804){
805 if(bootStrapSize == 0)
806 bootStrapSize = dataset.numberOfElements();
807
808 WeightedLabeledData<InputType,LabelType> bootstrapSet(dataset,0.0);
809
810 for(std::size_t i = 0; i != bootStrapSize; ++i){
811 std::size_t index = random::discrete(random::globalRng, std::size_t(0),bootStrapSize-1);
812 bootstrapSet.element(index).weight += 1.0;
813 }
814 bootstrapSet.inputShape() = dataset.inputShape();
815 bootstrapSet.labelShape() = dataset.labelShape();
816 return bootstrapSet;
817}
818
819/// \brief Creates a bootstrap partition of an unlabeled dataset and returns it using weighting.
820///
821/// Bootstrapping resamples the dataset by drawing a set of points with
822/// replacement. Thus the sampled set will contain some points multiple times
823/// and some points not at all. Bootstrapping is useful to obtain unbiased
824/// measurements of the mean and variance of an estimator.
825///
826/// Optionally the size of the bootstrap (that is, the number of sampled points)
827/// can be set. By default it is 0, which indicates that it is the same size as the original dataset.
828template<class InputType>
830 UnlabeledData<InputType> const& dataset,
831 std::size_t bootStrapSize = 0
832){
833 if(bootStrapSize == 0)
834 bootStrapSize = dataset.numberOfElements();
835
836 WeightedUnlabeledData<InputType> bootstrapSet(dataset,0.0);
837
838 for(std::size_t i = 0; i != bootStrapSize; ++i){
839 std::size_t index = random::discrete(random::globalRng, std::size_t(0),bootStrapSize-1);
840 bootstrapSet.element(index).weight += 1.0;
841 }
842 bootstrapSet.shape() = dataset.shape();
843 return bootstrapSet;
844}
845
846}
847
848#endif