Dataset.h
Go to the documentation of this file.
1//===========================================================================
2/*!
3 *
4 *
5 * \brief Data for (un-)supervised learning.
6 *
7 *
8 * \par
9 * This file provides containers for data used by the models, loss
10 * functions, and learning algorithms (trainers). The reason for
11 * dedicated containers of this type is that data often need to be
12 * split into subsets, such as training and test data, or folds in
13 * cross-validation. The containers in this file provide memory
14 * efficient mechanisms for managing and providing such subsets.
15 *
16 *
17 *
18 *
19 * \author O. Krause, T. Glasmachers
20 * \date 2010-2014
21 *
22 *
23 * \par Copyright 1995-2017 Shark Development Team
24 *
25 * <BR><HR>
26 * This file is part of Shark.
27 * <https://shark-ml.github.io/Shark/>
28 *
29 * Shark is free software: you can redistribute it and/or modify
30 * it under the terms of the GNU Lesser General Public License as published
31 * by the Free Software Foundation, either version 3 of the License, or
32 * (at your option) any later version.
33 *
34 * Shark is distributed in the hope that it will be useful,
35 * but WITHOUT ANY WARRANTY; without even the implied warranty of
36 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
37 * GNU Lesser General Public License for more details.
38 *
39 * You should have received a copy of the GNU Lesser General Public License
40 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
41 *
42 */
43//===========================================================================
44
45#ifndef SHARK_DATA_DATASET_H
46#define SHARK_DATA_DATASET_H
47
48#include <boost/range/iterator_range.hpp>
49
51#include <shark/Core/OpenMP.h>
53#include <boost/iterator/transform_iterator.hpp>
54#include <shark/Core/Random.h>
55#include <shark/Core/Shape.h>
56#include "Impl/Dataset.inl"
57
58namespace shark {
59
60
61///
62/// \brief Data container.
63///
64/// The Data class is Shark's container for machine learning data.
65/// This container (and its sub-classes) is used for input data,
66/// labels, and model outputs.
67///
68/// \par
69/// The Data container organizes the data it holds in batches.
70/// This means, that it tries to find a good data representation for a whole
71/// set of, for example 100 data points, at the same time. If the type of data it stores
72/// is for example RealVector, the batches of this type are RealMatrices. This is good because most often
73/// operations on the whole matrix are faster than operations on the separate vectors.
74/// Nearly all operations of the set have to be interpreted in terms of the batch. Therefore the iterator interface will
75/// give access to the batches but not to single elements. For this separate element_iterators and const_element_iterators
76/// can be used.
77///\par
78///When you need to explicitely iterate over all elements, you can use:
79///\code
80/// Data<RealVector> data;
81/// for(auto elem: data.elements()){
82/// std::cout<<*pos<<" ";
83/// auto ref=*pos;
84/// ref*=2;
85/// std::cout<<*pos<<std::endl;
86///}
87///\endcode
88/// \par
89/// Element wise accessing of elements is usually slower than accessing the batches. If possible, use direct batch access, or
90/// at least use the iterator interface or the for loop above to iterate over all elements. Random access to single elements is linear time, so use it wisely.
91/// Of course, when you want to use batches, you need to know the actual batch type. This depends on the actual type of the input.
92/// here are the rules:
93/// if the input is an arithmetic type like int or double, the result will be a vector of this
94/// (i.e. double->RealVector or Int->IntVector).
95/// For vectors the results are matrices as mentioned above. If the vector is sparse, so is the matrix.
96/// And for everything else the batch type is just a std::vector of the type, so no optimization can be applied.
97/// \par
98/// When constructing the container the batchSize can be set. If it is not set by the user the default batchSize is chosen. A BatchSize of 0
99/// corresponds to putting all data into a single batch. Beware that not only the data needs storage but also
100/// the various models during computation. So the actual amount of space to compute a batch can greatly exceed the batch size.
101///
102/// An additional feature of the Data class is that it can be used to create lazy subsets. So the batches of a dataset
103/// can be shared between various instances of the data class without additional memory overhead.
104///
105///
106///\warning Be aware --especially for derived containers like LabeledData-- that the set does not enforce structural consistency.
107/// When you change the structure of the data part for example by directly changing the size of the batches, the size of the labels is not
108/// enforced to change accordingly. Also when creating subsets of a set changing the parent will change it's siblings and conversely. The programmer
109/// needs to ensure structural integrity!
110/// For example this is dangerous:
111/// \code
112/// void function(Data<unsigned int>& data){
113/// Data<unsigned int> newData(...);
114/// data=newData;
115/// }
116/// \endcode
117/// When data was originally a labeledData object, and newData has a different batch structure than data, this will lead to structural inconsistencies.
118/// When function is rewritten such that newData has the same structure as data, this code is perfectly fine. The best way to get around this problem is
119/// by rewriting the code as:
120/// \code
121/// Data<unsigned int> function(){
122/// Data<unsigned int> newData(...);
123/// return newData;
124/// }
125/// \endcode
126///\todo expand docu
127template <class Type>
128class Data : public ISerializable
129{
130protected:
131 typedef detail::SharedContainer<Type> Container;
132
134 Shape m_shape;///< shape of a datapoint
135public:
136 /// \brief Defines the default batch size of the Container.
137 ///
138 /// Zero means: unlimited
139 BOOST_STATIC_CONSTANT(std::size_t, DefaultBatchSize = 256);
140
141 typedef typename Container::BatchType batch_type;
142 typedef batch_type& batch_reference;
143 typedef batch_type const& const_batch_reference;
144
145 typedef Type element_type;
148
149 typedef std::vector<std::size_t> IndexSet;
150
151 /// \brief Two containers compare equal if they share the same data.
152 template <class T> bool operator == (const Data<T>& rhs) {
153 return (m_data == rhs.m_data);
154 }
155
156 /// \brief Two containers compare unequal if they don't share the same data.
157 template <class T> bool operator != (const Data<T>& rhs) {
158 return (! (*this == rhs));
159 }
160
161 template <class InputT, class LabelT> friend class LabeledData;
162
163 // RANGES
164 typedef boost::iterator_range< detail::DataElementIterator<Data<Type> > > element_range;
165 typedef boost::iterator_range< detail::DataElementIterator<Data<Type> const> > const_element_range;
166 typedef detail::BatchRange<Data<Type> > batch_range;
167 typedef detail::BatchRange<Data<Type> const> const_batch_range;
168
169
170 ///\brief Returns the range of elements.
171 ///
172 ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
173 ///element access via begin()/end() in which case data.elements() provides the correct interface
175 return const_element_range(
176 detail::DataElementIterator<Data<Type> const>(this,0,0,0),
177 detail::DataElementIterator<Data<Type> const>(this,numberOfBatches(),0,numberOfElements())
178 );
179 }
180 ///\brief Returns therange of elements.
181 ///
182 ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
183 ///element access via begin()/end() in which case data.elements() provides the correct interface
185 return element_range(
186 detail::DataElementIterator<Data<Type> >(this,0,0,0),
187 detail::DataElementIterator<Data<Type> >(this,numberOfBatches(),0,numberOfElements())
188 );
189 }
190
191 ///\brief Returns the range of batches.
192 ///
193 ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
194 ///element access via begin()/end() in which case data.elements() provides the correct interface
196 return const_batch_range(this);
197 }
198 ///\brief Returns the range of batches.
199 ///
200 ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
201 ///element access via begin()/end() in which case data.elements() provides the correct interface
203 return batch_range(this);
204 }
205
206 ///\brief Returns the number of batches of the set.
207 std::size_t numberOfBatches() const{
208 return m_data.size();
209 }
210 ///\brief Returns the total number of elements.
211 std::size_t numberOfElements() const{
212 return m_data.numberOfElements();
213 }
214
215
216 ///\brief Returns the shape of the elements in the dataset.
217 Shape const& shape() const{
218 return m_shape;
219 }
220
221 ///\brief Returns the shape of the elements in the dataset.
223 return m_shape;
224 }
225
226 ///\brief Check whether the set is empty.
227 bool empty() const{
228 return m_data.empty();
229 }
230
231 // ELEMENT ACCESS
233 return *(detail::DataElementIterator<Data<Type> >(this,0,0,0)+i);
234 }
235 const_element_reference element(std::size_t i) const{
236 return *(detail::DataElementIterator<Data<Type> const>(this,0,0,0)+i);
237 }
238
239 // BATCH ACCESS
240 batch_reference batch(std::size_t i){
241 return *(m_data.begin()+i);
242 }
243 const_batch_reference batch(std::size_t i) const{
244 return *(m_data.begin()+i);
245 }
246
247 // CONSTRUCTORS
248
249 ///\brief Constructor which constructs an empty set
250 Data(){ }
251
252 ///\brief Construct a dataset with empty batches.
253 explicit Data(std::size_t numBatches) : m_data( numBatches )
254 { }
255
256 ///\brief Construction with size and a single element
257 ///
258 /// Optionally the desired batch Size can be set
259 ///
260 ///@param size the new size of the container
261 ///@param element the blueprint element from which to create the Container
262 ///@param batchSize the size of the batches. if this is 0, the size is unlimited
263 explicit Data(std::size_t size, element_type const& element, std::size_t batchSize = DefaultBatchSize)
264 : m_data(size,element,batchSize)
265 { }
266
267 // MISC
268
269 void read(InArchive& archive){
270 archive >> m_data;
271 archive >> m_shape;
272 }
273
274 void write(OutArchive& archive) const{
275 archive << m_data;
276 archive << m_shape;
277 }
278 ///\brief This method makes the vector independent of all siblings and parents.
279 virtual void makeIndependent(){
280 m_data.makeIndependent();
281 }
282
283
284 // METHODS TO ALTER BATCH STRUCTURE
285
286 void splitBatch(std::size_t batch, std::size_t elementIndex){
287 m_data.splitBatch(m_data.begin()+batch,elementIndex);
288 }
289
290 ///\brief Splits the container into two independent parts. The front part remains in the container, the back part is returned.
291 ///
292 ///Order of elements remain unchanged. The SharedVector is not allowed to be shared for
293 ///this to work.
294 Data splice(std::size_t batch){
295 Data right;
296 right.m_data = m_data.splice(m_data.begin()+batch);
297 right.m_shape = m_shape;
298 return right;
299 }
300
301 /// \brief Appends the contents of another data object to the end
302 ///
303 /// The batches are not copied but now referenced from both datasets. Thus changing the appended
304 /// dataset might change this one as well.
305 void append(Data const& other){
306 m_data.append(other.m_data);
307 }
308
310 m_data.push_back(batch);
311 }
312
313 ///\brief Reorders the batch structure in the container to that indicated by the batchSizes vector
314 ///
315 ///After the operation the container will contain batchSizes.size() batchs with the i-th batch having size batchSize[i].
316 ///However the sum of all batch sizes must be equal to the current number of elements
317 template<class Range>
318 void repartition(Range const& batchSizes){
319 m_data.repartition(batchSizes);
320 }
321
322 /// \brief Creates a vector with the batch sizes of every batch.
323 ///
324 /// This method can be used together with repartition to ensure
325 /// that two datasets have the same batch structure.
326 std::vector<std::size_t> getPartitioning()const{
327 return m_data.getPartitioning();
328 }
329
330
331 /// \brief Reorders elements across batches
332 ///
333 /// Takes a vector of indices so that the ith element is moved to index[i].
334 /// This will create a temporary copy of the dataset and thus requires a double amount of memory compared to the original dataset
335 /// during construction.
336 template<class Range>
337 void reorderElements(Range const& indices){
338 Data dataCopy(numberOfBatches());
339 dataCopy.shape() = shape();
340
341 std::vector<Type> batch_elements;
342 auto indexPos = indices.begin();
343 auto elemBegin = elements().begin();
344 for(std::size_t b = 0; b != numberOfBatches(); ++b){
345 std::size_t numElements = batchSize(batch(b));
346 batch_elements.clear();
347 for(std::size_t i = 0; i != numElements; ++i,++indexPos){
348 batch_elements.push_back(*(elemBegin+*indexPos));
349 }
350 dataCopy.batch(b) = createBatch<Type>(batch_elements);
351 }
352 *this = dataCopy;
353 }
354
355 // SUBSETS
356
357 ///\brief Fill in the subset defined by the list of indices as well as its complement.
358 void indexedSubset(IndexSet const& indices, Data& subset, Data& complement) const{
359 IndexSet comp;
360 detail::complement(indices,m_data.size(),comp);
361 subset.m_data=Container(m_data,indices);
362 complement.m_data=Container(m_data,comp);
363 }
364
365 Data indexedSubset(IndexSet const& indices) const{
366 Data subset;
367 subset.m_data = Container(m_data,indices);
368 subset.m_shape = m_shape;
369 return subset;
370 }
371
372 friend void swap(Data& a, Data& b){
373 swap(a.m_data,b.m_data);
374 std::swap(a.m_shape,b.m_shape);
375 }
376};
377
378/**
379 * \ingroup shark_globals
380 * @{
381 */
382
383/// Outstream of elements.
384template<class T>
385std::ostream &operator << (std::ostream &stream, const Data<T>& d) {
386 for(auto elem:d.elements())
387 stream << elem << "\n";
388 return stream;
389}
390/** @} */
391
392/// \brief Data set for unsupervised learning.
393///
394/// The UnlabeledData class is basically a standard Data container
395/// with the special interpretation of its data point being
396/// "inputs" to a learning algorithm.
397template <class InputT>
398class UnlabeledData : public Data<InputT>
399{
400public:
401 typedef InputT element_type;
403 typedef element_type InputType;
404 typedef detail::SharedContainer<InputT> InputContainer;
405
406protected:
408public:
409
410 ///\brief Constructor.
412 { }
413
414 ///\brief Construction from data.
416 : base_type(points)
417 { }
418
419 ///\brief Construction with size and a single element
420 ///
421 /// Optionally the desired batch Size can be set
422 ///
423 ///@param size the new size of the container
424 ///@param element the blueprint element from which to create the Container
425 ///@param batchSize the size of the batches. if this is 0, the size is unlimited
426 UnlabeledData(std::size_t size, element_type const& element, std::size_t batchSize = base_type::DefaultBatchSize)
428 { }
429
430 ///\brief Create an empty set with just the correct number of batches.
431 ///
432 /// The user must initialize the dataset after that by himself.
433 UnlabeledData(std::size_t numBatches)
434 : base_type(numBatches)
435 { }
436
437 ///\brief Construct a dataset with different batch sizes. it is a copy of the other dataset
438 UnlabeledData(UnlabeledData const& container, std::vector<std::size_t> batchSizes)
439 :base_type(container,batchSizes){}
440
441 /// \brief we allow assignment from Data.
443 static_cast<Data<InputT>& >(*this) = data;
444 return *this;
445 }
446
447 ///\brief Access to the base_type class as "inputs".
448 ///
449 /// Added for consistency with the LabeledData::labels() method.
451 return *this;
452 }
453
454 ///\brief Access to the base_type class as "inputs".
455 ///
456 /// Added for consistency with the LabeledData::labels() method.
457 UnlabeledData const& inputs() const{
458 return *this;
459 }
460
461 ///\brief Splits the container in two independent parts. The left part remains in the container, the right is stored as return type
462 ///
463 ///Order of elements remain unchanged. The SharedVector is not allowed to be shared for
464 ///this to work.
466 UnlabeledData right;
467 right.m_data = m_data.splice(m_data.begin()+batch);
468 right.m_shape = this->m_shape;
469 return right;
470 }
471
472 ///\brief shuffles all elements in the entire dataset (that is, also across the batches)
473 void shuffle(){
474 std::vector<std::size_t> indices(this->numberOfElements());
475 std::iota(indices.begin(),indices.end(),0);
476 std::shuffle(indices.begin(),indices.end(), random::globalRng);
477 this->reorderElements(indices);
478 }
479};
480
481///
482/// \brief Data set for supervised learning.
483///
484/// The LabeledData class extends UnlabeledData for the
485/// representation of inputs. In addition it holds and
486/// provides access to the corresponding labels.
487///
488/// LabeledData tries to mimic the underlying data as pairs of input and label data.
489/// this means that when accessing a batch by calling batch(i) or choosing one of the iterators
490/// one access the input batch by batch(i).input and the labels by batch(i).label
491///
492///this also holds true for single element access using operator(). Be aware, that direct access to an element is
493///a linear time operation. So it is not advisable to iterate over the elements, but instead iterate over the batches.
494template <class InputT, class LabelT>
496{
497public:
498 typedef InputT InputType;
499 typedef LabelT LabelType;
503
504 static const std::size_t DefaultBatchSize = InputContainer::DefaultBatchSize;
505
506 // TYPEDEFS FOR PAIRS
507 typedef InputLabelBatch<
508 typename Batch<InputType>::type,
510 > batch_type;
511
512 typedef InputLabelPair<InputType,LabelType> element_type;
513
514 // TYPEDEFS FOR REFERENCES
515 typedef InputLabelBatch<
516 typename Batch<InputType>::type&,
517 typename Batch<LabelType>::type&
519 typedef InputLabelBatch<
520 typename Batch<InputType>::type const&,
521 typename Batch<LabelType>::type const&
523
524 typedef typename batch_reference::reference element_reference;
525 typedef typename const_batch_reference::const_reference const_element_reference;
526
527 typedef boost::iterator_range< detail::DataElementIterator<LabeledData<InputType,LabelType> > > element_range;
528 typedef boost::iterator_range< detail::DataElementIterator<LabeledData<InputType,LabelType> const> > const_element_range;
529 typedef detail::BatchRange<LabeledData<InputType,LabelType> > batch_range;
530 typedef detail::BatchRange<LabeledData<InputType,LabelType> const> const_batch_range;
531
532
533 ///\brief Returns the range of elements.
534 ///
535 ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
536 ///element access via begin()/end() in which case data.elements() provides the correct interface
538 return const_element_range(
539 detail::DataElementIterator<LabeledData<InputType,LabelType> const>(this,0,0,0),
540 detail::DataElementIterator<LabeledData<InputType,LabelType> const>(this,numberOfBatches(),0,numberOfElements())
541 );
542 }
543 ///\brief Returns therange of elements.
544 ///
545 ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
546 ///element access via begin()/end() in which case data.elements() provides the correct interface
548 return element_range(
549 detail::DataElementIterator<LabeledData<InputType,LabelType> >(this,0,0,0),
550 detail::DataElementIterator<LabeledData<InputType,LabelType> >(this,numberOfBatches(),0,numberOfElements())
551 );
552 }
553
554 ///\brief Returns the range of batches.
555 ///
556 ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
557 ///element access via begin()/end() in which case data.elements() provides the correct interface
559 return const_batch_range(this);
560 }
561 ///\brief Returns the range of batches.
562 ///
563 ///It is compatible to boost::range and STL and can be used whenever an algorithm requires
564 ///element access via begin()/end() in which case data.elements() provides the correct interface
566 return batch_range(this);
567 }
568
569 ///\brief Returns the number of batches of the set.
570 std::size_t numberOfBatches() const{
571 return m_data.numberOfBatches();
572 }
573 ///\brief Returns the total number of elements.
574 std::size_t numberOfElements() const{
575 return m_data.numberOfElements();
576 }
577
578 ///\brief Check whether the set is empty.
579 bool empty() const{
580 return m_data.empty();
581 }
582
583 ///\brief Access to inputs as a separate container.
584 InputContainer const& inputs() const{
585 return m_data;
586 }
587 ///\brief Access to inputs as a separate container.
589 return m_data;
590 }
591
592 ///\brief Access to labels as a separate container.
593 LabelContainer const& labels() const{
594 return m_label;
595 }
596 ///\brief Access to labels as a separate container.
598 return m_label;
599 }
600
601 // CONSTRUCTORS
602
603 ///\brief Empty data set.
605 {}
606
607 ///\brief Create an empty set with just the correct number of batches.
608 ///
609 /// The user must initialize the dataset after that by himself.
610 LabeledData(std::size_t numBatches)
611 : m_data(numBatches),m_label(numBatches)
612 {}
613
614 ///
615 /// Optionally the desired batch Size can be set
616 ///
617 ///@param size the new size of the container
618 ///@param element the blueprint element from which to create the Container
619 ///@param batchSize the size of the batches. if this is 0, the size is unlimited
620 LabeledData(std::size_t size, element_type const& element, std::size_t batchSize = DefaultBatchSize)
621 : m_data(size,element.input,batchSize),
622 m_label(size,element.label,batchSize)
623 {}
624
625 ///\brief Construction from data.
626 ///
627 /// Beware that when calling this constructor the organization of batches must be equal in both
628 /// containers. This Constructor will not split the data!
631 {
632 SHARK_RUNTIME_CHECK(inputs.numberOfElements() == labels.numberOfElements(), "number of inputs and number of labels must agree");
633#ifndef DNDEBUG
634 for(std::size_t i = 0; i != inputs.numberOfBatches(); ++i){
636 }
637#endif
638 }
639 // ELEMENT ACCESS
641 return *(detail::DataElementIterator<LabeledData<InputType,LabelType> >(this,0,0,0)+i);
642 }
643 const_element_reference element(std::size_t i) const{
644 return *(detail::DataElementIterator<LabeledData<InputType,LabelType> const>(this,0,0,0)+i);
645 }
646
647 // BATCH ACCESS
648 batch_reference batch(std::size_t i){
650 }
651 const_batch_reference batch(std::size_t i) const{
653 }
654
655 ///\brief Returns the Shape of the inputs.
656 Shape const& inputShape() const{
657 return m_data.shape();
658 }
659
660 ///\brief Returns the Shape of the inputs.
662 return m_data.shape();
663 }
664
665 ///\brief Returns the Shape of the labels.
666 Shape const& labelShape() const{
667 return m_label.shape();
668 }
669
670 ///\brief Returns the Shape of the labels.
672 return m_label.shape();
673 }
674
675 // MISC
676
677 /// from ISerializable
678 void read(InArchive& archive){
679 archive & m_data;
680 archive & m_label;
681 }
682
683 /// from ISerializable
684 void write(OutArchive& archive) const{
685 archive & m_data;
686 archive & m_label;
687 }
688
689 ///\brief This method makes the vector independent of all siblings and parents.
690 virtual void makeIndependent(){
693 }
694
695 void splitBatch(std::size_t batch, std::size_t elementIndex){
696 m_data.splitBatch(batch,elementIndex);
697 m_label.splitBatch(batch,elementIndex);
698 }
699
700 ///\brief Splits the container into two independent parts. The left part remains in the container, the right is stored as return type
701 ///
702 ///Order of elements remain unchanged. The SharedVector is not allowed to be shared for
703 ///this to work.
707
708 /// \brief Appends the contents of another data object to the end
709 ///
710 /// The batches are not copied but now referenced from both datasets. Thus changing the appended
711 /// dataset might change this one as well.
712 void append(LabeledData const& other){
713 m_data.append(other.m_data);
714 m_label.append(other.m_label);
715 }
716
718 typename Batch<InputType>::type const& inputs,
719 typename Batch<LabelType>::type const& labels
720 ){
723 }
724
727 ){
728 push_back(batch.input,batch.label);
729 }
730
731
732 ///\brief Reorders the batch structure in the container to that indicated by the batchSizes vector
733 ///
734 ///After the operation the container will contain batchSizes.size() batches with the i-th batch having size batchSize[i].
735 ///However the sum of all batch sizes must be equal to the current number of elements
736 template<class Range>
737 void repartition(Range const& batchSizes){
738 m_data.repartition(batchSizes);
739 m_label.repartition(batchSizes);
740 }
741
742 /// \brief Creates a vector with the batch sizes of every batch.
743 ///
744 /// This method can be used together with repartition to ensure
745 /// that two datasets have the same batch structure.
746 std::vector<std::size_t> getPartitioning()const{
747 return m_data.getPartitioning();
748 }
749
750 friend void swap(LabeledData& a, LabeledData& b){
751 swap(a.m_data,b.m_data);
752 swap(a.m_label,b.m_label);
753 }
754
755 template<class Range>
756 void reorderElements(Range const& indices){
757 m_data.reorderElements(indices);
758 m_label.reorderElements(indices);
759 }
760
761 ///\brief shuffles all elements in the entire dataset (that is, also across the batches)
762 void shuffle(){
763 std::vector<std::size_t> indices(numberOfElements());
764 std::iota(indices.begin(),indices.end(),0);
765 std::shuffle(indices.begin(),indices.end(), random::globalRng);
766 reorderElements(indices);
767 }
768
769
770 // SUBSETS
771
772 ///\brief Fill in the subset defined by the list of indices.
773 LabeledData indexedSubset(IndexSet const& indices) const{
774 return LabeledData(m_data.indexedSubset(indices),m_label.indexedSubset(indices));
775 }
776protected:
777 InputContainer m_data; /// point data
778 LabelContainer m_label; /// label data
779};
780
781/// specialized template for classification with unsigned int labels
783
784/// specialized template for regression with RealVector labels
786
787/// specialized template for classification with unsigned int labels and sparse data
789
790template<class Functor, class T>
794
795namespace detail{
796template<class T>
797struct InferShape{
798 static Shape infer(T const&){return {};}
799};
800
801template<class T>
802struct InferShape<Data<blas::vector<T> > >{
803 static Shape infer(Data<blas::vector<T> > const& f){
804 return {f.element(0).size()};
805 }
806};
807
808template<class T>
809struct InferShape<Data<blas::compressed_vector<T> > >{
810 static Shape infer(Data<blas::compressed_vector<T> > const& f){
811 return {f.element(0).size()};
812 }
813};
814
815}
816
817/**
818 * \addtogroup shark_globals
819 * @{
820 */
821
822/// \brief creates a data object from a range of elements
823template<class Range>
824Data<typename Range::value_type>
825createDataFromRange(Range const& inputs, std::size_t maximumBatchSize = 0){
826 typedef typename Range::value_type Input;
827
828 if (maximumBatchSize == 0)
829 maximumBatchSize = Data<Input>::DefaultBatchSize;
830
831 std::size_t numPoints = inputs.size();
832 //first determine the optimal number of batches as well as batch size
833 std::size_t batches = numPoints / maximumBatchSize;
834 if(numPoints > batches*maximumBatchSize)
835 ++batches;
836 std::size_t optimalBatchSize=numPoints/batches;
837 std::size_t remainder = numPoints-batches*optimalBatchSize;
838 Data<Input> data(batches);
839
840 //now create the batches taking the remainder into account
841 auto start= inputs.begin();
842 for(std::size_t i = 0; i != batches; ++i){
843 std::size_t size = (i<remainder)?optimalBatchSize+1:optimalBatchSize;
844 auto end = start+size;
845 data.batch(i) = createBatch<Input>(
846 boost::make_iterator_range(start,end)
847 );
848 start = end;
849 }
850 data.shape() = detail::InferShape<Data<Input> >::infer(data);
851 return data;
852}
853
854/// \brief creates a data object from a range of elements
855template<class Range>
856UnlabeledData<typename boost::range_value<Range>::type>
857createUnlabeledDataFromRange(Range const& inputs, std::size_t maximumBatchSize = 0){
858 return createDataFromRange(inputs,maximumBatchSize);
859}
860/// \brief creates a labeled data object from two ranges, representing inputs and labels
861template<class Range1, class Range2>
862LabeledData<
863 typename boost::range_value<Range1>::type,
864 typename boost::range_value<Range2>::type
865
866> createLabeledDataFromRange(Range1 const& inputs, Range2 const& labels, std::size_t maximumBatchSize = 0){
867 SHARK_RUNTIME_CHECK(inputs.size() == labels.size(),"Number of inputs and number of labels must agree");
868 typedef typename boost::range_value<Range1>::type Input;
869 typedef typename boost::range_value<Range2>::type Label;
870
871 if (maximumBatchSize == 0)
873
875 createDataFromRange(inputs,maximumBatchSize),
876 createDataFromRange(labels,maximumBatchSize)
877 );
878}
879
880///brief Outstream of elements for labeled data.
881template<class T, class U>
882std::ostream &operator << (std::ostream &stream, const LabeledData<T, U>& d) {
883 for(auto elem: d.elements())
884 stream << elem.input << " [" << elem.label <<"]"<< "\n";
885 return stream;
886}
887
888
889// FUNCTIONS FOR DIMENSIONALITY
890
891
892///\brief Return the number of classes of a set of class labels with unsigned int label encoding
893inline unsigned int numberOfClasses(Data<unsigned int> const& labels){
894 unsigned int classes = 0;
895 for(std::size_t i = 0; i != labels.numberOfBatches(); ++i){
896 classes = std::max(classes,*std::max_element(labels.batch(i).begin(),labels.batch(i).end()));
897 }
898 return classes+1;
899}
900
901///\brief Returns the number of members of each class in the dataset.
902inline std::vector<std::size_t> classSizes(Data<unsigned int> const& labels){
903 std::vector<std::size_t> classCounts(numberOfClasses(labels),0u);
904 for(std::size_t i = 0; i != labels.numberOfBatches(); ++i){
905 for(unsigned int elem: labels.batch(i)){
906 classCounts[elem]++;
907 }
908 }
909 return classCounts;
910}
911
912///\brief Return the dimensionality of a dataset.
913template <class InputType>
914std::size_t dataDimension(Data<InputType> const& dataset){
915 SHARK_ASSERT(dataset.numberOfElements() > 0);
916 return dataset.element(0).size();
917}
918
919///\brief Return the input dimensionality of a labeled dataset.
920template <class InputType, class LabelType>
922 return dataDimension(dataset.inputs());
923}
924
925///\brief Return the label/output dimensionality of a labeled dataset.
926template <class InputType, class LabelType>
928 return dataDimension(dataset.labels());
929}
930///\brief Return the number of classes (highest label value +1) of a classification dataset with unsigned int label encoding
931template <class InputType>
933 return numberOfClasses(dataset.labels());
934}
935///\brief Returns the number of members of each class in the dataset.
936template<class InputType, class LabelType>
937inline std::vector<std::size_t> classSizes(LabeledData<InputType, LabelType> const& dataset){
938 return classSizes(dataset.labels());
939}
940
941// TRANSFORMATION
942///\brief Transforms a dataset using a Functor f and returns the transformed result.
943///
944/// this version is used, when the Functor supports only element-by-element transformations
945template<class T,class Functor>
946typename boost::lazy_disable_if<
947 CanBeCalled<Functor,typename Data<T>::batch_type>,
948 TransformedData<Functor,T>
949>::type
950transform(Data<T> const& data, Functor f){
951 typedef typename detail::TransformedDataElement<Functor,T>::type ResultType;
952 int batches = (int) data.numberOfBatches();
953 Data<ResultType> result(batches);
954 SHARK_PARALLEL_FOR(int i = 0; i < batches; ++i)
955 result.batch(i)= createBatch<ResultType>(
956 boost::make_transform_iterator(batchBegin(data.batch(i)), f),
957 boost::make_transform_iterator(batchEnd(data.batch(i)), f)
958 );
959 result.shape() = detail::InferShape<Data<ResultType> >::infer(result);
960 return result;
961}
962
963///\brief Transforms a dataset using a Functor f and returns the transformed result.
964///
965/// this version is used, when the Functor supports batch-by-batch transformations
966template<class T,class Functor>
967typename boost::lazy_enable_if<
968 CanBeCalled<Functor,typename Data<T>::batch_type>,
969 TransformedData<Functor,T>
970>::type
971transform(Data<T> const& data, Functor const& f){
972 typedef typename detail::TransformedDataElement<Functor,T>::type ResultType;
973 int batches = (int) data.numberOfBatches();
974 Data<ResultType> result(batches);
975 SHARK_PARALLEL_FOR(int i = 0; i < batches; ++i)
976 result.batch(i)= f(data.batch(i));
977 Shape shape = detail::InferShape<Functor>::infer(f);
978 if(shape == Shape()){
979 shape = detail::InferShape<Data<ResultType> >::infer(result);
980 }
981 result.shape() = shape;
982 return result;
983}
984
985///\brief Transforms the inputs of a dataset and return the transformed result.
986template<class I,class L, class Functor>
987LabeledData<typename detail::TransformedDataElement<Functor,I >::type, L >
988transformInputs(LabeledData<I,L> const& data, Functor const& f){
990 return DatasetType(transform(data.inputs(),f),data.labels());
991}
992///\brief Transforms the labels of a dataset and returns the transformed result.
993template<class I,class L, class Functor>
994LabeledData<I,typename detail::TransformedDataElement<Functor,L >::type >
995transformLabels(LabeledData<I,L> const& data, Functor const& f){
997 return DatasetType(data.inputs(),transform(data.labels(),f));
998}
999
1000///\brief Creates a copy of a dataset selecting only a certain set of features.
1001template<class T, class FeatureSet>
1002Data<blas::vector<T> > selectFeatures(Data<blas::vector<T> > const& data,FeatureSet const& features){
1003 auto select = [&](blas::matrix<T> const& input){
1004 blas::matrix<T> output(input.size1(),features.size());
1005 for(std::size_t i = 0; i != input.size1(); ++i){
1006 for(std::size_t j = 0; j != features.size(); ++j){
1007 output(i,j) = input(i,features[j]);
1008 }
1009 }
1010 return output;
1011 };
1012 return transform(data,select);
1013}
1014
1015template<class T, class FeatureSet>
1017 return LabeledData<RealVector,T>(selectFeatures(data.inputs(),features), data.labels());
1018}
1019
1020
1021
1022/// \brief Removes the last part of a given dataset and returns a new split containing the removed elements
1023///
1024/// For this operation, the dataset is not allowed to be shared.
1025/// \brief data The dataset which should be splited
1026/// \brief index the first element to be split
1027/// \returns the set which contains the splitd element (right part of the given set)
1028template<class DatasetT>
1029DatasetT splitAtElement(DatasetT& data, std::size_t elementIndex){
1030 SIZE_CHECK(elementIndex<=data.numberOfElements());
1031
1032 std::size_t batchPos = 0;
1033 std::size_t batchStart = 0;
1034 while(batchStart + batchSize(data.batch(batchPos)) < elementIndex){
1035 batchStart += batchSize(data.batch(batchPos));
1036 ++batchPos;
1037 };
1038 std::size_t splitPoint = elementIndex-batchStart;
1039 if(splitPoint != 0){
1040 data.splitBatch(batchPos,splitPoint);
1041 ++batchPos;
1042 }
1043
1044 return data.splice(batchPos);
1045}
1046
1047
1048///\brief reorders the dataset such, that points are grouped by labels
1049///
1050/// The elements are not only reordered but the batches are also resized such, that every batch
1051/// only contains elements of one class. This method must be used in order to use binarySubproblem.
1052template<class I>
1054 std::vector<std::size_t > classCounts = classSizes(data);
1055 std::vector<std::size_t > partitioning;//new, optimal partitioning of the data according to the batch sizes
1056 std::vector<std::size_t > classStart;//at which batch the elements of the class are starting
1057 detail::batchPartitioning(classCounts, classStart, partitioning, batchSize);
1058
1059 data.repartition(partitioning);
1060
1061 std::vector<std::size_t> classIndex(classCounts.size(),0);
1062 for(std::size_t i = 1; i != classIndex.size();++i){
1063 classIndex[i] = classIndex[i-1] + classCounts[i-1];
1064 }
1065 std::vector<std::size_t> elemIndex(data.numberOfElements(), 0);
1066 std::size_t index = 0;
1067 for (auto const& elem: data.elements()){
1068 std::size_t c = elem.label;
1069 elemIndex[classIndex[c] ] = index;
1070 ++index;
1071 ++classIndex[c];
1072 }
1073 data.reorderElements(elemIndex);
1074}
1075
1076template<class I>
1078 LabeledData<I,unsigned int>const& data,
1079 unsigned int zeroClass,
1080 unsigned int oneClass
1081){
1082 std::vector<std::size_t> indexSet;
1083 std::size_t smaller = std::min(zeroClass,oneClass);
1084 std::size_t bigger = std::max(zeroClass,oneClass);
1085 std::size_t numBatches = data.numberOfBatches();
1086
1087 //find first class
1088 std::size_t start= 0;
1089 for(;start != numBatches && getBatchElement(data.batch(start),0).label != smaller;++start);
1090 SHARK_RUNTIME_CHECK(start != numBatches, "First class does not exist");
1091
1092 //copy batch indices of first class
1093 for(;start != numBatches && getBatchElement(data.batch(start),0).label == smaller; ++start)
1094 indexSet.push_back(start);
1095
1096 //find second class
1097
1098 for(;start != numBatches && getBatchElement(data.batch(start),0).label != bigger;++start);
1099 SHARK_RUNTIME_CHECK(start != numBatches, "Second class does not exist");
1100
1101 //copy batch indices of second class
1102 for(;start != numBatches && getBatchElement(data.batch(start),0).label == bigger; ++start)
1103 indexSet.push_back(start);
1104
1105 return transformLabels(data.indexedSubset(indexSet), [=](unsigned int label){return (unsigned int)(label == oneClass);});
1106}
1107
1108/// \brief Construct a binary (two-class) one-versus-rest problem from a multi-class problem.
1109///
1110/// \par
1111/// The function returns a new LabeledData object. The input part
1112/// coincides with the multi-class data, but the label part is replaced
1113/// with binary labels 0 and 1. All instances of the given class
1114/// (parameter oneClass) get a label of one, all others are assigned a
1115/// label of zero.
1116template<class I>
1118 LabeledData<I,unsigned int>const& data,
1119 unsigned int oneClass
1120){
1121 return transformLabels(data, [=](unsigned int label){return (unsigned int)(label == oneClass);});
1122}
1123
1124template <typename RowType>
1125RowType getColumn(Data<RowType> const& data, std::size_t columnID) {
1126 SHARK_ASSERT(dataDimension(data) > columnID);
1127 RowType column(data.numberOfElements());
1128 std::size_t rowCounter = 0;
1129 for(auto element: data.elements()){
1130 column(rowCounter) = element(columnID);
1131 rowCounter++;
1132 }
1133 return column;
1134}
1135
1136template <typename RowType>
1137void setColumn(Data<RowType>& data, std::size_t columnID, RowType newColumn) {
1138 SHARK_ASSERT(dataDimension(data) > columnID);
1139 SHARK_ASSERT(data.numberOfElements() == newColumn.size());
1140 std::size_t rowCounter = 0;
1141 for(auto element: data.elements()){
1142 element(columnID) = newColumn(rowCounter);
1143 rowCounter++;
1144 }
1145}
1146
1147/** @*/
1148}
1149
1150#endif