DataView.h
Go to the documentation of this file.
1//===========================================================================
2/*!
3 *
4 *
5 * \brief Fast lookup for elements in constant datasets
6 *
7 *
8 *
9 *
10 *
11 * \author O. Krause
12 * \date 2012
13 *
14 *
15 * \par Copyright 1995-2017 Shark Development Team
16 *
17 * <BR><HR>
18 * This file is part of Shark.
19 * <https://shark-ml.github.io/Shark/>
20 *
21 * Shark is free software: you can redistribute it and/or modify
22 * it under the terms of the GNU Lesser General Public License as published
23 * by the Free Software Foundation, either version 3 of the License, or
24 * (at your option) any later version.
25 *
26 * Shark is distributed in the hope that it will be useful,
27 * but WITHOUT ANY WARRANTY; without even the implied warranty of
28 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29 * GNU Lesser General Public License for more details.
30 *
31 * You should have received a copy of the GNU Lesser General Public License
32 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
33 *
34 */
35//===========================================================================
36
37
38#ifndef SHARK_DATA_DATAVIEW_H
39#define SHARK_DATA_DATAVIEW_H
40
41#include <shark/Data/Dataset.h>
43#include <numeric>
44namespace shark {
45/// \brief Constant time Element-Lookup for Datasets
46///
47/// Datasets are fast for random lookup of batches. Since batch sizes can be arbitrary structured and
48/// changed by the user, there is no way for the Data and LabeledData classes to provide fast random access
49/// to single elements. Still, this property is needed quite often, for example for creating subsets,
50/// randomize data or tree structures.
51/// A View stores the position of every element in a dataset. So it has constant time access to the elements but
52/// it also requires linear memory in the number of elements in the set. This is typically small compared
53/// to the size of the set itself, but construction imposes an considerable overhead.
54///
55/// In contrast to (Un)LabeledData, which is centered around batches, the View is centered around single elements,
56/// so its iterators iterate over the elements.
57/// For a better support for bagging an index method is added which returns the position of the element in the
58/// underlying data container. Also the iterators are indexed and return this index.
59template <class DatasetType> //template parameter can be const!
61{
62public:
63 typedef typename std::remove_const<DatasetType>::type dataset_type; //(non const) type of the underlying dataset
64 typedef typename dataset_type::element_type value_type;
65 typedef typename dataset_type::const_element_reference const_reference;
66 typedef typename dataset_type::batch_type batch_type;
67 // We want to support immutable as well as mutable datasets. So we query whether the dataset
68 // is mutable and change the reference type to const if the dataset is immutable.
69 typedef typename boost::mpl::if_<
70 std::is_const<DatasetType>,
71 typename dataset_type::const_element_reference,
72 typename dataset_type::element_reference
73 >::type reference;
74
75private:
76 typedef typename boost::mpl::if_<
77 std::is_const<DatasetType>,
78 typename dataset_type::const_batch_range,
79 typename dataset_type::batch_range
80 >::type batch_range;
81 template<class Reference, class View>
82 class IteratorBase: public SHARK_ITERATOR_FACADE<
83 IteratorBase<Reference,View>,
84 value_type,
85 std::random_access_iterator_tag,
86 Reference
87 >{
88 public:
89 IteratorBase(){}
90
91 IteratorBase(View& view, std::size_t position)
92 : mpe_view(&view),m_position(position) {}
93
94 template<class R,class V>
95 IteratorBase(IteratorBase<R,V> const& other)
96 : mpe_view(other.mpe_view),m_position(other.position){}
97
98 /// \brief returns the position of the element referenced by the iterator inside the dataset
99 ///
100 /// This is usefull for bagging, when identical elements between several susbsets are to be identified
101 std::size_t index()const{
102 return mpe_view->index(m_position);
103 }
104
105 private:
106 friend class SHARK_ITERATOR_CORE_ACCESS;
107 template <class, class> friend class IteratorBase;
108
109 void increment() {
110 ++m_position;
111 }
112 void decrement() {
113 --m_position;
114 }
115
116 void advance(std::ptrdiff_t n){
117 m_position+=n;
118 }
119
120 template<class R,class V>
121 std::ptrdiff_t distance_to(IteratorBase<R,V> const& other) const{
122 return (std::ptrdiff_t)other.m_position - (std::ptrdiff_t)m_position;
123 }
124
125 template<class R,class V>
126 bool equal(IteratorBase<R,V> const& other) const{
127 return m_position == other.m_position;
128 }
129 Reference dereference() const {
130 return (*mpe_view)[m_position];
131 }
132
133 View* mpe_view;
134 std::size_t m_position;
135 };
136public:
137 typedef IteratorBase<reference,DataView<DatasetType> > iterator;
138 typedef IteratorBase<const_reference, DataView<DatasetType> const > const_iterator;
139
141 DataView(DatasetType& dataset)
142 :m_dataset(dataset),m_indices(dataset.numberOfElements())
143 {
144 std::size_t index = 0;
145 for(std::size_t i = 0; i != dataset.numberOfBatches(); ++i){
146 std::size_t batchSize = Batch<value_type>::size(dataset.batch(i));
147 for(std::size_t j = 0; j != batchSize; ++j,++index){
148 m_indices[index].batch = i;
149 m_indices[index].positionInBatch = j;
150 m_indices[index].datasetIndex = index;
151 }
152 }
153 }
154
155 /// create a subset of the dataset type using only the elemnt indexed by indices
156 template<class IndexRange>
157 DataView(DataView<DatasetType> const& view, IndexRange const& indices)
158 :m_dataset(view.m_dataset),m_indices(indices.size())
159 {
160 for(std::size_t i = 0; i != m_indices.size(); ++i)
161 m_indices[i] = view.m_indices[indices[i]];
162 }
163
164 reference operator[](std::size_t position){
165 SIZE_CHECK(position < size());
166 Index const& index = m_indices[position];
167 return getBatchElement(m_dataset.batch(index.batch),index.positionInBatch);
168 }
169 const_reference operator[](std::size_t position) const{
170 SIZE_CHECK(position < size());
171 Index const& index = m_indices[position];
172 return getBatchElement(m_dataset.batch(index.batch),index.positionInBatch);
173 }
174
176 SIZE_CHECK(size() != 0);
177 return (*this)[0];
178 }
180 SIZE_CHECK(size() != 0);
181 return (*this)[0];
182 }
184 SIZE_CHECK(size() != 0);
185 return (*this)[size()-1];
186 }
188 SIZE_CHECK(size() != 0);
189 return (*this)[size()-1];
190 }
191
192 /// \brief Position of the element in the dataset.
193 ///
194 /// This is useful for bagging, when identical elements among
195 /// several subsets are to be identified.
196 std::size_t index(std::size_t position)const{
197 return m_indices[position].datasetIndex;
198 }
199
200 /// \brief Index of the batch holding the element.
201 std::size_t batch(std::size_t position) const {
202 return m_indices[position].batch;
203 }
204
205 /// \brief Index inside the batch holding the element.
206 std::size_t positionInBatch(std::size_t position) const {
207 return m_indices[position].positionInBatch;
208 }
209
210 std::size_t size() const{
211 return m_indices.size();
212 }
213
215 return iterator(*this, 0);
216 }
218 return const_iterator(*this, 0);
219 }
221 return iterator(*this, size());
222 }
224 return const_iterator(*this, size());
225 }
226
227 dataset_type const& dataset()const{
228 return m_dataset;
229 }
230private:
231 dataset_type m_dataset;
232 // Stores for an element of the dataset, at which position of which batch it is located
233 // as well as the real index of the element inside the dataset
234 struct Index{
235 std::size_t batch;//the batch in which the element is located
236 std::size_t positionInBatch;//at which position in the batch it is
237 std::size_t datasetIndex;//index inside the dataset
238 };
239 std::vector<Index> m_indices;//stores for every element of the set it's position inside the dataset
240};
241
242/**
243 * \ingroup shark_globals
244 *
245 * @{
246 */
247
248/// \brief Creates a subset of a DataView with elements indexed by indices
249///
250/// \param view the view for which the subset is to be created
251/// \param indizes the index of the elements to be stored in the view
252template<class DatasetType,class IndexRange>
253DataView<DatasetType> subset(DataView<DatasetType> const& view, IndexRange const& indizes){
254 //O.K. todo: Remove constructor later on, this is a quick fix.
255 return DataView<DatasetType>(view,indizes);
256}
257
258/// \brief creates a random subset of a DataView with given size
259///
260/// \param view the view for which the subset is to be created
261/// \param size the size of the subset
262template<class DatasetType>
264 std::vector<std::size_t> indices(view.size());
265 std::iota(indices.begin(),indices.end(),0);
266 partial_shuffle(indices.begin(),indices.begin()+size,indices.end());
267 return subset(view,boost::make_iterator_range(indices.begin(),indices.begin()+size));
268}
269
270/// \brief Creates a batch given a set of indices
271///
272/// \param view the view from which the batch is to be created
273/// \param indizes the set of indizes defining the batch
274template<class DatasetType,class IndexRange>
275typename DataView<DatasetType>::batch_type subBatch(
276 DataView<DatasetType> const& view,
277 IndexRange const& indizes
278){
279 //create a subset of the view containing the elements of the batch
280 DataView<DatasetType> batchElems = subset(view,indizes);
281
282 //and now use the batch range construction to create it
283 return createBatch(batchElems);
284}
285
286/// \brief Creates a random batch of a given size
287///
288/// \param view the view from which the batch is to be created
289/// \param size the size of the batch
290template<class DatasetType>
291typename DataView<DatasetType>::batch_type randomSubBatch(
292 DataView<DatasetType> const& view,
293 std::size_t size
294){
295 std::vector<std::size_t> indices(view.size());
296 std::iota(indices.begin(),indices.end(),0);
297 partial_shuffle(indices.begin(),indices.begin()+size,indices.end());
298 return subBatch(view,boost::make_iterator_range(indices.begin(),indices.begin()+size));
299}
300
301/// \brief Creates a View from a dataset.
302///
303/// This is just a helper function to omit the actual type of the view
304///
305/// \param set the dataset from which to create the view
306template<class DatasetType>
308 return DataView<DatasetType>(set);
309}
310
311/// \brief Creates a new dataset from a View.
312///
313/// When the elements of a View needs to be processed repeatedly it is often better to use
314/// the packed format of the Dataset again, since then the faster batch processing can be used
315///
316/// \param view the view from which to create the new dataset
317/// \param batchSize the size of the batches in the dataset
318template<class T>
319typename DataView<T>::dataset_type
321 if(view.size() == 0)
322 return typename DataView<T>::dataset_type();
323 //O.K. todo: this is slow for sparse elements, use subBatch or something similar.
324 std::size_t elements = view.size();
325 typename DataView<T>::dataset_type dataset(elements,view[0],batchSize);
326 std::copy(view.begin(),view.end(),dataset.elements().begin());
327 return dataset;
328}
329
330/// Return the number of classes (size of the label vector)
331/// of a classification dataset with RealVector label encoding.
332template <class DatasetType>
333std::size_t numberOfClasses(DataView<DatasetType> const& view){
334 return numberOfClasses(view.dataset());
335}
336
337/// Return the input dimensionality of the labeled dataset represented by the view
338template <class DatasetType>
339std::size_t inputDimension(DataView<DatasetType> const& view){
340 return inputDimension(view.dataset());
341}
342/// Return the label dimensionality of the labeled dataset represented by the view
343template <class DatasetType>
344std::size_t labelDimension(DataView<DatasetType> const& view){
345 return labelDimension(view.dataset());
346}
347
348/// Return the dimensionality of the dataset represented by the view
349template <class DatasetType>
350std::size_t dataDimension(DataView<DatasetType> const& view){
351 return dataDimension(view.dataset());
352}
353
354
355/** @*/
356}
357#endif