45#ifndef SHARK_DATA_DATASET_H
46#define SHARK_DATA_DATASET_H
48#include <boost/range/iterator_range.hpp>
53#include <boost/iterator/transform_iterator.hpp>
56#include "Impl/Dataset.inl"
141 typedef typename Container::BatchType batch_type;
145 typedef Type element_type;
// Inequality is expressed as the negation of operator== applied to the
// same two operands (*this and rhs).
158 return (! (*
this == rhs));
164 typedef boost::iterator_range< detail::DataElementIterator<Data<Type> > >
element_range;
176 detail::DataElementIterator<
Data<Type> const>(
this,0,0,0),
186 detail::DataElementIterator<
Data<Type> >(
this,0,0,0),
// Forwards the element count query to the underlying m_data container.
212 return m_data.numberOfElements();
// Element lookup: construct an element iterator over this dataset with the
// arguments (this,0,0,0), advance it by i and dereference the result.
// NOTE(review): the meaning of the three zero arguments is defined by
// detail::DataElementIterator, which is not visible here - presumably they
// denote the first element; confirm.
233 return *(detail::DataElementIterator<Data<Type> >(
this,0,0,0)+i);
// Same lookup, but through an iterator instantiated on `Data<Type> const`.
236 return *(detail::DataElementIterator<Data<Type>
const>(
this,0,0,0)+i);
// Batch lookup: offset m_data.begin() by i and dereference.
// NOTE(review): no range check on i is visible on this line.
241 return *(
m_data.begin()+i);
// Second occurrence of the same expression (presumably the const overload;
// the enclosing signature is not visible here).
244 return *(
m_data.begin()+i);
317 template<
class Range>
319 m_data.repartition(batchSizes);
327 return m_data.getPartitioning();
336 template<
class Range>
// Scratch buffer that collects the elements making up one output batch.
341 std::vector<Type> batch_elements;
// indexPos walks the caller-supplied indices; elemBegin is the start of this
// dataset's element range and is used as the base for index-based access.
342 auto indexPos = indices.begin();
343 auto elemBegin =
elements().begin();
// The buffer is emptied before it is refilled.
346 batch_elements.clear();
// Gather numElements elements: each one is fetched at offset *indexPos from
// elemBegin. indexPos is advanced together with i and is not reset on any
// visible line, so successive gathers continue where the previous one ended.
347 for(std::size_t i = 0; i != numElements; ++i,++indexPos){
348 batch_elements.push_back(*(elemBegin+*indexPos));
// Pack the gathered elements into batch b of dataCopy.
350 dataCopy.
batch(b) = createBatch<Type>(batch_elements);
360 detail::complement(indices,
m_data.size(),comp);
387 stream << elem <<
"\n";
397template <
class InputT>
401 typedef InputT element_type;
475 std::iota(indices.begin(),indices.end(),0);
494template <
class InputT,
class LabelT>
507 typedef InputLabelBatch<
512 typedef InputLabelPair<InputType,LabelType> element_type;
515 typedef InputLabelBatch<
519 typedef InputLabelBatch<
527 typedef boost::iterator_range< detail::DataElementIterator<LabeledData<InputType,LabelType> > >
element_range;
528 typedef boost::iterator_range< detail::DataElementIterator<LabeledData<InputType,LabelType>
const> >
const_element_range;
529 typedef detail::BatchRange<LabeledData<InputType,LabelType> >
batch_range;
641 return *(detail::DataElementIterator<LabeledData<InputType,LabelType> >(
this,0,0,0)+i);
644 return *(detail::DataElementIterator<LabeledData<InputType,LabelType>
const>(
this,0,0,0)+i);
736 template<
class Range>
755 template<
class Range>
764 std::iota(indices.begin(),indices.end(),0);
790template<
class Functor,
class T>
// This overload ignores its argument and returns an empty-initialized Shape.
798 static Shape infer(T
const&){
return {};}
// Specialization for datasets of blas::vector<T>: the shape is a single
// value, the size() of element 0.
// NOTE(review): element(0) is read unconditionally; what happens for a
// dataset without elements is not visible here - confirm callers never pass
// an empty dataset.
802struct InferShape<Data<blas::vector<T> > >{
803 static Shape infer(Data<blas::vector<T> >
const& f){
804 return {f.element(0).
size()};
// Specialization for datasets of blas::compressed_vector<T>: same rule, the
// shape is the size() of element 0 (same empty-dataset caveat applies).
809struct InferShape<Data<blas::compressed_vector<T> > >{
810 static Shape infer(Data<blas::compressed_vector<T> >
const& f){
811 return {f.element(0).size()};
824Data<typename Range::value_type>
826 typedef typename Range::value_type Input;
// A maximumBatchSize of 0 is special-cased; the statement executed in that
// case is on a line not visible here.
828 if (maximumBatchSize == 0)
831 std::size_t numPoints = inputs.size();
// Number of batches: start from the integer quotient numPoints /
// maximumBatchSize and test whether points are left over. The action taken
// for left-over points is on a line not visible here (presumably the count
// is incremented, giving the rounded-up quotient - TODO confirm).
833 std::size_t batches = numPoints / maximumBatchSize;
834 if(numPoints > batches*maximumBatchSize)
// Distribute points evenly: optimalBatchSize is the per-batch base size and
// remainder is the number of points left after giving every batch that size.
// NOTE(review): numPoints/batches divides by zero when batches is 0, which
// the visible arithmetic produces for an empty `inputs` - confirm whether a
// guard exists on the lines not shown.
836 std::size_t optimalBatchSize=numPoints/batches;
837 std::size_t remainder = numPoints-batches*optimalBatchSize;
// Walk the input once, carving out consecutive [start,end) slices. The first
// `remainder` batches receive one extra element so that all points are used.
841 auto start= inputs.begin();
842 for(std::size_t i = 0; i != batches; ++i){
843 std::size_t size = (i<remainder)?optimalBatchSize+1:optimalBatchSize;
844 auto end = start+size;
845 data.
batch(i) = createBatch<Input>(
846 boost::make_iterator_range(start,end)
// The dataset's shape is derived from the data itself via detail::InferShape.
850 data.
shape() = detail::InferShape<Data<Input> >::infer(data);
856UnlabeledData<typename boost::range_value<Range>::type>
861template<
class Range1,
class Range2>
863 typename boost::range_value<Range1>::type,
864 typename boost::range_value<Range2>::type
// Reject mismatched ranges up front: inputs and labels must contain the same
// number of entries, otherwise SHARK_RUNTIME_CHECK reports the message below.
867 SHARK_RUNTIME_CHECK(inputs.size() == labels.size(),
"Number of inputs and number of labels must agree");
868 typedef typename boost::range_value<Range1>::type Input;
869 typedef typename boost::range_value<Range2>::type Label;
871 if (maximumBatchSize == 0)
881template<
class T,
class U>
// Output format: the input, then the label enclosed in square brackets
// (preceded by a space), then a newline.
884 stream << elem.input <<
" [" << elem.label <<
"]"<<
"\n";
// Running maximum of the label values seen so far, starting at 0.
894 unsigned int classes = 0;
// Fold in the largest label of batch i.
// NOTE(review): std::max_element returns the end iterator for an empty
// range, so the dereference is undefined if batch(i) is empty - confirm that
// batches are never empty here.
896 classes = std::max(classes,*std::max_element(labels.
batch(i).begin(),labels.
batch(i).end()));
905 for(
unsigned int elem: labels.
batch(i)){
913template <
class InputType>
916 return dataset.
element(0).size();
920template <
class InputType,
class LabelType>
926template <
class InputType,
class LabelType>
931template <
class InputType>
936template<
class InputType,
class LabelType>
945template<
class T,
class Functor>
946typename boost::lazy_disable_if<
947 CanBeCalled<Functor,typename Data<T>::batch_type>,
948 TransformedData<Functor,T>
951 typedef typename detail::TransformedDataElement<Functor,T>::type ResultType;
955 result.
batch(i)= createBatch<ResultType>(
959 result.
shape() = detail::InferShape<Data<ResultType> >::infer(result);
966template<
class T,
class Functor>
967typename boost::lazy_enable_if<
968 CanBeCalled<Functor,typename Data<T>::batch_type>,
969 TransformedData<Functor,T>
972 typedef typename detail::TransformedDataElement<Functor,T>::type ResultType;
// Shape selection: first ask detail::InferShape for the functor f. If that
// yields a value equal to a default-constructed Shape, fall back to
// inferring the shape from the transformed data in `result`.
977 Shape shape = detail::InferShape<Functor>::infer(f);
978 if(shape ==
Shape()){
979 shape = detail::InferShape<Data<ResultType> >::infer(result);
// Store whichever shape was chosen on the result dataset.
981 result.
shape() = shape;
986template<
class I,
class L,
class Functor>
987LabeledData<typename detail::TransformedDataElement<Functor,I >::type, L >
993template<
class I,
class L,
class Functor>
994LabeledData<I,typename detail::TransformedDataElement<Functor,L >::type >
1001template<
class T,
class FeatureSet>
// Batch-wise column selection. For an input matrix the lambda builds an
// output with the same number of rows (size1) and one column per entry of
// `features`; output column j is copied from input column features[j].
// The lambda captures by reference ([&]), so `features` is read from the
// enclosing scope each time it is called.
1003 auto select = [&](blas::matrix<T>
const& input){
1004 blas::matrix<T> output(input.size1(),features.size());
1005 for(std::size_t i = 0; i != input.size1(); ++i){
1006 for(std::size_t j = 0; j != features.size(); ++j){
1007 output(i,j) = input(i,features[j]);
1015template<
class T,
class FeatureSet>
1028template<
class DatasetT>
// elementIndex may be at most numberOfElements() (the check uses <=, so an
// index one past the last element is accepted).
1030 SIZE_CHECK(elementIndex<=data.numberOfElements());
// Search for the batch that contains elementIndex. batchStart accumulates
// the sizes of the batches already passed; batchPos is presumably advanced
// on a line not visible here - TODO confirm.
// NOTE(review): because the comparison is strict (<), an elementIndex that
// falls exactly on the end of a batch stops the loop at that batch, making
// splitPoint equal to the batch's full size - confirm splitBatch accepts a
// split position at the very end of a batch.
1032 std::size_t batchPos = 0;
1033 std::size_t batchStart = 0;
1034 while(batchStart +
batchSize(data.batch(batchPos)) < elementIndex){
1035 batchStart +=
batchSize(data.batch(batchPos));
// Offset of the split position inside the located batch. Only a nonzero
// offset requires cutting the batch itself.
1038 std::size_t splitPoint = elementIndex-batchStart;
1039 if(splitPoint != 0){
1040 data.splitBatch(batchPos,splitPoint);
// The value handed back to the caller is whatever data.splice(batchPos)
// returns.
1044 return data.splice(batchPos);
// Per-class element counts, obtained from classSizes(data).
1054 std::vector<std::size_t > classCounts =
classSizes(data);
// partitioning and classStart are filled by detail::batchPartitioning from
// the class counts and the requested batchSize (their exact contents are
// defined by that helper, which is not visible here).
1055 std::vector<std::size_t > partitioning;
1056 std::vector<std::size_t > classStart;
1057 detail::batchPartitioning(classCounts, classStart, partitioning,
batchSize);
// classIndex[c] is the sum of classCounts[0..c-1], i.e. the first slot of
// class c when elements are laid out class by class.
// NOTE(review): the loop starts at i = 1 and terminates on
// `i != classIndex.size()`; with an empty classCounts (size 0) this
// condition is true at i = 1 and the body indexes out of range - confirm
// classSizes never returns an empty vector.
1061 std::vector<std::size_t> classIndex(classCounts.size(),0);
1062 for(std::size_t i = 1; i != classIndex.size();++i){
1063 classIndex[i] = classIndex[i-1] + classCounts[i-1];
// Record, for each element in dataset order, its position `index` in the
// slot currently pointed to by its class. The advancing of classIndex[c]
// and index is presumably done on lines not visible here - TODO confirm.
1066 std::size_t index = 0;
1067 for (
auto const& elem: data.
elements()){
1068 std::size_t c = elem.label;
1069 elemIndex[classIndex[c] ] = index;
1079 unsigned int zeroClass,
1080 unsigned int oneClass
1082 std::vector<std::size_t> indexSet;
1083 std::size_t smaller = std::min(zeroClass,oneClass);
1084 std::size_t bigger = std::max(zeroClass,oneClass);
1088 std::size_t start= 0;
1094 indexSet.push_back(start);
1103 indexSet.push_back(start);
1119 unsigned int oneClass
// Relabel through transformLabels: a label equal to oneClass becomes 1 and
// every other label becomes 0 (the bool comparison result is converted to
// unsigned int). The lambda captures by value ([=]), so it holds its own
// copy of oneClass.
1121 return transformLabels(data, [=](
unsigned int label){
return (
unsigned int)(label == oneClass);});
1124template <
typename RowType>
// Read component columnID of every element, in element order, into
// successive entries of `column`. rowCounter selects the destination entry;
// its increment is presumably on a line not visible here - TODO confirm.
1128 std::size_t rowCounter = 0;
1129 for(
auto element: data.
elements()){
1130 column(rowCounter) = element(columnID);
1136template <
typename RowType>
// Write newColumn(rowCounter) into component columnID of every element, in
// element order. rowCounter's increment is presumably on a line not visible
// here - TODO confirm.
// NOTE(review): `element` is declared with plain `auto`, i.e. by value. The
// assignment only reaches the dataset if the element range yields a
// reference-like proxy object - confirm against the element range's type.
1140 std::size_t rowCounter = 0;
1141 for(
auto element: data.
elements()){
1142 element(columnID) = newColumn(rowCounter);