Subsets.cpp
Go to the documentation of this file.
1//===========================================================================
2/*!
3 *
4 *
5 * \brief Data Subsets
6 *
7 * This file is part of the tutorial "Creating and Using Subsets of Data".
8 * By itself, it does not do anything particularly useful.
9 *
10 * \author T. Glasmachers
11 * \date 2014
12 *
13 *
14 * \par Copyright 1995-2017 Shark Development Team
15 *
16 * <BR><HR>
17 * This file is part of Shark.
18 * <https://shark-ml.github.io/Shark/>
19 *
20 * Shark is free software: you can redistribute it and/or modify
21 * it under the terms of the GNU Lesser General Public License as published
22 * by the Free Software Foundation, either version 3 of the License, or
23 * (at your option) any later version.
24 *
25 * Shark is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU Lesser General Public License for more details.
29 *
30 * You should have received a copy of the GNU Lesser General Public License
31 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
32 *
33 */
34//===========================================================================
35
36#include <shark/Data/Dataset.h>
37using namespace shark;
38
40
41#include <shark/Data/DataView.h>
42
43
// Tutorial driver: each brace-delimited scope below shows one way of deriving
// subsets or cross-validation folds from a data set. The objects are only
// declared and passed around; nothing is printed or returned.
//
// NOTE(review): `data`, `view` and `folds` are used below without a visible
// declaration, and the embedded listing numbers skip 64, 66, 72, 83 and 107.
// Presumably the declarations live on those missing lines -- confirm against
// the complete file before editing any of the code.
44int main()
45{
// Shorthands used as the two template arguments of LabeledData below.
46 typedef RealVector I;
47 typedef unsigned int L;
48
// NOTE(review): `start` and `end` are not referenced anywhere in the visible
// part of main.
49 std::size_t start = 0, end = 42;
50
// --- Scope 1: subsets taken directly from a LabeledData object --------------
51{
52 LabeledData<I,L> dataset; // our dataset
53
54 // create an indexed subset of batches
// `indices` is left empty here; the tutorial only demonstrates the call.
55 std::vector<std::size_t> indices; // indices of the batches to be contained in the subset
56 LabeledData<I,L> subset = dataset.indexedSubset(indices);
57
// The same k is passed to a batch-level call (splice) and to an
// element-level call (splitAtElement). Judging by the result names, each call
// hands back the part of the data after position k -- TODO confirm in Dataset.h.
58 unsigned int k = 7;
59 LabeledData<I,L> remaining_batches = dataset.splice(k);
60
61 LabeledData<I,L> remaining_elements = splitAtElement(dataset, k);
62}
63
65 // ...
67
// Builds a data set from `data` using the two class labels 0 and 1
// (presumably keeping only those two classes -- confirm against the
// binarySubProblem documentation).
68 std::size_t class0 = 0, class1 = 1;
69 ClassificationDataset subproblem = binarySubProblem(data, class0, class1);
70
// --- Scope 2: random subset of a DataView, built by hand --------------------
71{
73
74 // creating a random subset from indices
// Steps: fill `indices` with 0 .. view.size()-1, swap each of the first k
// slots with a slot chosen by rand() % view.size(), then keep the first k
// entries and pass them to subset().
// NOTE(review): k is fixed at 100. If view.size() < k, indices[i] is accessed
// out of range in the swap loop and resize(k) grows the vector with zeros.
// If view.size() == 0, the modulo is a division by zero.
75 std::size_t k = 100;
76 std::vector<std::size_t> indices(view.size());
77 for (std::size_t i=0; i<view.size(); i++) indices[i] = i;
78 for (std::size_t i=0; i<k; i++) std::swap(indices[i], indices[rand() % view.size()]);
79 indices.resize(k);
80 DataView<ClassificationDataset> subset1 = subset(view, indices);
81
82 // same functionality in one line
// NOTE(review): the one-line variant announced above is not visible in this
// listing (listing line 83 is missing).
84}
85
// --- Scope 3: the available cross-validation partitioning helpers -----------
// Every createCV* call below discards its return value; the calls are shown
// for their signatures only.
86{
87 std::size_t numberOfPartitions = 5;
88 std::vector<std::size_t> indices;
89 // Creates partitions of approximately the same size.
90 createCVSameSize(data, numberOfPartitions);
91
92 // Creates IID drawn partitions of the data set (without replacement).
93 createCVIID(data, numberOfPartitions);
94
95 // Creates indexed cross-validation sets. For each element the
96 // index describes the fold in which the data point acts as a
97 // validation example. This function offers maximal control.
// NOTE(review): `indices` is still empty when it is passed here, so no
// element has a fold index assigned.
98 createCVIndexed(data, numberOfPartitions, indices);
99
// Presumably like createCVSameSize but keeping class proportions per fold --
// TODO confirm against the Shark documentation.
100 createCVSameSizeBalanced(data, numberOfPartitions);
101
102}
103
// --- Scope 4: nested cross-validation ---------------------------------------
// For every outer partition, the training part is copied and then split
// again into `numberOfFolds` inner folds.
104{
105 std::size_t numberOfPartitions = 5;
106 std::size_t numberOfFolds = 3;
108
109 for (std::size_t i=0; i<numberOfPartitions; i++)
110 {
111 // as created in the above example
112 RegressionDataset training = folds.training(i);
// `validation` is fetched but not used again inside this loop body.
113 RegressionDataset validation = folds.validation(i);
114 // explicit copy!
115 training.makeIndependent();
116 // creating a new fold
// `innerFolds` goes out of scope at the end of each iteration without
// being used further.
117 CVFolds<RegressionDataset> innerFolds = createCVSameSize(training, numberOfFolds);
118 }
119}
120
121}