DataDistribution.h
Go to the documentation of this file.
1//===========================================================================
2/*!
3 *
4 *
5 * \brief Learning problems given by analytic distributions.
6 *
7 *
8 *
9 *
10 * \author T. Glasmachers
11 * \date 2006-2013
12 *
13 *
14 * \par Copyright 1995-2017 Shark Development Team
15 *
16 * <BR><HR>
17 * This file is part of Shark.
18 * <https://shark-ml.github.io/Shark/>
19 *
20 * Shark is free software: you can redistribute it and/or modify
21 * it under the terms of the GNU Lesser General Public License as published
22 * by the Free Software Foundation, either version 3 of the License, or
23 * (at your option) any later version.
24 *
25 * Shark is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 * GNU Lesser General Public License for more details.
29 *
30 * You should have received a copy of the GNU Lesser General Public License
31 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
32 *
33 */
34//===========================================================================
35
36
37#ifndef SHARK_DATA_DATADISTRIBUTION_H
38#define SHARK_DATA_DATADISTRIBUTION_H
39
40#include <shark/Data/Dataset.h>
41#include <shark/Core/Random.h>
43#include <utility>
44
45namespace shark {
46
47
48///
49/// \brief A DataDistribution defines an unsupervised learning problem.
50///
51/// \par
52/// The unsupervised learning problem is defined by an explicit
53/// distribution (in contrast to a finite dataset). The only
54/// method we need is to draw a sample from the distribution.
55///
56template <class InputType>
58{
59public:
60 /// \brief Virtual destructor.
61 virtual ~DataDistribution() { }
62
63 /// \brief Generates a single input.
64 ///
65 /// @param input the generated input
66 virtual void draw(InputType& input) const = 0;
67
68 /// \brief Interface for std::generate.
70 InputType ret;
71 draw(ret);
72 return ret;
73 }
74
75 /// \brief Generates a data set with samples from from the distribution.
76 ///
77 /// @param size the number of samples in the dataset
78 /// @param maximumBatchSize the maximum size of a batch
79 UnlabeledData<InputType> generateDataset(std::size_t size,std::size_t maximumBatchSize) const {
80 std::vector<InputType> data(size);
81
82 // draw the samples
83 for (std::size_t i = 0; i < size; ++i){
84 draw(data[i]);
85 }
86 //create dataset
87 return createUnlabeledDataFromRange(data,maximumBatchSize);
88 }
89
90 /// \brief Generates a data set with samples from the distribution.
91 ///
92 /// @param size the number of samples in the dataset
96};
97
98
99///
100/// \brief A LabeledDataDistribution defines a supervised learning problem.
101///
102/// \par
103/// The supervised learning problem is defined by an explicit
104/// distribution (in contrast to a finite dataset). The only
105/// method we need is to draw a sample from the distribution.
106///
107template <class InputType, class LabelType>
109{
110public:
111 /// \brief Virtual destructor.
113
114 /// \brief Generates a single pair of input and label.
115 /// @param input the generated input
116 /// @param label the generated label
117 virtual void draw(InputType& input, LabelType& label) const = 0;
118
119 // \Brief Interface for std::generate.
120 std::pair<InputType,LabelType> operator() () {
121 std::pair<InputType,LabelType> ret;
122 draw(ret.first,ret.second);
123 return ret;
124 }
125
126 /// \brief Generates a dataset with samples from from the distribution.
127 ///
128 /// @param size the number of samples in the dataset
129 /// @param maximumBatchSize the maximum size of a batch
130 LabeledData<InputType, LabelType> generateDataset(std::size_t size,std::size_t maximumBatchSize) const{
131 std::vector<InputType> inputs(size);
132 std::vector<LabelType> labels(size);
133
134 // draw the samples
135 for (std::size_t i = 0; i < size; ++i){
136 draw(inputs[i], labels[i]);
137 }
138 //create dataset
139 return createLabeledDataFromRange(inputs,labels,maximumBatchSize);
140 }
141
142 /// \brief Generates a data set with samples from the distribution.
143 ///
144 /// @param size the number of samples in the dataset
148};
149
150
151///
152/// \brief "chess board" problem for binary classification
153///
154class Chessboard : public LabeledDataDistribution<RealVector, unsigned int>
155{
156public:
157 Chessboard(unsigned int size = 4, double noiselevel = 0.0)
158 {
159 m_size = size;
160 m_noiselevel = noiselevel;
161 }
162
163
164 void draw(RealVector& input, unsigned int& label)const{
165 input.resize(2);
166 unsigned int j, t = 0;
167 for (j = 0; j < 2; j++)
168 {
169 double v = random::uni(random::globalRng, 0.0, (double)m_size);
170 t += (int)floor(v);
171 input(j) = v;
172 }
173 label = (t & 1);
174 if (random::uni(random::globalRng, 0.0, 1.0) < m_noiselevel) label = 1 - label;
175 }
176
177protected:
178 unsigned int m_size;
180};
181
182
183///
184/// \brief Noisy sinc function: y = sin(x) / x + noise
185///
186class Wave : public LabeledDataDistribution<RealVector, RealVector>
187{
188public:
189 Wave(double stddev = 0.1, double range = 5.0){
190 m_stddev = stddev;
191 m_range = range;
192 }
193
194
195 void draw(RealVector& input, RealVector& label)const{
196 input.resize(1);
197 label.resize(1);
199 if(input(0) != 0)
200 label(0) = sin(input(0)) / input(0) + random::gauss(random::globalRng, 0.0, m_stddev);
201 else
202 label(0) = random::gauss(random::globalRng, 0.0, m_stddev);
203 }
204
205protected:
206 double m_stddev;
207 double m_range;
208};
209
210
211
212/// "Pami Toy" problem for binary classification, as used in the article "T. Glasmachers
213/// and C. Igel. Maximum Likelihood Model Selection for 1-Norm Soft Margin SVMs with Multiple
214/// Parameters. IEEE Transactions on Pattern Analysis and Machine Intelligence, 2010."
215/// In summary, the first M dimensions are correlated to the labels, the last N dimensions
216/// are not.
217class PamiToy : public LabeledDataDistribution<RealVector, unsigned int>
218{
219public:
220 PamiToy(unsigned int size_useful = 5, unsigned int size_noise = 5, double noise_position = 0.0, double noise_variance = 1.0 )
221 : m_size( size_useful+size_noise ),
222 m_sizeUseful( size_useful ),
223 m_sizeNoise( size_noise ),
224 m_noisePos( noise_position) ,
225 m_noiseVar( noise_variance )
226 { }
227
228 void draw(RealVector& input, unsigned int& label)const{
229 input.resize( m_size );
230 label = (unsigned int) random::discrete(random::globalRng, 0,1); //fix label first
231 double y2 = label - 0.5; //"clean" informative feature values
232 // now fill the informative features..
233 for ( unsigned int i=0; i<m_sizeUseful; i++ ) {
235 }
236 // ..and the uninformative ones
237 for ( unsigned int i=m_sizeUseful; i<m_size; i++ ) {
239 }
240 }
241
242protected:
243 unsigned int m_size;
244 unsigned int m_sizeUseful;
245 unsigned int m_sizeNoise;
248};
249
250/// This class randomly fills a (hyper-)square with data points. Points which
251/// happen to be within a (hyper-)circle centered in the square of a certain
252/// radius get a positive class label. Noise on the labels can be added.
253class CircleInSquare : public LabeledDataDistribution<RealVector, unsigned int>
254{
255public:
256 CircleInSquare( unsigned int dimensions = 2, double noiselevel = 0.0, bool class_prob_equal = false )
257 : m_dimensions( dimensions ),
258 m_noiselevel( noiselevel ),
259 m_lowerLimit( -1 ),
260 m_upperLimit( 1 ),
261 m_centerpoint( 0 ),
262 m_inner_radius2( 0.5*0.5 ),
263 m_outer_radius2( 0.5*0.5 ),
264 m_equal_class_prob( class_prob_equal )
265 { }
266
267 /// allow for arbitrary box limits
268 void setLimits( double lower_limit, double upper_limit, double inner_radius, double outer_radius )
269 {
270 RANGE_CHECK( lower_limit < upper_limit );
271 RANGE_CHECK( inner_radius <= outer_radius );
272 RANGE_CHECK( 2*outer_radius <= upper_limit-lower_limit );
273 m_lowerLimit = lower_limit;
274 m_upperLimit = upper_limit;
275 m_centerpoint = (upper_limit-lower_limit)/2.0;
276 m_inner_radius2 = inner_radius*inner_radius;
277 m_outer_radius2 = outer_radius*outer_radius;
278 }
279
280 void draw(RealVector& input, unsigned int& label)const
281 {
282 input.resize( m_dimensions );
283 double v, dist;
284
285 if ( m_equal_class_prob ) { //each class has equal probability - this implementation is brute-force and gorgeously inefficient :/
286 bool this_label = random::coinToss(random::globalRng);
287 label = ( this_label ? 1 : 0 );
288 if ( random::uni(random::globalRng, 0.0, 1.0) < m_noiselevel )
289 label = 1 - label;
290 if ( this_label ) {
291 do {
292 dist = 0.0;
293 for ( unsigned int i=0; i<m_dimensions; i++ ) {
295 input(i) = v;
296 dist += (v-m_centerpoint)*(v-m_centerpoint);
297 }
298 } while( dist > m_inner_radius2 );
299 }
300 else {
301 do {
302 dist = 0.0;
303 for ( unsigned int i=0; i<m_dimensions; i++ ) {
305 input(i) = v;
306 dist += (v-m_centerpoint)*(v-m_centerpoint);
307 }
308 } while( dist < m_outer_radius2 );
309 }
310 }
311 else { //equal probability to be anywhere in the cube
312 do {
313 dist = 0.0;
314 for ( unsigned int i=0; i<m_dimensions; i++ ) {
316 input(i) = v;
317 dist += (v-m_centerpoint)*(v-m_centerpoint);
318 }
319 label = ( dist < m_inner_radius2 ? 1 : 0 );
320 if ( random::uni(random::globalRng, 0.0, 1.0) < m_noiselevel )
321 label = 1 - label;
322 } while( dist > m_inner_radius2 && dist < m_outer_radius2 );
323 }
324 }
325
326protected:
327 unsigned int m_dimensions;
334 bool m_equal_class_prob; ///<if true, the probability to belong to either class is equal. if false, it is uniform over the cube.
335};
336
337/// This class randomly fills a 4x4 square in the 2D-plane with data points.
338/// Points in the lower left diagonal half are negative, points in the
339/// upper right diagonal half are positive. But additionally, all points
340/// in a circle located in the lower right quadrant are positive, effectively
341/// bulging the decision boundary inward. Noise on the labels can be added.
342class DiagonalWithCircle : public LabeledDataDistribution<RealVector, unsigned int>
343{
344public:
345 DiagonalWithCircle( double radius = 1.0, double noise = 0.0 )
346 : m_radius2( radius*radius ),
348 { }
349
350 void draw(RealVector& input, unsigned int& label)const
351 {
352 input.resize( 2 );
353 double x,y;
354 x = random::uni(random::globalRng, 0, 4 ); //zero is left
355 y = random::uni(random::globalRng, 0, 4 ); //zero is bottom
356 // assign label according to position w.r.t. the diagonal
357 if ( x+y < 4 )
358 label = 1;
359 else
360 label = 0;
361 // but if in the circle (even above diagonal), assign positive label
362 if ( (3-x)*(3-x) + (1-y)*(1-y) < m_radius2 )
363 label = 1;
364
365 // add noise
366 if ( random::uni(random::globalRng, 0.0, 1.0) < m_noiselevel )
367 label = 1 - label;
368 input(0) = x;
369 input(1) = y;
370 }
371
372protected:
373 double m_radius2;
375};
376
377
378/// \brief Generates a set of normally distributed points
380{
381public:
382 /// \brief Generates a simple distribution with zero offset and identity covariance.
383 NormalDistributedPoints(std::size_t dim): m_offset(dim,0){
384 RealMatrix covariance(dim,dim,0);
385 diag(covariance) = blas::repeat(1.0,dim);
387 }
388 NormalDistributedPoints(RealMatrix const& covariance, RealVector const& offset)
389 :m_dist(covariance), m_offset(offset){
390 SIZE_CHECK(offset.size() == covariance.size1());
391 }
392 void draw(RealVector& input) const{
393 input.resize(m_offset.size());
394 noalias(input) = m_offset;
395 noalias(input) += m_dist(random::globalRng).first;
396 }
397private:
399 RealVector m_offset;
400};
401
402/// \brief Given a set of images, draws a set of image patches of a given size
403class ImagePatches:public DataDistribution<RealVector>{
404public:
406 Data<RealVector> images,
407 std::size_t imageWidth, std::size_t imageHeight,
408 std::size_t patchWidth, std::size_t patchHeight
409 ):m_images(images)
410 , m_imageWidth(imageWidth)
411 , m_imageHeight(imageHeight)
412 , m_patchWidth(patchWidth)
413 , m_patchHeight(patchHeight)
414 ,m_numImages(m_images.numberOfElements()){}
415
416 void draw(RealVector& input) const{
417 //sample image
418 std::size_t imageNum = random::discrete(random::globalRng, std::size_t(0),m_numImages-1);
419 Data<RealVector>::const_element_reference image = m_images.element(imageNum);
420 //draw the upper left corner of the image
421 std::size_t m_startX = random::discrete(random::globalRng, std::size_t(0),m_imageWidth-m_patchWidth);
422 std::size_t m_startY = random::discrete(random::globalRng, std::size_t(0),m_imageHeight-m_patchHeight);
423
424
425 //copy patch
426 input.resize(m_patchWidth * m_patchHeight);
427 std::size_t rowStart = m_startY * m_imageWidth + m_startX;
428 for (size_t y = 0; y < m_patchHeight; ++y){
429 for (size_t x = 0; x < m_patchWidth; ++x){
430 input(y * m_patchWidth + x) = image(rowStart+x);
431 }
432 rowStart += m_imageWidth;
433 }
434 }
435private:
436 Data<RealVector> m_images;
437 std::size_t m_imageWidth;
438 std::size_t m_imageHeight;
439 std::size_t m_patchWidth;
440 std::size_t m_patchHeight;
441 std::size_t m_numImages;
442};
443
444}
445#endif