Statistics.h
Go to the documentation of this file.
1/*!
2 *
3 *
4 * \brief Calculate statistics given a range of values.
5 *
6 *
7 *
8 * \author O.Krause
9 * \date 2015
10 *
11 *
12 * \par Copyright 1995-2017 Shark Development Team
13 *
14 * <BR><HR>
15 * This file is part of Shark.
16 * <https://shark-ml.github.io/Shark/>
17 *
18 * Shark is free software: you can redistribute it and/or modify
19 * it under the terms of the GNU Lesser General Public License as published
20 * by the Free Software Foundation, either version 3 of the License, or
21 * (at your option) any later version.
22 *
23 * Shark is distributed in the hope that it will be useful,
24 * but WITHOUT ANY WARRANTY; without even the implied warranty of
25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26 * GNU Lesser General Public License for more details.
27 *
28 * You should have received a copy of the GNU Lesser General Public License
29 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
30 *
31 */
32#ifndef SHARK_STATISTICS_H
33#define SHARK_STATISTICS_H
34
35
36//for vector algebra
37#include <shark/LinAlg/Base.h>
38
39//handling of missing values
40#include <limits>
41#include <boost/math/special_functions/fpclassify.hpp>
42
43//for quantiles
44#include <boost/range/algorithm/nth_element.hpp>
45
46
47//for the result table
48#include <string>
49#include <map>
50#include <iterator>
51#include <boost/serialization/string.hpp>
52#include <boost/serialization/map.hpp>
53#include <boost/serialization/vector.hpp>
54
55namespace shark {
56namespace statistics{
57
58inline double missingValue(){
59 return std::numeric_limits<double>::quiet_NaN();//missing values are a non-signaling NaN
60}
61
62inline bool isMissing(double value){
63 return boost::math::isnan(value);//there is no portable way to distinguish the different types of NaN
64}
65
66///\brief Base class for all Statistic Objects to be used with Statistics
68public:
69 virtual std::string name() const=0;
71 virtual RealVector statistics(std::vector<RealVector> const& points)const=0;
72};
73
74///\brief for a vector of points computes for every dimension the fraction of missing values
76public:
77 std::string name() const{
78 return "Missing";
79 }
80 RealVector statistics(std::vector<RealVector> const& points)const{
81 std::size_t N = points.size();
82 RealVector missing(points[0].size(),0.0);
83 for(std::size_t i = 0; i != N;++i){
84 for(std::size_t j = 0; j != missing.size(); ++j){
85 if(!isMissing(points[i](j)))continue;
86 missing(j) += 1.0;
87 }
88 }
89 missing /= N;
90 return missing;
91 }
92};
93
94///\brief For a vector of points computes for every dimension the mean
96public:
97 std::string name() const{
98 return "Mean";
99 }
100 RealVector statistics(std::vector<RealVector> const& points)const{
101 std::size_t N = points.size();
102 RealVector sum(points[0].size(),0.0);
103 UIntVector numSamples(points[0].size(),0);
104 for(std::size_t i = 0; i != N;++i){
105 for(std::size_t j = 0; j != sum.size(); ++j){
106 if(isMissing(points[i](j)))continue;
107 sum(j) += points[i](j);
108 ++numSamples(j);
109 }
110 }
111 //calculate mean. if the number of non-missing points was 0, return missingValue() for that dimension
112 return safe_div(sum,numSamples,missingValue());
113 }
114};
115
116///\brief For a vector of points computes for every dimension the variance
118public:
119 std::string name() const{
120 return "Variance";
121 }
122 RealVector statistics(std::vector<RealVector> const& points)const{
123 std::size_t N = points.size();
124 Mean m;
125 RealVector mean = m.statistics(points);
126 RealVector variance(mean.size(),0.0);
127 UIntVector numSamples(points[0].size(),0);
128 for(std::size_t i = 0; i != N;++i){
129 for(std::size_t j = 0; j != mean.size(); ++j){
130 if(isMissing(points[i](j)))continue;
131 variance(j) += sqr(points[i](j)-mean(j));
132 ++numSamples(j);
133 }
134 }
135 //calculate biased variance. if the number of non-missing points was 0, return missingValue() for that dimension
136 return safe_div(variance,numSamples,missingValue());
137 }
138};
139
140//Quantiles, Median, Lower-Upper
141///\brief For a vector of points computes for every dimension the p-quantile
143public:
144 std::string name() const{
145 return boost::lexical_cast<std::string>(m_quantile)+"-Quantile";
146 }
147 Quantile(double quantile):m_quantile(quantile){}
148 RealVector statistics(std::vector<RealVector> const& points)const{
149 std::size_t N = points.size();
150 RealVector quantiles(points[0].size(),missingValue());
151 for(std::size_t j = 0; j != quantiles.size(); ++j){
152 //get all non-missing values of the j-th dimension
153 std::vector<double> values;
154 for(std::size_t i = 0; i != N;++i){
155 if(isMissing(points[i](j)))continue;
156 values.push_back(points[i](j));
157 }
158 if(values.size() == 0) continue;//no values-> missing value
159
160 //compute quantile of j-th dimension
161 std::size_t element = std::size_t(values.size()*m_quantile);
162 std::vector<double>::iterator pos= values.begin()+element;
163 boost::nth_element(values,pos);
164 quantiles(j) = *pos;
165 }
166 return quantiles;
167 }
168private:
169 double m_quantile;
170};
171
172///\brief For a vector of points computes for every dimension the median
173class Median:public Quantile{
174public:
175 std::string name() const{
176 return "Median";
177 }
179};
180
181///\brief For a vector of points computes for every dimension the 25%-quantile
183public:
185};
186///\brief For a vector of points computes for every dimension the 75%-quantile
188public:
190};
191
192
193///\brief Stores results of a running experiment
194///
195/// This is a simple three dimensional table with the dimensions. Experiments
196/// are thought of having a varied parameter (for example the algorithm names when
197/// several algorithms are compared) and for each parameter a set of vector valued points
198/// is stored - one vector for each trial of the experiment for a given parameter.
199/// It is posible to give every parameter and the whole table a name which adds meta
200/// information, for example to generate outputs.
201template<class Parameter>
203public:
204 typedef typename std::map<Parameter, std::vector<RealVector> >::const_iterator const_iterator;
205
206 ResultTable(std::size_t numDimensions, std::string const& parameterName="unnamed")
207 :m_dimensionNames(numDimensions,"unnamed"),m_parameterName(parameterName){}
208
209 std::string const& parameterName()const{
210 return m_parameterName;
211 }
212
213 void setDimensionName(std::size_t i, std::string const& name){
214 m_dimensionNames[i]=name;
215 }
216
217 std::string const& dimensionName(std::size_t i)const{
218 return m_dimensionNames[i];
219 }
220
221 std::size_t numDimensions()const{
222 return m_dimensionNames.size();
223 }
224
225 void update(Parameter const& parameter, RealVector const& point){
226 SIZE_CHECK(point.size() == numDimensions());
227 m_results[parameter].push_back(point);
228 }
229
230 void update(Parameter const& parameter, double value){
231 RealVector point(1,value);
232 update(parameter, point);
233 }
234
235 void update(Parameter const& parameter, double value1, double value2){
236 RealVector point(2);
237 point(0)=value1;
238 point(1)=value2;
239 update(parameter, point);
240 }
241
242 void update(Parameter const& parameter, double value1, double value2,double value3){
243 RealVector point(3);
244 point(0)=value1;
245 point(1)=value2;
246 point(2)=value3;
247 update(parameter, point);
248 }
249
250 std::vector<RealVector>const& operator[](Parameter const& param)const{
251 return m_results.find(param)->second;
252 }
253
255 return m_results.begin();
256 }
258 return m_results.end();
259 }
260
261 std::size_t numParams()const{
262 return m_results.size();
263 }
264
265 Parameter const& parameterValue(std::size_t i)const{
266 const_iterator pos = begin();
267 std::advance(pos,i);
268 return pos->first;
269 }
270
271
272 template<class Archive>
273 void serialize(Archive &ar, const unsigned int file_version) {
274 ar & m_dimensionNames;
275 ar & m_parameterName;
276 ar & m_results;
277 (void) file_version;//prevent warning
278 }
279
280private:
281 std::vector<std::string> m_dimensionNames;
282 std::string m_parameterName;
283 std::map<Parameter, std::vector<RealVector> > m_results;
284};
285
286///\brief Generates Statistics over the results of an experiment
287///
288/// Given the results of an experiment stored in a ResultsTable, computes
289/// several tatistics for each variable.
290template<class Parameter>
292public:
293 typedef typename std::map<Parameter, std::map<std::string,RealVector> >::const_iterator const_iterator;
294 Statistics(ResultTable<Parameter> const* table):m_resultsTable(table){}
295
296 void addStatistic(std::string const& statisticName, BaseStatisticsObject const& object){
297 typedef typename ResultTable<Parameter>::const_iterator iterator;
298 iterator end = m_resultsTable->end();
299 for(iterator pos=m_resultsTable->begin(); pos != end; ++pos){
300 m_statistics[pos->first][statisticName] = object.statistics(pos->second);
301 }
302 m_statisticNames.push_back(statisticName);
303 }
304
306 addStatistic(object.name(),object);
307 }
308
309 std::map<std::string,RealVector> const& operator[](Parameter const& parameter)const{
310 return m_statistics.find(parameter)->second;
311 }
312
314 return m_statistics.begin();
315 }
317 return m_statistics.end();
318 }
319
320 //information about the parameter of the experiments
321 std::string const& parameterName()const{
322 return m_resultsTable->parameterName();
323 }
324
325 std::size_t numParams()const{
326 return m_resultsTable->numParams();
327 }
328
329 Parameter const& parameterValue(std::size_t i)const{
330 return m_resultsTable->parameterValue(i);
331 }
332
333 //information about the names of the dimensions
334 std::size_t numDimensions()const{
335 return m_resultsTable->numDimensions();
336 }
337
338 std::string const& dimensionName(std::size_t i)const{
339 return m_resultsTable->dimensionName(i);
340 }
341
342 //information about the statistics
343 std::size_t numStatistics()const{
344 return m_statisticNames.size();
345 }
346 std::string const& statisticName(std::size_t i)const{
347 return m_statisticNames[i];
348 }
349private:
350 std::vector<std::string> m_statisticNames;
351 ResultTable<Parameter> const* m_resultsTable;
352 std::map<Parameter, std::map<std::string,RealVector> > m_statistics;
353};
354
355template<class Parameter>
356void printCSV(Statistics<Parameter> const& statistics){
357 //first print a legend
358 std::cout<<"# "<<statistics.parameterName();
359 for(std::size_t i = 0; i != statistics.numStatistics(); ++i){
360 for(std::size_t j = 0; j != statistics.numDimensions(); ++j){
361 std::cout<<" "<<statistics.statisticName(i)<<"-"<<statistics.dimensionName(j);
362 }
363 }
364 std::cout<<"\n";
365
366 //print results parameter by parameter
367 for(std::size_t k = 0; k != statistics.numParams(); ++k){
368 Parameter param=statistics.parameterValue(k);
369 std::map<std::string,RealVector> paramResults=statistics[param];
370 std::cout<<param;
371 for(std::size_t i = 0; i != statistics.numStatistics(); ++i){
372 for(std::size_t j = 0; j != statistics.numDimensions(); ++j){
373 std::cout<<" "<<paramResults[statistics.statisticName(i)](j);
374 }
375 }
376 std::cout<<"\n";
377 }
378}
379
380}}
381#endif // SHARK_STATISTICS_H