Shark machine learning library
Installation
Tutorials
Benchmarks
Documentation
Quick references
Class list
Global functions
include
shark
Data
Download.h
Go to the documentation of this file.
1
//===========================================================================
2
/*!
3
*
4
*
5
* \brief Support for downloading data sets from online sources.
6
*
7
*
8
* \par
9
* The methods in this file allow downloading data sets from the
10
* mldata.org repository and other sources.
11
*
12
*
13
*
14
*
15
* \author T. Glasmachers
16
* \date 2016-2018
17
*
18
*
19
* \par Copyright 1995-2018 Shark Development Team
20
*
21
* <BR><HR>
22
* This file is part of Shark.
23
* <https://shark-ml.github.io/Shark/>
24
*
25
* Shark is free software: you can redistribute it and/or modify
26
* it under the terms of the GNU Lesser General Public License as published
27
* by the Free Software Foundation, either version 3 of the License, or
28
* (at your option) any later version.
29
*
30
* Shark is distributed in the hope that it will be useful,
31
* but WITHOUT ANY WARRANTY; without even the implied warranty of
32
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
33
* GNU Lesser General Public License for more details.
34
*
35
* You should have received a copy of the GNU Lesser General Public License
36
* along with Shark. If not, see <http://www.gnu.org/licenses/>.
37
*
38
*/
39
//===========================================================================
40
41
#ifndef SHARK_DATA_DOWNLOAD_H
42
#define SHARK_DATA_DOWNLOAD_H
43
44
#include <
shark/Core/DLLSupport.h
>
45
#include <
shark/Data/Dataset.h
>
46
#include <
shark/Data/SparseData.h
>
47
#include <
shark/Data/Csv.h
>
48
#include <sstream>
49
50
namespace
shark
{
51
52
/**
53
* \ingroup shark_globals
54
*
55
* @{
56
*/
57
58
59
/// \brief Split a URL into its domain and resource parts.
60
///
61
/// Returns a std::tuple where the first element indicates whether a
62
/// secure socket is used, the second one is the domain, and the third
63
/// is the resource. With std::tie you can do pattern-matching:
64
/// std::tie(https, domain, resource) = splitUrl(url);
65
/// will fill the variables https, domain and resource.
66
SHARK_EXPORT_SYMBOL
std::tuple<bool, std::string, std::string>
splitUrl
(std::string
const
& url);
67
68
/// \brief Download a document with the HTTP protocol.
69
///
70
/// \param url download URL, for example "www.shark-ml.org/index.html"
71
/// \param port TCP/IP port, defaults to 80
72
///
73
/// The function requests the document with a HTTP request and returns
74
/// the body of the corresponding HTTP reply. In case of success this
75
/// is the requested document. In case of an error the function throws
76
/// an exception. Note that the function does not perform standard
77
/// actions of web browsers, e.g., execute javascript or follow http
78
/// redirects. All HTTP response status codes other than 200 are
79
/// reported as failure to download the document and trigger an
80
/// exception.
81
SHARK_EXPORT_SYMBOL
std::string
download
(std::string
const
& url,
unsigned
short
port = 80);
82
83
/// \brief Download and import a sparse data (libSVM) file.
84
///
85
/// \param dataset container storing the loaded data
86
/// \param url http URL
87
/// \param port TCP/IP port, default is 80
88
/// \param highestIndex highest feature index, or 0 for auto-detection
89
/// \param batchSize size of batch
90
template
<
class
InputType,
class
LabelType>
void
downloadSparseData
(
91
LabeledData<InputType, LabelType>
& dataset,
92
std::string
const
& url,
93
unsigned
short
port = 80,
94
unsigned
int
highestIndex = 0,
95
std::size_t
batchSize
=
LabeledData<RealVector, unsigned int>::DefaultBatchSize
96
)
97
{
98
std::string content =
download
(url, port);
99
std::stringstream ss(content);
100
importSparseData
(dataset, ss, highestIndex,
batchSize
);
101
}
102
103
104
/// \brief Download and import a dense data (CSV) file for classification.
105
///
106
/// \param dataset container storing the loaded data
107
/// \param url http URL
108
/// \param lp Position of the label in the record, either first or last column
109
/// \param separator Optional separator between entries, typically a comma, spaces ar automatically ignored
110
/// \param comment Trailing character indicating comment line. By dfault it is '#'
111
/// \param port TCP/IP port, default is 80
112
/// \param maximumBatchSize size of batches in the dataset
113
template
<
class
InputType>
void
downloadCsvData
(
114
LabeledData<InputType, unsigned int>
& dataset,
115
std::string
const
& url,
116
LabelPosition
lp,
117
char
separator =
','
,
118
char
comment =
'#'
,
119
unsigned
short
port = 80,
120
std::size_t maximumBatchSize =
LabeledData<RealVector, RealVector>::DefaultBatchSize
121
)
122
{
123
std::string content =
download
(url, port);
124
csvStringToData
(dataset, content, lp, separator, comment, maximumBatchSize);
125
}
126
127
128
/// \brief Download and import a dense data (CSV) file for regression.
129
///
130
/// \param dataset container storing the loaded data
131
/// \param url http URL
132
/// \param lp Position of the label in the record, either first or last column
133
/// \param numberOfOutputs dimensionality of the labels
134
/// \param separator Optional separator between entries, typically a comma, spaces ar automatically ignored
135
/// \param comment Trailing character indicating comment line. By dfault it is '#'
136
/// \param port TCP/IP port, default is 80
137
/// \param maximumBatchSize size of batches in the dataset
138
template
<
class
InputType>
void
downloadCsvData
(
139
LabeledData<InputType, RealVector>
& dataset,
140
std::string
const
& url,
141
LabelPosition
lp,
142
std::size_t numberOfOutputs = 1,
143
char
separator =
','
,
144
char
comment =
'#'
,
145
unsigned
short
port = 80,
146
std::size_t maximumBatchSize =
LabeledData<RealVector, RealVector>::DefaultBatchSize
147
)
148
{
149
std::string content =
download
(url, port);
150
csvStringToData
(dataset, content, lp, numberOfOutputs, separator, comment, maximumBatchSize);
151
}
152
153
154
/** @}*/
155
156
}
157
#endif