Download.h
Go to the documentation of this file.
1//===========================================================================
2/*!
3 *
4 *
5 * \brief Support for downloading data sets from online sources.
6 *
7 *
8 * \par
9 * The functions in this file allow downloading data sets from the
10 * mldata.org repository and other sources.
11 *
12 *
13 *
14 *
15 * \author T. Glasmachers
16 * \date 2016-2018
17 *
18 *
19 * \par Copyright 1995-2018 Shark Development Team
20 *
21 * <BR><HR>
22 * This file is part of Shark.
23 * <https://shark-ml.github.io/Shark/>
24 *
25 * Shark is free software: you can redistribute it and/or modify
26 * it under the terms of the GNU Lesser General Public License as published
27 * by the Free Software Foundation, either version 3 of the License, or
28 * (at your option) any later version.
29 *
30 * Shark is distributed in the hope that it will be useful,
31 * but WITHOUT ANY WARRANTY; without even the implied warranty of
32 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
33 * GNU Lesser General Public License for more details.
34 *
35 * You should have received a copy of the GNU Lesser General Public License
36 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
37 *
38 */
39//===========================================================================
40
41#ifndef SHARK_DATA_DOWNLOAD_H
42#define SHARK_DATA_DOWNLOAD_H
43
45#include <shark/Data/Dataset.h>
47#include <shark/Data/Csv.h>
48#include <sstream>
49
50namespace shark {
51
52/**
53 * \ingroup shark_globals
54 *
55 * @{
56 */
57
58
59/// \brief Split a URL into its domain and resource parts.
60///
61/// Returns a std::tuple where the first element indicates whether a
62/// secure socket is used, the second one is the domain, and the third
63/// is the resource. With std::tie you can do pattern-matching:
64/// std::tie(https, domain, resource) = splitUrl(url);
65/// will fill the variables https, domain and resource.
66SHARK_EXPORT_SYMBOL std::tuple<bool, std::string, std::string> splitUrl(std::string const & url);
67
68/// \brief Download a document with the HTTP protocol.
69///
70/// \param url download URL, for example "www.shark-ml.org/index.html"
71/// \param port TCP/IP port, defaults to 80
72///
73/// The function requests the document with a HTTP request and returns
74/// the body of the corresponding HTTP reply. In case of success this
75/// is the requested document. In case of an error the function throws
76/// an exception. Note that the function does not perform standard
77/// actions of web browsers, e.g., execute javascript or follow http
78/// redirects. All HTTP response status codes other than 200 are
79/// reported as failure to download the document and trigger an
80/// exception.
81SHARK_EXPORT_SYMBOL std::string download(std::string const& url, unsigned short port = 80);
82
83/// \brief Download and import a sparse data (libSVM) file.
84///
85/// \param dataset container storing the loaded data
86/// \param url http URL
87/// \param port TCP/IP port, default is 80
88/// \param highestIndex highest feature index, or 0 for auto-detection
89/// \param batchSize size of batch
90template <class InputType, class LabelType> void downloadSparseData(
92 std::string const& url,
93 unsigned short port = 80,
94 unsigned int highestIndex = 0,
96)
97{
98 std::string content = download(url, port);
99 std::stringstream ss(content);
100 importSparseData(dataset, ss, highestIndex, batchSize);
101}
102
103
104/// \brief Download and import a dense data (CSV) file for classification.
105///
106/// \param dataset container storing the loaded data
107/// \param url http URL
108/// \param lp Position of the label in the record, either first or last column
109/// \param separator Optional separator between entries, typically a comma, spaces ar automatically ignored
110/// \param comment Trailing character indicating comment line. By dfault it is '#'
111/// \param port TCP/IP port, default is 80
112/// \param maximumBatchSize size of batches in the dataset
113template <class InputType> void downloadCsvData(
115 std::string const& url,
116 LabelPosition lp,
117 char separator = ',',
118 char comment = '#',
119 unsigned short port = 80,
120 std::size_t maximumBatchSize = LabeledData<RealVector, RealVector>::DefaultBatchSize
121)
122{
123 std::string content = download(url, port);
124 csvStringToData(dataset, content, lp, separator, comment, maximumBatchSize);
125}
126
127
128/// \brief Download and import a dense data (CSV) file for regression.
129///
130/// \param dataset container storing the loaded data
131/// \param url http URL
132/// \param lp Position of the label in the record, either first or last column
133/// \param numberOfOutputs dimensionality of the labels
134/// \param separator Optional separator between entries, typically a comma, spaces ar automatically ignored
135/// \param comment Trailing character indicating comment line. By dfault it is '#'
136/// \param port TCP/IP port, default is 80
137/// \param maximumBatchSize size of batches in the dataset
138template <class InputType> void downloadCsvData(
140 std::string const& url,
141 LabelPosition lp,
142 std::size_t numberOfOutputs = 1,
143 char separator = ',',
144 char comment = '#',
145 unsigned short port = 80,
146 std::size_t maximumBatchSize = LabeledData<RealVector, RealVector>::DefaultBatchSize
147)
148{
149 std::string content = download(url, port);
150 csvStringToData(dataset, content, lp, numberOfOutputs, separator, comment, maximumBatchSize);
151}
152
153
154/** @}*/
155
156}
157#endif