// KMeans.h — Shark machine learning library (see the Shark documentation for this file).
//===========================================================================
/*!
 *
 *
 * \brief The k-means clustering algorithm.
 *
 *
 *
 * \author T. Glasmachers
 * \date 2011
 *
 *
 * \par Copyright 1995-2017 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <https://shark-ml.github.io/Shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================

#ifndef SHARK_ALGORITHMS_KMEANS_H
#define SHARK_ALGORITHMS_KMEANS_H

#include <shark/Data/Dataset.h>

#include <algorithm>
#include <limits>

namespace shark{

50///
51/// \brief The k-means clustering algorithm.
52///
53/// \par
54/// The k-means algorithm takes vector-valued data
55/// \f$ \{x_1, \dots, x_n\} \subset \mathbb R^d \f$
56/// and splits it into k clusters, based on centroids
57/// \f$ \{c_1, \dots, c_k\} \f$.
58/// The result is stored in a Centroids object that can be used to
59/// construct clustering models.
60///
61/// \par
62/// This implementation starts the search with the given centroids,
63/// in case the provided centroids object (third parameter) contains
64/// a set of k centroids. Otherwise the search starts from the first
65/// k data points.
66///
67/// \par
68/// Note that the data set needs to include at least k data points
69/// for k-means to work. This is because the current implementation
70/// does not allow for empty clusters.
71///
72/// \param data vector-valued data to be clustered
73/// \param k number of clusters
74/// \param centroids centroids input/output
75/// \param maxIterations maximum number of k-means iterations; 0: unlimited
76/// \return number of k-means iterations
77/// \ingroup clustering
78SHARK_EXPORT_SYMBOL std::size_t kMeans(Data<RealVector> const& data, std::size_t k, Centroids& centroids, std::size_t maxIterations = 0);
79
80///
81/// \brief The k-means clustering algorithm for initializing an RBF Layer
82///
83/// \par
84/// The k-means algorithm takes vector-valued data
85/// \f$ \{x_1, \dots, x_n\} \subset \mathbb R^d \f$
86/// and splits it into k clusters, based on centroids
87/// \f$ \{c_1, \dots, c_k\} \f$.
88/// The result is stored in a RBFLayer object that can be used to
89/// construct clustering models.
90///
91/// \par
92/// This is just an alternative frontend to the version using Centroids. it creates a centroid object,
93/// with as many clusters as are outputs in the RBFLayer and copies the result into the model.
94///
95/// \par
96/// Note that the data set needs to include at least k data points
97/// for k-means to work. This is because the current implementation
98/// does not allow for empty clusters.
99///
100/// \param data vector-valued data to be clustered
101/// \param model RBFLayer input/output
102/// \param maxIterations maximum number of k-means iterations; 0: unlimited
103/// \return number of k-means iterations
104///
105SHARK_EXPORT_SYMBOL std::size_t kMeans(Data<RealVector> const& data, RBFLayer& model, std::size_t maxIterations = 0);
106
107///
108/// \brief The kernel k-means clustering algorithm
109///
110/// \par
111/// The kernel k-means algorithm takes data
112/// \f$ \{x_1, \dots, x_n\} \f$
113/// and splits it into k clusters, based on centroids
114/// \f$ \{c_1, \dots, c_k\} \f$.
115/// The centroids are elements of the reproducing kernel Hilbert space
116/// (RHKS) induced by the kernel function. They are functions, represented
117/// as the components of a KernelExpansion object. I.e., given a data point
118/// x, the kernel expansion returns a k-dimensional vector f(x), which is
119/// the evaluation of the centroid functions on x. The value of the
120/// centroid function represents the inner product of the centroid with
121/// the kernel-induced feature vector of x (embedding of x into the RKHS).
122/// The distance of x from the centroid \f$ c_i \f$ is computes as the
123/// kernel-induced distance
124/// \f$ \sqrt{ kernel(x, x) + kernel(c_i, c_i) - 2 kernel(x, c_i) } \f$.
125/// For the Gaussian kernel (and other normalized kernels) is simplifies to
126/// \f$ \sqrt{ 2 - 2 kernel(x, c_i) } \f$. Hence, larger function values
127/// indicate smaller distance to the centroid.
128///
129/// \par
130/// Note that the data set needs to include at least k data points
131/// for k-means to work. This is because the current implementation
132/// does not allow for empty clusters.
133///
134/// \param dataset vector-valued data to be clustered
135/// \param k number of clusters
136/// \param kernel kernel function object
137/// \param maxIterations maximum number of k-means iterations; 0: unlimited
138/// \return centroids (represented as functions, see description)
139///
140template<class InputType>
141KernelExpansion<InputType> kMeans(Data<InputType> const& dataset, std::size_t k, AbstractKernelFunction<InputType>& kernel, std::size_t maxIterations = 0){
142 if(!maxIterations)
143 maxIterations = std::numeric_limits<std::size_t>::max();
144
145 std::size_t n = dataset.numberOfElements();
146 RealMatrix kernelMatrix = calculateRegularizedKernelMatrix(kernel,dataset,0);
147 UIntVector clusterMembership(n);
148 UIntVector clusterSizes(k,0);
149 RealVector ckck(k,0);
150
151 //init cluster assignments
152 for(unsigned int i = 0; i != n; ++i){
153 clusterMembership(i) = i % k;
154 }
155 std::shuffle(clusterMembership.begin(),clusterMembership.end(),random::globalRng);
156 for(std::size_t i = 0; i != n; ++i){
157 ++clusterSizes(clusterMembership(i));
158 }
159
160 // k-means loop
161 std::size_t iter = 0;
162 bool equal = false;
163 for(; iter != maxIterations && !equal; ++iter) {
164 //the clustering model results in centers c_k= 1/n_k sum_i k(x_i,.) for all x_i points of cluster k
165 //we need to compute the squared distances between all centers and points that is
166 //d^2(c_k,x_i) = <c_k,c_k> -2 < c_k,x_i> + <x_i,x_i> for the i-th point.
167 //thus we precompute <c_k,c_k>= sum_ij k(x_i,x_j)/(n_k)^2 for all x_i,x_j points of cluster k
168 ckck.clear();
169 for(std::size_t i = 0; i != n; ++i){
170 std::size_t c1 = clusterMembership(i);
171 for(std::size_t j = 0; j != n; ++j){
172 std::size_t c2 = clusterMembership(j);
173 if(c1 != c2) continue;
174 ckck(c1) += kernelMatrix(i,j);
175 }
176 }
177 noalias(ckck) = safe_div(ckck,sqr(clusterSizes),0);
178
179 UIntVector newClusterMembership(n);
180 RealVector currentDistances(k);
181 for(std::size_t i = 0; i != n; ++i){
182 //compute squared distances between the i-th point and the centers
183 //we skip <x_i,x_i> as it is always the same for all elements and we don't need it for comparison
184 noalias(currentDistances) = ckck;
185 for(std::size_t j = 0; j != n; ++j){
186 std::size_t c = clusterMembership(j);
187 currentDistances(c) -= 2* kernelMatrix(i,j)/clusterSizes(c);
188 }
189 //choose the index with the smallest distance as new cluster
190 newClusterMembership(i) = (unsigned int) arg_min(currentDistances);
191 }
192 equal = boost::equal(
193 newClusterMembership,
194 clusterMembership
195 );
196 noalias(clusterMembership) = newClusterMembership;
197 //compute new sizes of clusters
198 clusterSizes.clear();
199 for(std::size_t i = 0; i != n; ++i){
200 ++clusterSizes(clusterMembership(i));
201 }
202
203 //if a cluster is empty then assign a random point to it
204 for(unsigned int i = 0; i != k; ++i){
205 if(clusterSizes(i) == 0){
206 std::size_t elem = random::uni(random::globalRng, std::size_t(0), n-1);
207 --clusterSizes(clusterMembership(elem));
208 clusterMembership(elem)=i;
209 clusterSizes(i) = 1;
210 }
211 }
212 }
213
214 //copy result in the expansion
216 expansion.setStructure(&kernel,dataset,true,k);
217 expansion.offset() = -ckck;
218 expansion.alpha().clear();
219 for(std::size_t i = 0; i != n; ++i){
220 std::size_t c = clusterMembership(i);
221 expansion.alpha()(i,c) = 2.0 / clusterSizes(c);
222 }
223
224 return expansion;
225}
226
} // namespace shark
#endif