KHCTree.h
Go to the documentation of this file.
1//===========================================================================
2/*!
3 *
4 *
5 * \brief Tree for nearest neighbor search in kernel-induced feature spaces.
6 *
7 *
8 *
9 * \author T. Glasmachers
10 * \date 2011
11 *
12 *
13 * \par Copyright 1995-2017 Shark Development Team
14 *
15 * <BR><HR>
16 * This file is part of Shark.
17 * <https://shark-ml.github.io/Shark/>
18 *
19 * Shark is free software: you can redistribute it and/or modify
20 * it under the terms of the GNU Lesser General Public License as published
21 * by the Free Software Foundation, either version 3 of the License, or
22 * (at your option) any later version.
23 *
24 * Shark is distributed in the hope that it will be useful,
25 * but WITHOUT ANY WARRANTY; without even the implied warranty of
26 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 * GNU Lesser General Public License for more details.
28 *
29 * You should have received a copy of the GNU Lesser General Public License
30 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
31 *
32 */
33//===========================================================================
34
35#ifndef SHARK_ALGORITHMS_NEARESTNEIGHBORS_KHCTREE_H
36#define SHARK_ALGORITHMS_NEARESTNEIGHBORS_KHCTREE_H
37
38
40#include <shark/LinAlg/Base.h>
41#include <boost/array.hpp>
42
43namespace shark {
44
45
46///
47/// \brief KHC-tree, a binary space-partitioning tree
48///
49/// \par
50/// KHC-tree data structure for efficient nearest
51/// neighbor searches in kernel-induced feature
52/// spaces.
53/// "KHC" stands for Kernel Hierarchical Clustering.
54/// The space separation is based on distances to
55/// cluster centers.
56///
57/// \par
58/// The tree is constructed from data by splitting
59/// along the pair of data points with largest
60/// distance. This quantity is approximated using
61/// a given number of randomly sampled pairs, which
62/// is controlled by the template parameter
63/// CuttingAccuracy.
64///
65/// \par
66/// The bucket size for the tree is always one,
67/// such that there is a unique point in each leaf
68/// cell.
69///
///Since the KHCTree needs direct access to the elements, its template parameter is not the actual point type
///but the range the points are stored in. Be aware that this range should be a View when a Dataset is used as storage,
///since during construction the KHC-Tree needs random access to the elements.
73/// \ingroup space_trees
74template <class Container, int CuttingAccuracy = 25>
75class KHCTree : public BinaryTree<typename Container::value_type>
76{
77public:
79 typedef typename Container::value_type value_type;
82
83 /// Construct the tree from data.
84 /// It is assumed that the container exceeds
85 /// the lifetime of the KHCTree (which holds
86 /// only references to the points), and that
87 /// the memory locations of the points remain
88 /// unchanged.
89 KHCTree(Container const& points, kernel_type const* kernel, TreeConstruction tc = TreeConstruction())
90 : base_type(points.size())
92 , m_normalInvNorm(1.0)
93 {
94 //create a list to the iterator elements as temporary storage
95 //we need indexed operators to have a fast lookup of the position of the elements in the container
96 std::vector<const_iterator> elements(m_size);
97 boost::iota(elements,const_iterator(boost::begin(points),0));
98
99 buildTree(tc,elements);
100
101 //after the creation of the trees, the iterators in the array are sorted in the order,
102 //they are referenced by the nodes.so we can create the indexList using the indizes of the iterators
103 for(std::size_t i = 0; i != m_size; ++i){
104 mp_indexList[i] = elements[i].index();
105 }
106 }
107
108
109 /// \par
110 /// Compute the squared distance of this cell to
111 /// the given reference point, or alternatively
112 /// a lower bound on this value.
113 double squaredDistanceLowerBound(value_type const& reference) const{
114 double dist = 0.0;
115 KHCTree const* t = this;
116 KHCTree const* p = (KHCTree const*)mep_parent;
117 while (p != NULL){
118 double v = p->distanceFromPlane(reference);
119 if (t == p->mp_right)
120 v = -v;
121 if (v > dist)
122 dist = v;
123 t = p;
124 p = (KHCTree const*)p->mep_parent;
125 }
126 return (dist * dist);
127 }
128
129protected:
131 using base_type::mp_left;
134 using base_type::m_size;
135 using base_type::m_nodes;
136
137 /// (internal) construction of a non-root node
138 KHCTree(KHCTree* parent, std::size_t* list, std::size_t size)
139 : base_type(parent, list, size)
141 , m_normalInvNorm(1.0)
142 { }
143
144 /// (internal) construction method:
145 /// median-cuts of the dimension with widest spread
146 template<class Range>
147 void buildTree(TreeConstruction tc, Range& points){
148 typedef typename Range::iterator range_iterator;
149
150 //check whether we are finished
151 if (tc.maxDepth() == 0 || m_size <= tc.maxBucketSize()) {
152 m_nodes = 1;
153 return;
154 }
155
156 // use only a subset of size at most CuttingAccuracy
157 // to estimate the vector along the longest
158 // distance
159 if (m_size <= CuttingAccuracy){
160 calculateNormal(points);
161 }
162 else{
163 boost::array<const_iterator,CuttingAccuracy> samples;
164 for(std::size_t i = 0; i != CuttingAccuracy; i++)
165 samples[i] = points[m_size * (2*i+1) / (2*CuttingAccuracy)];
166 calculateNormal(samples);
167 }
168
169 //calculate the distance from the plane for every point in the list
170 std::vector<double> distance(m_size);
171 for(std::size_t i = 0; i != m_size; ++i){
172 distance[i] = funct(*points[i]);
173 }
174
175
176 // split the list into sub-cells
177 range_iterator split = this->splitList(distance,points);
178 range_iterator begin = boost::begin(points);
179 range_iterator end = boost::end(points);
180
181 if (split == end) {//can't split points.
182 m_nodes = 1;
183 return;
184 }
185
186 // create sub-nodes
187 std::size_t leftSize = split-begin;
188 mp_left = new KHCTree(this, mp_indexList, leftSize);
189 mp_right = new KHCTree(this, mp_indexList + leftSize, m_size - leftSize);
190
191 // recurse
192 boost::iterator_range<range_iterator> left(begin,split);
193 boost::iterator_range<range_iterator> right(split,end);
194 ((KHCTree*)mp_left)->buildTree(tc.nextDepthLevel(),left);
195 ((KHCTree*)mp_right)->buildTree(tc.nextDepthLevel(),right);
196 m_nodes = 1 + mp_left->nodes() + mp_right->nodes();
197 }
198
199 template<class Range>
200 void calculateNormal(Range const& samples){
201 std::size_t numSamples =batchSize(samples);
202 double best_dist2 = -1.0;
203 for (std::size_t i=1; i != numSamples; i++){
204 for (std::size_t j = 0; j != i; j++){
205 double dist2 = mep_kernel->featureDistanceSqr(*samples[i], *samples[j]);
206 if (dist2 > best_dist2){
207 best_dist2 = dist2;
208 mep_positive = samples[i];
209 mep_negative = samples[j];
210 }
211 }
212 }
213 m_normalInvNorm = 1.0 / std::sqrt(best_dist2);
214 if (! (boost::math::isfinite)(m_normalInvNorm))
215 m_normalInvNorm = 1.0;
216 }
217
218
219
220 /// function describing the separating hyperplane
221 double funct(value_type const& reference) const{
222 double result = mep_kernel->eval(*mep_positive, reference);
223 result -= mep_kernel->eval(*mep_negative, reference);
224 result *= m_normalInvNorm;
225 return result;
226 }
227
228 /// kernel function
230
231 /// "positive" side cluster center
233
234 /// "negative" side cluster center
236
	/// one divided by the distance between the "positive" and "negative" cluster centers
239};
240
241
242}
243#endif