Pooling.h
Go to the documentation of this file.
1#ifndef SHARK_CORE_IMAGES_OPENCL_POOLING_H
2#define SHARK_CORE_IMAGES_OPENCL_POOLING_H
3
4#include <shark/LinAlg/Base.h>
6#include <shark/Core/Shape.h>
7namespace shark{
8namespace image{
9template<class T>
11 blas::dense_matrix_adaptor<T const, blas::row_major, blas::continuous_dense_tag, blas::gpu_tag> inputs_unreg,
12 Shape const& shape,
13 Shape const& patchSize,
14 blas::dense_matrix_adaptor<T, blas::row_major, blas::continuous_dense_tag, blas::gpu_tag> outputs_unreg
15){
16
17 std::size_t depth = shape[2];
18 std::size_t outputHeight = shape[0]/patchSize[0];
19 std::size_t outputWidth = shape[1]/patchSize[1];
20 std::size_t outputPixels = outputWidth * outputHeight;
21 SIZE_CHECK(inputs_unreg.size2() == shape[0] * shape[1] * depth);
22 SIZE_CHECK(outputs_unreg.size2() == outputPixels * depth);
23 SIZE_CHECK(outputs_unreg.size1() == inputs_unreg.size1());
24
25 blas::gpu::detail::meta_kernel k("shark_max_pooling");
26 std::size_t width_index = k.add_arg<std::size_t>("width");
27 std::size_t height_index = k.add_arg<std::size_t>("height");
28 std::size_t depth_index = k.add_arg<std::size_t>("depth");
29 std::size_t sizeH_index = k.add_arg<std::size_t>("sizeH");
30 std::size_t sizeW_index = k.add_arg<std::size_t>("sizeW");
31 std::size_t numImages_index = k.add_arg<std::size_t>("numImages");
32 auto inputs = k.register_args(to_functor(inputs_unreg));
33 auto outputs = k.register_args(to_functor(outputs_unreg));
34
35 k << "const ulong outputWidth = width / sizeW;\n";
36 k << "const ulong outputHeight = height / sizeH;\n";
37 k << "const ulong numOutputs = outputWidth * outputHeight;\n";
38 k << "const ulong id = get_global_id(0);\n";
39 k << "if(id >= numImages * numOutputs) return;\n"; //bounds checking for groups
40
41 k << "const ulong im = id / numOutputs;\n";//extract image id
42 k << "const ulong p = id % numOutputs;\n";//extract patch id
43
44 //get start and end-coordinates of the patch
45 k << "const ulong starti = (p / outputWidth) * sizeH;\n";
46 k << "const ulong startj = (p % outputWidth) * sizeW;\n";
47 k << "const ulong endi = starti + sizeH;\n";
48 k << "const ulong endj = startj + sizeW;\n";
49 k << "for(ulong c = get_local_id(1); c < depth; c += get_local_size(1)){\n";
50 k << " ulong index = (starti * width + startj) * depth +c;\n";
51 auto im = k.expr<cl_ulong>("im");
52 auto index = k.expr<cl_ulong>("index");
53 //traverse the patch on the input image and compute maximum
54 k << " " << k.decl<T>("val") <<" = "<< inputs(im, index) << ";\n";
55 k << " for(ulong i = starti; i != endi; ++i){\n";
56 k << " for(ulong j = startj; j != endj; ++j){\n";
57 k << " index = (i * width + j) * depth + c;\n";
58 k << " val = max(val,"<<inputs(im, index)<<");\n";
59 k << " }\n";
60 k << " }\n";
61 k << " " << outputs(im, k.expr<cl_ulong>("(p * depth + c)"))<<" = val;\n";
62 k << "}\n";
63
64 //compile kernel
65 boost::compute::kernel kernel = k.compile(outputs_unreg.queue().get_context());
66
67 //enqueue kernel with kernel args
68 kernel.set_arg(height_index, shape[0]);
69 kernel.set_arg(width_index, shape[1]);
70 kernel.set_arg(depth_index, shape[2]);
71 kernel.set_arg(sizeH_index, patchSize[0]);
72 kernel.set_arg(sizeW_index, patchSize[1]);
73 kernel.set_arg(numImages_index, inputs_unreg.size1());
74
75
76 std::size_t local_work_size[2] = {8, 4};
77 //round global work size up to next multiple of local work size
78 std::size_t global_work_size[2] = {
79 ((inputs_unreg.size1() * outputPixels + local_work_size[0] - 1) / local_work_size[0]) * local_work_size[0] ,
80 ((depth + local_work_size[1] - 1) / local_work_size[1]) * local_work_size[1]
81 };
82 outputs_unreg.queue().enqueue_nd_range_kernel(kernel, 2, nullptr, global_work_size, local_work_size);
83}
84
85template<class T>
87 blas::dense_matrix_adaptor<T const, blas::row_major, blas::continuous_dense_tag, blas::gpu_tag> inputs_unreg,
88 blas::dense_matrix_adaptor<T const, blas::row_major, blas::continuous_dense_tag, blas::gpu_tag> coefficients_unreg,
89 Shape const& shape,
90 Shape const& patchSize,
91 blas::dense_matrix_adaptor<T, blas::row_major, blas::continuous_dense_tag, blas::gpu_tag> derivatives_unreg
92){
93 derivatives_unreg.clear();
94 std::size_t depth = shape[2];
95 std::size_t outputHeight = shape[0]/patchSize[0];
96 std::size_t outputWidth = shape[1]/patchSize[1];
97 std::size_t outputPixels = outputWidth * outputHeight;
98 SIZE_CHECK(derivatives_unreg.size2() == shape[0] * shape[1] * depth);
99 SIZE_CHECK(inputs_unreg.size2() == shape[0] * shape[1] * depth);
100 SIZE_CHECK(coefficients_unreg.size2() == outputPixels * depth);
101 SIZE_CHECK(derivatives_unreg.size1() == inputs_unreg.size1());
102 SIZE_CHECK(derivatives_unreg.size1() == coefficients_unreg.size1());
103
104 blas::gpu::detail::meta_kernel k("shark_max_pooling_derivative");
105 std::size_t width_index = k.add_arg<std::size_t>("width");
106 std::size_t height_index = k.add_arg<std::size_t>("height");
107 std::size_t depth_index = k.add_arg<std::size_t>("depth");
108 std::size_t sizeH_index = k.add_arg<std::size_t>("sizeH");
109 std::size_t sizeW_index = k.add_arg<std::size_t>("sizeW");
110 std::size_t numImages_index = k.add_arg<std::size_t>("numImages");
111 auto inputs = k.register_args(to_functor(inputs_unreg));
112 auto coefficients = k.register_args(to_functor(coefficients_unreg));
113 auto derivatives = k.register_args(to_functor(derivatives_unreg));
114
115 k << "const ulong outputWidth = width / sizeW;\n";
116 k << "const ulong outputHeight = height / sizeH;\n";
117 k << "const ulong numOutputs = outputWidth * outputHeight;\n";
118 k << "const ulong id = get_global_id(0);\n";
119 k << "if(id >= numImages * numOutputs) return;\n"; //bounds checking for groups
120
121 k << "const ulong im = id / numOutputs;\n";//extract image id
122 k << "const ulong p = id % numOutputs;\n";//extract patch id
123
124 //get start and end-coordinates of the patch
125 k << "const ulong starti = (p / outputWidth) * sizeH;\n";
126 k << "const ulong startj = (p % outputWidth) * sizeW;\n";
127 k << "const ulong endi = starti + sizeH;\n";
128 k << "const ulong endj = startj + sizeW;\n";
129 k << "for(ulong c = get_local_id(1); c < depth; c += get_local_size(1)){\n";
130 k << " ulong index = (starti * width + startj) * depth +c;\n";
131 auto im = k.expr<cl_ulong>("im");
132 auto index = k.expr<cl_ulong>("index");
133 //traverse the patch on the input image and compute maximum
134 k << " " << k.decl<T>("maxVal")<<" = " <<inputs(im, index) << ";\n";
135 k << " ulong maxIndex = index;\n";
136 k << " for(ulong i = starti; i != endi; ++i){\n";
137 k << " for(ulong j = startj; j != endj; ++j){\n";
138 k << " index = (i * width + j) * depth + c;\n";
139 k << " if("<<inputs(im, index)<<" > maxVal){\n";
140 k << " maxVal = "<<inputs(im, index)<<";\n";
141 k << " maxIndex = index;\n";
142 k << " }\n";
143 k << " }\n";
144 k << " }\n";
145 k << " " << derivatives(im, k.expr<cl_ulong>("maxIndex")) << " = " << coefficients(im,k.expr<cl_ulong>("(p * depth +c)")) << ";\n";
146 k << "}\n";
147
148 //compile kernel
149 boost::compute::kernel kernel = k.compile(derivatives_unreg.queue().get_context());
150
151 //enqueue kernel with kernel args
152 kernel.set_arg(height_index, shape[0]);
153 kernel.set_arg(width_index, shape[1]);
154 kernel.set_arg(depth_index, shape[2]);
155 kernel.set_arg(sizeH_index, patchSize[0]);
156 kernel.set_arg(sizeW_index, patchSize[1]);
157 kernel.set_arg(numImages_index, inputs_unreg.size1());
158
159
160 std::size_t local_work_size[2] = {8, 4};
161 //round global work size up to next multiple of local work size
162 std::size_t global_work_size[2] = {
163 ((inputs_unreg.size1() * outputPixels + local_work_size[0] - 1) / local_work_size[0]) * local_work_size[0] ,
164 ((depth + local_work_size[1] - 1) / local_work_size[1]) * local_work_size[1]
165 };
166 derivatives_unreg.queue().enqueue_nd_range_kernel(kernel, 2, nullptr, global_work_size, local_work_size);
167}
168
169
170}}
171
172#endif