11 blas::dense_matrix_adaptor<T const, blas::row_major, blas::continuous_dense_tag, blas::gpu_tag> inputs_unreg,
13 Shape const& patchSize,
14 blas::dense_matrix_adaptor<T, blas::row_major, blas::continuous_dense_tag, blas::gpu_tag> outputs_unreg
17 std::size_t depth = shape[2];
18 std::size_t outputHeight = shape[0]/patchSize[0];
19 std::size_t outputWidth = shape[1]/patchSize[1];
20 std::size_t outputPixels = outputWidth * outputHeight;
21 SIZE_CHECK(inputs_unreg.size2() == shape[0] * shape[1] * depth);
22 SIZE_CHECK(outputs_unreg.size2() == outputPixels * depth);
23 SIZE_CHECK(outputs_unreg.size1() == inputs_unreg.size1());
25 blas::gpu::detail::meta_kernel k(
"shark_max_pooling");
26 std::size_t width_index = k.add_arg<std::size_t>(
"width");
27 std::size_t height_index = k.add_arg<std::size_t>(
"height");
28 std::size_t depth_index = k.add_arg<std::size_t>(
"depth");
29 std::size_t sizeH_index = k.add_arg<std::size_t>(
"sizeH");
30 std::size_t sizeW_index = k.add_arg<std::size_t>(
"sizeW");
31 std::size_t numImages_index = k.add_arg<std::size_t>(
"numImages");
32 auto inputs = k.register_args(to_functor(inputs_unreg));
33 auto outputs = k.register_args(to_functor(outputs_unreg));
35 k <<
"const ulong outputWidth = width / sizeW;\n";
36 k <<
"const ulong outputHeight = height / sizeH;\n";
37 k <<
"const ulong numOutputs = outputWidth * outputHeight;\n";
38 k <<
"const ulong id = get_global_id(0);\n";
39 k <<
"if(id >= numImages * numOutputs) return;\n";
41 k <<
"const ulong im = id / numOutputs;\n";
42 k <<
"const ulong p = id % numOutputs;\n";
45 k <<
"const ulong starti = (p / outputWidth) * sizeH;\n";
46 k <<
"const ulong startj = (p % outputWidth) * sizeW;\n";
47 k <<
"const ulong endi = starti + sizeH;\n";
48 k <<
"const ulong endj = startj + sizeW;\n";
49 k <<
"for(ulong c = get_local_id(1); c < depth; c += get_local_size(1)){\n";
50 k <<
" ulong index = (starti * width + startj) * depth +c;\n";
51 auto im = k.expr<cl_ulong>(
"im");
52 auto index = k.expr<cl_ulong>(
"index");
54 k <<
" " << k.decl<T>(
"val") <<
" = "<< inputs(im, index) <<
";\n";
55 k <<
" for(ulong i = starti; i != endi; ++i){\n";
56 k <<
" for(ulong j = startj; j != endj; ++j){\n";
57 k <<
" index = (i * width + j) * depth + c;\n";
58 k <<
" val = max(val,"<<inputs(im, index)<<
");\n";
61 k <<
" " << outputs(im, k.expr<cl_ulong>(
"(p * depth + c)"))<<
" = val;\n";
65 boost::compute::kernel kernel = k.compile(outputs_unreg.queue().get_context());
68 kernel.set_arg(height_index, shape[0]);
69 kernel.set_arg(width_index, shape[1]);
70 kernel.set_arg(depth_index, shape[2]);
71 kernel.set_arg(sizeH_index, patchSize[0]);
72 kernel.set_arg(sizeW_index, patchSize[1]);
73 kernel.set_arg(numImages_index, inputs_unreg.size1());
76 std::size_t local_work_size[2] = {8, 4};
78 std::size_t global_work_size[2] = {
79 ((inputs_unreg.size1() * outputPixels + local_work_size[0] - 1) / local_work_size[0]) * local_work_size[0] ,
80 ((depth + local_work_size[1] - 1) / local_work_size[1]) * local_work_size[1]
82 outputs_unreg.queue().enqueue_nd_range_kernel(kernel, 2,
nullptr, global_work_size, local_work_size);
87 blas::dense_matrix_adaptor<T const, blas::row_major, blas::continuous_dense_tag, blas::gpu_tag> inputs_unreg,
88 blas::dense_matrix_adaptor<T const, blas::row_major, blas::continuous_dense_tag, blas::gpu_tag> coefficients_unreg,
90 Shape const& patchSize,
91 blas::dense_matrix_adaptor<T, blas::row_major, blas::continuous_dense_tag, blas::gpu_tag> derivatives_unreg
93 derivatives_unreg.clear();
94 std::size_t depth = shape[2];
95 std::size_t outputHeight = shape[0]/patchSize[0];
96 std::size_t outputWidth = shape[1]/patchSize[1];
97 std::size_t outputPixels = outputWidth * outputHeight;
98 SIZE_CHECK(derivatives_unreg.size2() == shape[0] * shape[1] * depth);
99 SIZE_CHECK(inputs_unreg.size2() == shape[0] * shape[1] * depth);
100 SIZE_CHECK(coefficients_unreg.size2() == outputPixels * depth);
101 SIZE_CHECK(derivatives_unreg.size1() == inputs_unreg.size1());
102 SIZE_CHECK(derivatives_unreg.size1() == coefficients_unreg.size1());
104 blas::gpu::detail::meta_kernel k(
"shark_max_pooling_derivative");
105 std::size_t width_index = k.add_arg<std::size_t>(
"width");
106 std::size_t height_index = k.add_arg<std::size_t>(
"height");
107 std::size_t depth_index = k.add_arg<std::size_t>(
"depth");
108 std::size_t sizeH_index = k.add_arg<std::size_t>(
"sizeH");
109 std::size_t sizeW_index = k.add_arg<std::size_t>(
"sizeW");
110 std::size_t numImages_index = k.add_arg<std::size_t>(
"numImages");
111 auto inputs = k.register_args(to_functor(inputs_unreg));
112 auto coefficients = k.register_args(to_functor(coefficients_unreg));
113 auto derivatives = k.register_args(to_functor(derivatives_unreg));
115 k <<
"const ulong outputWidth = width / sizeW;\n";
116 k <<
"const ulong outputHeight = height / sizeH;\n";
117 k <<
"const ulong numOutputs = outputWidth * outputHeight;\n";
118 k <<
"const ulong id = get_global_id(0);\n";
119 k <<
"if(id >= numImages * numOutputs) return;\n";
121 k <<
"const ulong im = id / numOutputs;\n";
122 k <<
"const ulong p = id % numOutputs;\n";
125 k <<
"const ulong starti = (p / outputWidth) * sizeH;\n";
126 k <<
"const ulong startj = (p % outputWidth) * sizeW;\n";
127 k <<
"const ulong endi = starti + sizeH;\n";
128 k <<
"const ulong endj = startj + sizeW;\n";
129 k <<
"for(ulong c = get_local_id(1); c < depth; c += get_local_size(1)){\n";
130 k <<
" ulong index = (starti * width + startj) * depth +c;\n";
131 auto im = k.expr<cl_ulong>(
"im");
132 auto index = k.expr<cl_ulong>(
"index");
134 k <<
" " << k.decl<T>(
"maxVal")<<
" = " <<inputs(im, index) <<
";\n";
135 k <<
" ulong maxIndex = index;\n";
136 k <<
" for(ulong i = starti; i != endi; ++i){\n";
137 k <<
" for(ulong j = startj; j != endj; ++j){\n";
138 k <<
" index = (i * width + j) * depth + c;\n";
139 k <<
" if("<<inputs(im, index)<<
" > maxVal){\n";
140 k <<
" maxVal = "<<inputs(im, index)<<
";\n";
141 k <<
" maxIndex = index;\n";
145 k <<
" " << derivatives(im, k.expr<cl_ulong>(
"maxIndex")) <<
" = " << coefficients(im,k.expr<cl_ulong>(
"(p * depth +c)")) <<
";\n";
149 boost::compute::kernel kernel = k.compile(derivatives_unreg.queue().get_context());
152 kernel.set_arg(height_index, shape[0]);
153 kernel.set_arg(width_index, shape[1]);
154 kernel.set_arg(depth_index, shape[2]);
155 kernel.set_arg(sizeH_index, patchSize[0]);
156 kernel.set_arg(sizeW_index, patchSize[1]);
157 kernel.set_arg(numImages_index, inputs_unreg.size1());
160 std::size_t local_work_size[2] = {8, 4};
162 std::size_t global_work_size[2] = {
163 ((inputs_unreg.size1() * outputPixels + local_work_size[0] - 1) / local_work_size[0]) * local_work_size[0] ,
164 ((depth + local_work_size[1] - 1) / local_work_size[1]) * local_work_size[1]
166 derivatives_unreg.queue().enqueue_nd_range_kernel(kernel, 2,
nullptr, global_work_size, local_work_size);