10 blas::dense_matrix_adaptor<T const, blas::row_major, blas::continuous_dense_tag, blas::gpu_tag> images_unreg,
12 blas::dense_matrix_adaptor<T const, blas::row_major, blas::continuous_dense_tag, blas::gpu_tag> points_unreg, std::size_t pointsPerImage,
13 blas::dense_matrix_adaptor<T, blas::row_major, blas::continuous_dense_tag, blas::gpu_tag> values_unreg
18 blas::gpu::detail::meta_kernel k(
"shark_spline_interpolation_2D");
19 std::size_t width_index = k.add_arg<std::size_t>(
"width");
20 std::size_t height_index = k.add_arg<std::size_t>(
"height");
21 std::size_t depth_index = k.add_arg<std::size_t>(
"depth");
22 std::size_t stride_index = k.add_arg<std::size_t>(
"stride");
23 auto images = k.register_args(to_functor(images_unreg));
24 auto points = k.register_args(to_functor(points_unreg));
25 auto values = k.register_args(to_functor(values_unreg));
27 k <<
"const ulong im = get_global_id(0);\n";
28 k <<
"const ulong p = get_global_id(1);\n";
29 k <<
"const ulong pointIdx = p + im * stride;\n";
32 auto pointx = points(k.expr<cl_ulong>(
"pointIdx"),1);
33 auto pointy = points(k.expr<cl_ulong>(
"pointIdx"),0);
34 k <<
"int basex = (int) ("<<pointx<<
" * width);\n";
35 k <<
"int basey = (int) ("<<pointy<<
" * height);\n";
37 k <<
"int px[4] = {clamp(basex - 1, 0, (int)width - 1), clamp(basex, 0, (int)width - 1), clamp(basex + 1, 0, (int)width - 1), clamp(basex + 2, 0, (int)width - 1)};\n";
38 k <<
"int py[4] = {clamp(basey - 1, 0, (int)height - 1), clamp(basey, 0, (int)height - 1), clamp(basey + 1, 0, (int)height - 1), clamp(basey + 2, 0, (int)height - 1)};\n";
41 k << k.decl<T>(
"t") <<
"=" << pointx <<
" * width - basex;\n";
42 k << k.decl<T>(
"t2") <<
" = t * t;\n";
43 k << k.decl<T>(
"t3") <<
" = t2 * t;\n";
44 k << k.decl<T>(
"x")<<
"[4]={-t3+3*t2-3*t+1, 3*t3-6*t2+4, -3*t3+3*t2+3*t+1, t3};\n";
47 k <<
"t = " << pointy <<
" * height - basey;\n";
49 k <<
"t3 = t2 * t;\n";
50 k << k.decl<T>(
"y")<<
"[4]={-t3+3*t2-3*t+1, 3*t3-6*t2+4, -3*t3+3*t2+3*t+1, t3};\n";
53 k <<
"for(int k = 0;k < 4; ++k){\n";
54 k <<
" for(int l = 0;l < 4; ++l){\n";
55 k <<
" const ulong imIdx = (px[l] + width * py[k]) * depth;\n";
56 k <<
" const ulong valIdx = p * depth;\n";
57 k <<
" for(int c = 0;c < depth; ++c){\n";
58 k <<
" "<<values(k.expr<cl_ulong>(
"im"), k.expr<cl_ulong>(
"valIdx + c") )
59 <<
" += (x[l]*y[k]/36) * "<<images(k.expr<cl_ulong>(
"im"), k.expr<cl_ulong>(
"imIdx + c" ))<<
";\n";
65 boost::compute::kernel kernel = k.compile(values_unreg.queue().get_context());
68 std::size_t stride = (pointsPerImage == points_unreg.size1())? 0: pointsPerImage;
69 kernel.set_arg(height_index, shape[0]);
70 kernel.set_arg(width_index, shape[1]);
71 kernel.set_arg(depth_index, shape[2]);
72 kernel.set_arg(stride_index, stride);
74 std::size_t global_work_size[2] = {images_unreg.size1(), pointsPerImage};
75 values_unreg.queue().enqueue_nd_range_kernel(kernel, 2,
nullptr, global_work_size,
nullptr);
80 blas::dense_matrix_adaptor<T const, blas::row_major, blas::continuous_dense_tag, blas::gpu_tag> imageDerivatives_unreg,
82 blas::dense_matrix_adaptor<T const, blas::row_major, blas::continuous_dense_tag, blas::gpu_tag> points_unreg, std::size_t pointsPerImage,
83 blas::dense_matrix_adaptor<T, blas::row_major, blas::continuous_dense_tag, blas::gpu_tag> results_unreg
86 results_unreg.clear();
92 blas::gpu::detail::meta_kernel k(
"shark_spline_interpolation_2D_derivative");
93 std::size_t width_index = k.add_arg<std::size_t>(
"width");
94 std::size_t height_index = k.add_arg<std::size_t>(
"height");
95 std::size_t depth_index = k.add_arg<std::size_t>(
"depth");
96 std::size_t stride_index = k.add_arg<std::size_t>(
"stride");
97 std::size_t numPoints_index = k.add_arg<std::size_t>(
"numPoints");
98 auto imageDerivatives = k.register_args(to_functor(imageDerivatives_unreg));
99 auto points = k.register_args(to_functor(points_unreg));
100 auto results = k.register_args(to_functor(results_unreg));
102 k <<
"const ulong im = get_group_id(0);\n";
103 k <<
"const ulong c = get_local_id(1);\n";
105 k <<
"for(int p = 0;p < numPoints; ++p){\n";
106 k <<
" const ulong pointIdx = p + im * stride;\n";
107 auto pointx = points(k.expr<cl_ulong>(
"pointIdx"),1);
108 auto pointy = points(k.expr<cl_ulong>(
"pointIdx"),0);
109 k <<
" int basex = (int) ("<<pointx<<
" * width);\n";
110 k <<
" int basey = (int) ("<<pointy<<
" * height);\n";
112 k <<
" int px[4] = {clamp(basex - 1, 0, (int)width - 1), clamp(basex, 0, (int)width - 1), clamp(basex + 1, 0, (int)width - 1), clamp(basex + 2, 0, (int)width - 1)};\n";
113 k <<
" int py[4] = {clamp(basey - 1, 0, (int)height - 1), clamp(basey, 0, (int)height - 1), clamp(basey + 1, 0, (int)height - 1), clamp(basey + 2, 0, (int)height - 1)};\n";
116 k << k.decl<T>(
"t") <<
"=" << pointx <<
" * width - basex;\n";
117 k << k.decl<T>(
"t2") <<
" = t * t;\n";
118 k << k.decl<T>(
"t3") <<
" = t2 * t;\n";
119 k << k.decl<T>(
"x")<<
"[4]={-t3+3*t2-3*t+1, 3*t3-6*t2+4, -3*t3+3*t2+3*t+1, t3};\n";
122 k <<
" t = " << pointy <<
" * height - basey;\n";
123 k <<
" t2 = t * t;\n";
124 k <<
" t3 = t2 * t;\n";
125 k << k.decl<T>(
"y")<<
"[4]={-t3+3*t2-3*t+1, 3*t3-6*t2+4, -3*t3+3*t2+3*t+1, t3};\n";
128 k <<
" for(int k = 0;k < 4; ++k){\n";
129 k <<
" for(int l = 0;l < 4; ++l){\n";
130 k <<
" const ulong imIdx = (px[l] + width * py[k]) * depth;\n";
131 k <<
" const ulong valIdx = p * depth;\n";
132 k <<
" "<<results(k.expr<cl_ulong>(
"im"), k.expr<cl_ulong>(
"imIdx + c" ))
133 <<
" += (x[l]*y[k]/36) * "<<imageDerivatives(k.expr<cl_ulong>(
"im"), k.expr<cl_ulong>(
"valIdx + c" ))<<
";\n";
138 boost::compute::kernel kernel = k.compile(results_unreg.queue().get_context());
141 std::size_t stride = (pointsPerImage == points_unreg.size1())? 0: pointsPerImage;
142 kernel.set_arg(height_index, shape[0]);
143 kernel.set_arg(width_index, shape[1]);
144 kernel.set_arg(depth_index, shape[2]);
145 kernel.set_arg(stride_index, stride);
146 kernel.set_arg(numPoints_index, pointsPerImage);
148 std::size_t global_work_size[2] = {imageDerivatives_unreg.size1() * 1, shape[2]};
149 std::size_t local_work_size[2] = {1,shape[2]};
150 results_unreg.queue().enqueue_nd_range_kernel(kernel, 2,
nullptr, global_work_size, local_work_size);