Reorder.h
Go to the documentation of this file.
1#ifndef SHARK_CORE_IMAGE_GPU_REORDER_H
2#define SHARK_CORE_IMAGE_GPU_REORDER_H
3
4#include <shark/LinAlg/Base.h>
6#include <shark/Core/Shape.h>
7namespace shark{
8
9namespace image{
10template<class T>
12 blas::dense_vector_adaptor<T const, blas::continuous_dense_tag, blas::gpu_tag> inputs_unreg,
13 blas::dense_vector_adaptor<T, blas::continuous_dense_tag, blas::gpu_tag> outputs_unreg,
14 std::size_t size[4],
15 std::size_t stride[4]
16){
17 SIZE_CHECK(inputs_unreg.size() == outputs_unreg.size());
18 SIZE_CHECK(inputs_unreg.size() == size[0]*size[1]*size[2]*size[3]);
19
20 blas::gpu::detail::meta_kernel k("shark_reorder");
21 std::size_t size_index = k.add_arg<boost::compute::uint4_>("size");
22 std::size_t stride_index = k.add_arg<boost::compute::uint4_>("stride");
23 auto inputs = k.register_args(to_functor(inputs_unreg));
24 auto outputs = k.register_args(to_functor(outputs_unreg));
25
26 k << "const ulong id = get_global_id(0);\n";
27 k << "const ulong i0 = id / size.s1;\n";
28 k << "const ulong i1 = id % size.s1;\n";
29 //obtain base index for input and output
30 k << "const ulong startInput = i0 * stride.s0 + i1 * stride.s1;\n";
31 k << "ulong startOutput = (i0 * size.s1 + i1) * size.s2 * size.s3;\n";
32
33 k << "for(ulong i2 = get_local_id(1); i2 < size.s2; i2 += get_local_size(1)){;\n";
34 k << " for(ulong i3 = get_local_id(2); i3 < size.s3; i3 += get_local_size(2)){;\n";
35 k << " ulong indexIn = startInput + stride.s2 * i2 + stride.s3 * i3;\n";
36 k << " ulong indexOut = startOutput + size.s3 * i2 + i3;\n";
37 k << " " << outputs(k.expr<cl_ulong>("indexOut"))<<" = "<<inputs(k.expr<cl_ulong>("indexIn"))<<";\n";
38 k << " }\n";
39 k << "}\n";
40
41 //compile kernel
42 boost::compute::kernel kernel = k.compile(outputs_unreg.queue().get_context());
43
44 //enqueue kernel with kernel args
45 kernel.set_arg(size_index, boost::compute::uint4_({unsigned(size[0]),unsigned(size[1]),unsigned(size[2]),unsigned(size[3])}));
46 kernel.set_arg(stride_index, boost::compute::uint4_({unsigned(stride[0]),unsigned(stride[1]),unsigned(stride[2]),unsigned(stride[3])}));
47
48
49 std::size_t local_work_size[3] = {1, 8, 4};
50 std::size_t global_work_size[3] = {size[0] * size[1], local_work_size[1], local_work_size[2] };
51 outputs_unreg.queue().enqueue_nd_range_kernel(kernel, 3, nullptr, global_work_size, local_work_size);
52}
53
54}}
55
56#endif