vector_fold.hpp
Go to the documentation of this file.
1/*!
2 * \brief kernels for folding vectors with OpenCL
3 *
4 * \author O. Krause
5 * \date 2016
6 *
7 *
8 * \par Copyright 1995-2015 Shark Development Team
9 *
10 * <BR><HR>
11 * This file is part of Shark.
12 * <http://image.diku.dk/shark/>
13 *
14 * Shark is free software: you can redistribute it and/or modify
15 * it under the terms of the GNU Lesser General Public License as published
16 * by the Free Software Foundation, either version 3 of the License, or
17 * (at your option) any later version.
18 *
19 * Shark is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU Lesser General Public License for more details.
23 *
24 * You should have received a copy of the GNU Lesser General Public License
25 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
26 *
27 */
28#ifndef REMORA_KERNELS_CLBLAS_VECTOR_FOLD_HPP
29#define REMORA_KERNELS_CLBLAS_VECTOR_FOLD_HPP
30
31#include "../../expression_types.hpp"
32#include "../../detail/traits.hpp"
33#include <boost/compute/container/array.hpp>
34#include <boost/compute/algorithm/copy_n.hpp>
35namespace remora{namespace bindings{
36
37template<class F, class V>
38void vector_fold(vector_expression<V, gpu_tag> const& v_unreg, typename F::result_type& value, dense_tag) {
39 if(v_unreg().size() == 0) return;
40 auto& queue = v_unreg().queue();
41 typedef typename F::result_type value_type;
42 gpu::detail::meta_kernel k("blas_vector_fold");
43 std::size_t size_index = k.add_arg<std::size_t>("size");
44 auto v = k.register_args(to_functor(v_unreg));
45 auto f = k.register_args(F());
46
47 boost::compute::array<value_type,1> device_result;
48 boost::compute::copy_n(&value, 1, device_result.begin(), queue);
49 auto exprSubFold = k.expr<value_type>("subfold[get_local_id(0)]");
50 k << "__local " <<k.decl<value_type>("subfold")<< "[TILE_DIM];\n";
51 k << exprSubFold<<" = "<<v(k.expr<cl_uint>("min(size-1,get_local_id(0))"))<<";\n";
52 k << "for(uint i = TILE_DIM + get_local_id(0); i < size; i += TILE_DIM){\n ";
53 k << exprSubFold << '=' << f(exprSubFold,v(k.expr<cl_uint>("i")))<<";\n";
54 k << "}\n";
55 k << "barrier(CLK_LOCAL_MEM_FENCE);\n";//wait until all threads are done with computing
56 //sum up the rows
57 k << "if(get_local_id(0) == 0){\n";
58 k << " for(uint i = 1 ; i < min((uint)size,(uint)TILE_DIM); ++i){\n";
59 k << " subfold[0] =" << f(k.expr<value_type>("subfold[0]"),k.expr<value_type>("subfold[i]"))<<";\n";
60 k << " }\n ";
61 k << device_result.begin()[0]<< "= subfold[0];\n";
62 k << "}\n";
63
64 std::size_t TILE_DIM = 32;
65 boost::compute::kernel kernel = k.compile(queue.get_context(), "-DTILE_DIM=32");
66 kernel.set_arg(size_index, v_unreg().size());
67
68 std::size_t global_work_size[1] = {TILE_DIM};
69 std::size_t local_work_size[1] = {TILE_DIM};
70 queue.enqueue_nd_range_kernel(kernel, 1,nullptr, global_work_size, local_work_size);
71 boost::compute::copy_n(device_result.begin(), 1, &value, queue);
72}
73
74
75}}
76#endif