#ifndef REMORA_KERNELS_CLBLAS_VECTOR_FOLD_HPP
#define REMORA_KERNELS_CLBLAS_VECTOR_FOLD_HPP

#include "../../expression_types.hpp"
#include "../../detail/traits.hpp"
#include <boost/compute/container/array.hpp>
#include <boost/compute/algorithm/copy_n.hpp>

namespace remora{
namespace bindings{
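
//Folds a dense gpu vector into a scalar using the binary functor F.
//A single OpenCL work-group performs the reduction and the result is
//written back into value on the host.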
template<class F, class V>
void vector_fold(vector_expression<V, gpu_tag> const& v_unreg, typename F::result_type& value, dense_tag) {
	if(v_unreg().size() == 0) return;
	auto& queue = v_unreg().queue();
	typedef typename F::result_type value_type;
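
	//the kernel source is assembled at runtime via the meta_kernel facility;
	//the vector expression and the functor are registered as kernel arguments.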
	gpu::detail::meta_kernel k("blas_vector_fold");
	std::size_t size_index = k.add_arg<std::size_t>("size");
	auto v = k.register_args(to_functor(v_unreg));
	auto f = k.register_args(F());
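
	//a one-element device array receives the reduction result; the current
	//value of the accumulator is copied into it before the kernel is run.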
	boost::compute::array<value_type,1> device_result;
	boost::compute::copy_n(&value, 1, device_result.begin(), queue);
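
	//kernel body: each of the TILE_DIM work-items folds a strided slice of the
	//vector into its slot of the __local array subfold; after the barrier,
	//work-item 0 folds the partial results and stores them in device_result.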
	auto exprSubFold = k.expr<value_type>("subfold[get_local_id(0)]");
	k << "__local " << k.decl<value_type>("subfold") << "[TILE_DIM];\n";
	k << exprSubFold << " = " << v(k.expr<cl_uint>("min(size-1,get_local_id(0))")) << ";\n";
	k << "for(uint i = TILE_DIM + get_local_id(0); i < size; i += TILE_DIM){\n";
	k << exprSubFold << '=' << f(exprSubFold, v(k.expr<cl_uint>("i"))) << ";\n";
	k << "}\n";
	k << "barrier(CLK_LOCAL_MEM_FENCE);\n";
	k << "if(get_local_id(0) == 0){\n";
	k << "    for(uint i = 1 ; i < min((uint)size,(uint)TILE_DIM); ++i){\n";
	k << "        subfold[0] =" << f(k.expr<value_type>("subfold[0]"), k.expr<value_type>("subfold[i]")) << ";\n";
	k << "    }\n";
	k << device_result.begin()[0] << "= subfold[0];\n";
	k << "}\n";
	std::size_t TILE_DIM = 32;
	boost::compute::kernel kernel = k.compile(queue.get_context(), "-DTILE_DIM=32");
	kernel.set_arg(size_index, v_unreg().size());
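
	//launch exactly one work-group of TILE_DIM work-items, then copy the
	//result back into value on the host.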
	std::size_t global_work_size[1] = {TILE_DIM};
	std::size_t local_work_size[1] = {TILE_DIM};
	queue.enqueue_nd_range_kernel(kernel, 1, nullptr, global_work_size, local_work_size);
	boost::compute::copy_n(device_result.begin(), 1, &value, queue);