30#ifndef REMORA_KERNELS_CLBLAS_VECTOR_MAX_HPP
31#define REMORA_KERNELS_CLBLAS_VECTOR_MAX_HPP
33#include "../../detail/traits.hpp"
34#include "../../expression_types.hpp"
35namespace remora {
namespace bindings{
38std::size_t vector_max(vector_expression<E, gpu_tag>
const& v_unreg, dense_tag) {
39 if(v_unreg().size() == 0)
return 0;
40 auto& queue = v_unreg().queue();
41 typedef typename E::value_type value_type;
42 gpu::detail::meta_kernel k(
"blas_vector_fold");
43 std::size_t size_index = k.add_arg<std::size_t>(
"size");
44 auto v = k.register_args(to_functor(v_unreg));
46 boost::compute::array<std::size_t,1> device_result;
47 auto exprMax = k.expr<value_type>(
"maximum[get_local_id(0)]");
48 k <<
"__local " <<k.decl<value_type>(
"maximum")<<
"[TILE_DIM];\n";
49 k <<
"__local uint maximum_index[TILE_DIM];\n";
50 k << exprMax<<
" = "<<v(k.expr<cl_uint>(
"min(size-1,get_local_id(0))"))<<
";\n";
51 k <<
"maximum_index[get_local_id(0)] = get_local_id(0);\n";
52 k <<
"for(uint i = TILE_DIM + get_local_id(0); i < size; i += TILE_DIM){\n";
53 k <<
" if( " << exprMax <<
'<' << v(k.expr<cl_uint>(
"i"))<<
"){\n ";
54 k << exprMax <<
'=' << v(k.expr<cl_uint>(
"i"))<<
";\n";
55 k <<
" maximum_index[get_local_id(0)] = i;\n";
58 k <<
"barrier(CLK_LOCAL_MEM_FENCE);\n";
60 k <<
"if(get_local_id(0) == 0){\n";
61 k <<
" for(uint i = 1 ; i < min((uint)size,(uint)TILE_DIM); ++i){\n";
62 k <<
" if( " << exprMax<<
'<' << v(k.expr<cl_uint>(
"i"))<<
"){\n";
63 k <<
" maximum_index[0] = maximum_index[i];\n";
64 k <<
" maximum[0] = maximum[i];\n";
67 k << device_result.begin()[0]<<
"= maximum_index[0];\n";
70 std::size_t TILE_DIM = 32;
71 boost::compute::kernel kernel = k.compile(queue.get_context(),
"-DTILE_DIM=32");
72 kernel.set_arg(size_index, v_unreg().size());
74 std::size_t global_work_size[1] = {TILE_DIM};
75 std::size_t local_work_size[1] = {TILE_DIM};
76 queue.enqueue_nd_range_kernel(kernel, 1,
nullptr, global_work_size, local_work_size);
78 boost::compute::copy_n(device_result.begin(), 1, &result, queue);