32#ifndef REMORA_KERNELS_CLBLAS_DOT_HPP
33#define REMORA_KERNELS_CLBLAS_DOT_HPP
35#include "../../expression_types.hpp"
36#include "../../detail/traits.hpp"
37#include <boost/compute/container/array.hpp>
38#include <boost/compute/algorithm/copy_n.hpp>
40namespace remora{
namespace bindings {
42template <
typename VectorX,
typename VectorY,
class result_type>
44 vector_expression<VectorX, gpu_tag>
const& x_unreg,
45 vector_expression<VectorY, gpu_tag>
const& y_unreg,
50 auto& queue = x_unreg().queue();
51 gpu::detail::meta_kernel k(
"blas_vector_dot");
52 std::size_t size_index = k.add_arg<std::size_t>(
"size");
53 auto x = k.register_args(to_functor(x_unreg));
54 auto y = k.register_args(to_functor(y_unreg));
56 boost::compute::array<result_type,1> device_result;
57 auto exprSubFold = k.expr<result_type>(
"subfold[get_local_id(0)]");
58 k <<
"__local " <<k.decl<result_type>(
"subfold")<<
"[TILE_DIM];\n";
59 k << exprSubFold<<
" = 0;\n";
60 k <<
"for(uint i = get_local_id(0); i < size; i += TILE_DIM){\n ";
61 k << exprSubFold <<
"+=" << x(k.expr<cl_uint>(
"i"))<<
'*'<<y(k.expr<cl_uint>(
"i"))<<
";\n";
63 k <<
"barrier(CLK_LOCAL_MEM_FENCE);\n";
65 k <<
"if(get_local_id(0) == 0){\n";
66 k <<
" for(uint i = 1 ; i < min((uint)size,(uint)TILE_DIM); ++i){\n";
67 k <<
" subfold[0] +=subfold[i];\n";
69 k << device_result.begin()[0]<<
"= subfold[0];\n";
72 std::size_t TILE_DIM = 32;
73 boost::compute::kernel kernel = k.compile(queue.get_context(),
"-DTILE_DIM=32");
74 kernel.set_arg(size_index, x_unreg().size());
76 std::size_t global_work_size[1] = {TILE_DIM};
77 std::size_t local_work_size[1] = {TILE_DIM};
78 queue.enqueue_nd_range_kernel(kernel, 1,
nullptr, global_work_size, local_work_size);
79 boost::compute::copy_n(device_result.begin(), 1, &result, queue);