32#ifndef REMORA_KERNELS_GPU_GEMV_HPP
33#define REMORA_KERNELS_GPU_GEMV_HPP
35#include "../../expression_types.hpp"
36#include "../../detail/traits.hpp"
37#include <boost/compute/functional/operator.hpp>
39namespace remora{
namespace kernels{
42template <
typename MatA,
typename VecX,
typename VecV>
44 matrix_expression<MatA, gpu_tag>
const& A_unreg,
45 vector_expression<VecX, gpu_tag>
const& x_unreg,
46 vector_expression<VecV, gpu_tag>& v_unreg,
47 typename VecV::value_type
const& alpha
49 REMORA_SIZE_CHECK(A_unreg().size1() == v_unreg().size());
50 REMORA_SIZE_CHECK(A_unreg().size2() == x_unreg().size());
53 typedef typename VecV::value_type value_type;
54 boost::compute::multiplies<value_type> prod;
55 gpu::detail::meta_kernel k(
"blas_gemv");
56 std::size_t alpha_index = k.add_arg<value_type>(
"alpha");
57 std::size_t size1_index = k.add_arg<std::size_t>(
"size1");
58 std::size_t size2_index = k.add_arg<std::size_t>(
"size2");
59 auto A = k.register_args(to_functor(A_unreg));
60 auto x = k.register_args(to_functor(x_unreg));
61 auto v = k.register_args(to_functor(v_unreg));
63 k <<
"__local " <<k.decl<value_type>(
"results")<<
"[TILE_DIM][TILE_DIM+2];";
64 k <<
"uint rowid = get_global_id(0);";
65 k <<
"results[get_local_id(0)][get_local_id(1)] = 0.0;";
66 k <<
"for(uint i = get_local_id(1) ; i < size2 && rowid < size1; i += TILE_DIM){";
67 auto exprRow = k.expr<cl_uint>(
"rowid");
68 auto exprCol = k.expr<cl_uint>(
"i");
69 k<<
" results[get_local_id(0)][get_local_id(1)] += "<< prod(A(exprRow,exprCol),x(exprCol))<<
";";
71 k <<
"barrier(CLK_LOCAL_MEM_FENCE);";
73 k <<
"if(get_local_id(1) == 0 && rowid < size1){";
74 k <<
" for(uint i = 1 ; i < TILE_DIM; ++i){";
75 k <<
" results[get_local_id(0)][0] +=results[get_local_id(0)][i];";
77 k << v(exprRow) <<
"+= alpha * results[get_local_id(0)][0];";
81 std::size_t TILE_DIM = 16;
82 char const* options =
"-DTILE_DIM=16";
83 boost::compute::kernel kernel = k.compile(v_unreg().queue().get_context(), options);
85 kernel.set_arg(alpha_index, alpha);
86 kernel.set_arg(size1_index, A_unreg().size1());
87 kernel.set_arg(size2_index, A_unreg().size2());
89 std::size_t global_work_size[2] = {
90 ((A_unreg().size1()+TILE_DIM-1)/TILE_DIM) * TILE_DIM,
93 std::size_t local_work_size[2] = {TILE_DIM,TILE_DIM};
94 v_unreg().queue().enqueue_nd_range_kernel(kernel, 2,
nullptr, global_work_size, local_work_size);