31#ifndef REMORA_KERNELS_CLBLAS_FOLD_ROWS_HPP
32#define REMORA_KERNELS_CLBLAS_FOLD_ROWS_HPP
34#include "../../expression_types.hpp"
35#include "../../detail/traits.hpp"
36#include <boost/compute/functional/operator.hpp>
38namespace remora{
namespace bindings{
40template<
class F,
class G,
class M,
class V,
class Orientation>
42 matrix_expression<M, gpu_tag>
const& A_unreg,
43 vector_expression<V, gpu_tag>& v_unreg,
48 typedef typename V::value_type value_type;
49 gpu::detail::meta_kernel k(
"remora_fold_rows");
50 std::size_t size1_index = k.add_arg<std::size_t>(
"size1");
51 std::size_t size2_index = k.add_arg<std::size_t>(
"size2");
52 auto A = k.register_args(to_functor(A_unreg));
53 auto v = k.register_args(to_functor(v_unreg));
54 auto f = k.register_args(f_unreg);
55 auto g = k.register_args(g_unreg);
57 k <<
"__local " <<k.decl<value_type>(
"folds")<<
"[TILE_DIM][TILE_DIM+1];\n";
58 k <<
"ulong rowid = get_global_id(0);\n";
59 k <<
"ulong colid = get_global_id(1);\n";
60 k <<
"if(rowid < size1 && colid < size2){\n";
62 auto colid = k.expr<cl_ulong>(
"colid");
63 auto rowid = k.expr<cl_ulong>(
"rowid");
64 auto entry = k.expr<cl_ulong>(
"folds[get_local_id(0)][get_local_id(1)]");
65 k <<
" "<<entry <<
" = "<< A(rowid,colid) <<
";\n";
66 k <<
" colid += TILE_DIM;\n";
67 k <<
" for(; colid < size2; colid += TILE_DIM){\n";
68 k <<
" "<< entry <<
" = " << f(entry, A(rowid,colid))<<
";\n";
71 k <<
"barrier(CLK_LOCAL_MEM_FENCE);\n";
73 k <<
"if(get_local_id(1) == 0 && rowid < size1){\n";
74 k <<
" for(uint i = 1 ; i < min(TILE_DIM, size2); ++i){\n";
75 k <<
" " << entry <<
" = "<< f(entry, k.expr<cl_ulong>(
"folds[get_local_id(0)][i]"))<<
";\n";
77 k << v(rowid) <<
"+= " <<g(k.expr<value_type>(
"folds[get_local_id(0)][0]"))<<
";\n";
81 std::size_t TILE_DIM = 8;
82 char const* options =
"-DTILE_DIM=8ul";
83 boost::compute::kernel kernel = k.compile(v_unreg().queue().get_context(), options);
85 kernel.set_arg(size1_index, A_unreg().size1());
86 kernel.set_arg(size2_index, A_unreg().size2());
88 std::size_t global_size[2] = {
89 ((A_unreg().size1()+TILE_DIM-1)/TILE_DIM) * TILE_DIM,
92 std::size_t local_size[2] = {TILE_DIM, TILE_DIM};
93 v_unreg().queue().enqueue_nd_range_kernel(kernel, 2,
nullptr, global_size, local_size);