28#ifndef REMORA_KERNELS_CLBLAS_MATRIX_FOLD_HPP
29#define REMORA_KERNELS_CLBLAS_MATRIX_FOLD_HPP
31#include "../../expression_types.hpp"
32#include "../../detail/traits.hpp"
33#include <boost/compute/container/array.hpp>
34#include <boost/compute/algorithm/copy_n.hpp>
35namespace remora{
namespace bindings{
37template<
class F,
class MatA,
class Orientation>
38void matrix_fold(matrix_expression<MatA, gpu_tag>
const& A_unreg,
typename F::result_type& value, Orientation, dense_tag) {
39 auto& queue = A_unreg().queue();
40 typedef typename F::result_type value_type;
41 gpu::detail::meta_kernel k(
"blas_matrix_fold");
42 std::size_t size1_index = k.add_arg<std::size_t>(
"size1");
43 std::size_t size2_index = k.add_arg<std::size_t>(
"size2");
44 auto A = k.register_args(to_functor(A_unreg));
45 auto f = k.register_args(F());
46 boost::compute::array<value_type,1> device_result;
47 boost::compute::copy_n(&value, 1, device_result.begin(), queue);
48 device_result.front() = value;
51 k <<
"__local " <<k.decl<value_type>(
"subfold")<<
"[TILE_DIM][TILE_DIM+1];";
52 k <<
"subfold[get_local_id(0)][get_local_id(1)] = "<<device_result.begin()[0]<<
';';
53 k <<
"for(uint i = get_local_id(0) ; i < size1; i += TILE_DIM){";
54 k <<
" for(uint j = get_local_id(1) ; j < size2; j += TILE_DIM){";
55 auto exprSubFold = k.expr<value_type>(
"subfold[get_local_id(0)][get_local_id(1)]");
56 k<< exprSubFold <<
'=' << f(exprSubFold,A(k.expr<cl_uint>(
"i"),k.expr<cl_uint>(
"j")))<<
";";
58 k <<
"barrier(CLK_LOCAL_MEM_FENCE);";
60 k <<
"if(get_local_id(0) == 0){";
61 k <<
" for(uint i = 1 ; i < TILE_DIM; ++i){";
62 k <<
" subfold[0][get_local_id(1)] ="
64 k.expr<value_type>(
"subfold[0][get_local_id(1)]"),
65 k.expr<value_type>(
"subfold[i][get_local_id(1)]")
68 k <<
" if(get_local_id(1) == 0){";
69 k <<
" for(uint i = 1 ; i < TILE_DIM; ++i){";
70 k <<
" subfold[0][0] =" << f(k.expr<value_type>(
"subfold[0][0]"),k.expr<value_type>(
"subfold[0][i]"))<<
';';
72 k <<device_result.begin()[0]<<
"= subfold[0][0];";
76 std::size_t TILE_DIM = 1;
77 char const* options =
"-DTILE_DIM=1";
78 boost::compute::kernel kernel = k.compile(queue.get_context(), options);
80 kernel.set_arg(size1_index, A_unreg().size1());
81 kernel.set_arg(size2_index, A_unreg().size2());
83 std::size_t global_work_size[2] = {TILE_DIM,TILE_DIM};
84 std::size_t local_work_size[2] = {TILE_DIM, TILE_DIM};
85 queue.enqueue_nd_range_kernel(kernel, 2,
nullptr, global_work_size, local_work_size);
86 boost::compute::copy_n(device_result.begin(), 1, &value, queue);