31#ifndef REMORA_KERNELS_CLBLAS_FOLD_ROWS_HPP 
   32#define REMORA_KERNELS_CLBLAS_FOLD_ROWS_HPP 
   34#include "../../expression_types.hpp" 
   35#include "../../detail/traits.hpp" 
   36#include <boost/compute/functional/operator.hpp>  
   38namespace remora{
namespace bindings{
 
   40template<
class F, 
class G, 
class M, 
class V, 
class Orientation>
 
   42    matrix_expression<M, gpu_tag> 
const& A_unreg, 
 
   43    vector_expression<V, gpu_tag>& v_unreg,
 
   48    typedef typename V::value_type value_type;
 
   49    gpu::detail::meta_kernel k(
"remora_fold_rows");
 
   50    std::size_t size1_index = k.add_arg<std::size_t>(
"size1");
 
   51    std::size_t size2_index = k.add_arg<std::size_t>(
"size2");
 
   52    auto A = k.register_args(to_functor(A_unreg));
 
   53    auto v = k.register_args(to_functor(v_unreg));
 
   54    auto f = k.register_args(f_unreg);
 
   55    auto g = k.register_args(g_unreg);
 
   57    k << 
"__local " <<k.decl<value_type>(
"folds")<< 
"[TILE_DIM][TILE_DIM+1];\n";
 
   58    k << 
"ulong rowid = get_global_id(0);\n";
 
   59    k << 
"ulong colid = get_global_id(1);\n";
 
   60    k << 
"if(rowid < size1 && colid < size2){\n"; 
 
   62    auto colid = k.expr<cl_ulong>(
"colid");
 
   63    auto rowid = k.expr<cl_ulong>(
"rowid");
 
   64    auto entry = k.expr<cl_ulong>(
"folds[get_local_id(0)][get_local_id(1)]");
 
   65    k << 
"  "<<entry <<
" = "<< A(rowid,colid) <<
";\n";
 
   66    k << 
"  colid += TILE_DIM;\n";
 
   67    k << 
"  for(; colid < size2; colid += TILE_DIM){\n";
 
   68    k << 
"      "<< entry << 
" = " << f(entry, A(rowid,colid))<<
";\n";
 
   71    k << 
"barrier(CLK_LOCAL_MEM_FENCE);\n";
 
   73    k << 
"if(get_local_id(1) == 0 && rowid < size1){\n";
 
   74    k << 
"    for(uint i = 1 ; i < min(TILE_DIM, size2); ++i){\n";
 
   75    k << 
"        " << entry <<
" = "<< f(entry, k.expr<cl_ulong>(
"folds[get_local_id(0)][i]"))<<
";\n";
 
   77    k << v(rowid) << 
"+= " <<g(k.expr<value_type>(
"folds[get_local_id(0)][0]"))<<
";\n";
 
   81    std::size_t TILE_DIM = 8;
 
   82    char const* options =
"-DTILE_DIM=8ul";
 
   83    boost::compute::kernel kernel = k.compile(v_unreg().queue().get_context(), options);
 
   85    kernel.set_arg(size1_index, A_unreg().size1());
 
   86    kernel.set_arg(size2_index, A_unreg().size2());
 
   88    std::size_t global_size[2] = {
 
   89        ((A_unreg().size1()+TILE_DIM-1)/TILE_DIM) * TILE_DIM,
 
   92    std::size_t local_size[2] = {TILE_DIM, TILE_DIM};
 
   93    v_unreg().queue().enqueue_nd_range_kernel(kernel, 2,
nullptr, global_size, local_size);