28#ifndef REMORA_KERNELS_CLBLAS_MATRIX_ASSIGN_HPP
29#define REMORA_KERNELS_CLBLAS_MATRIX_ASSIGN_HPP
31#include "../../expression_types.hpp"
32#include "../../detail/traits.hpp"
35namespace remora{
namespace bindings{
43template<
class F,
class M,
class Orientation>
45 matrix_expression<M, gpu_tag>& m_unreg,
49 gpu::detail::meta_kernel k(
"blas_matrix_apply_dense");
51 auto m = k.register_args(to_functor(m_unreg));
52 auto f = k.register_args(f_unreg);
55 k<<m(k.get_global_id(0),k.get_global_id(1))<<
" = " << f(m(k.get_global_id(0),k.get_global_id(1)))<<
";";
56 boost::compute::kernel kernel = k.compile(m_unreg().queue().get_context());
58 std::size_t global_work_size[2] = {m_unreg().size1(), m_unreg().size2()};
59 m_unreg().queue().enqueue_nd_range_kernel(kernel, 2,
nullptr, global_work_size,
nullptr);
67template<
class F,
class M,
class Orientation>
69 matrix_expression<M, gpu_tag>& m,
70 typename M::value_type t,
73 static_assert(std::is_base_of<dense_tag, typename M::storage_type::storage_tag>::value,
"target must have dense storage for assignment");
74 auto f = device_traits<gpu_tag>::make_bind_second(F(), t);
83template<
class F,
class M,
class E>
84void matrix_assign_functor(
85 matrix_expression<M, gpu_tag>& m_unreg,
86 matrix_expression<E, gpu_tag>
const& e_unreg,
88 row_major, row_major ,dense_tag, dense_tag
91 gpu::detail::meta_kernel k(
"blas_matrix_assign");
92 auto m = k.register_args(to_functor(m_unreg));
93 auto e = k.register_args(to_functor(e_unreg));
94 auto f = k.register_args(f_unreg);
96 auto id0 = k.expr<cl_uint>(
"get_global_id(0)");
97 auto id1 = k.expr<cl_uint>(
"get_global_id(1)");
98 k<< m(id0,id1) <<
"=" << f(m(id0,id1),e(id0,id1))<<
";\n";
100 boost::compute::kernel kernel = k.compile(m_unreg().queue().get_context());
101 std::size_t global_work_size[2] = {m_unreg().size1(), m_unreg().size2()};
102 m_unreg().queue().enqueue_nd_range_kernel(kernel, 2,
nullptr, global_work_size,
nullptr);
106template<
class F,
class M,
class E>
107void matrix_assign_functor(
108 matrix_expression<M, gpu_tag>& m_unreg,
109 matrix_expression<E, gpu_tag>
const& e_unreg,
111 row_major, column_major ,dense_tag, dense_tag
113 typedef typename M::value_type value_type;
114 std::size_t TILE_DIM = 32;
115 char const* options =
"-DTILE_DIM=32ul";
120 std::size_t BLOCK_COLS = 8;
123 gpu::detail::meta_kernel k(
"blas_matrix_assign_row_col");
124 auto m = k.register_args(to_functor(m_unreg));
125 auto e = k.register_args(to_functor(e_unreg));
126 auto f = k.register_args(f_unreg);
131 std::size_t size1_index = k.add_arg<std::size_t>(
"size1");
132 std::size_t size2_index = k.add_arg<std::size_t>(
"size2");
133 k <<
"__local " <<k.decl<value_type>(
"tile")<<
"[TILE_DIM][TILE_DIM+2];\n";
134 k <<
"uint base_row = get_group_id(0) * TILE_DIM;\n";
135 k <<
"uint base_col = get_group_id(1) * TILE_DIM;\n";
139 k <<
"uint maxDim1 = min(size1-base_row,TILE_DIM);\n";
140 k <<
"uint maxDim2 = min(size2-base_col,TILE_DIM);\n";
141 k <<
"for(uint i = get_local_id(1) ; i < maxDim2 && get_local_id(0) < maxDim1; i += get_local_size(1)){\n";
142 auto row_exp = k.expr<cl_uint>(
"(base_row+get_local_id(0))");
143 auto col_exp = k.expr<cl_uint>(
"(base_col+i)");
144 k <<
" tile[get_local_id(0)][i] =" << e(row_exp, col_exp)<<
";\n";
146 k <<
"barrier(CLK_LOCAL_MEM_FENCE);\n";
149 k <<
"for(uint i = get_local_id(1); i < maxDim1 && get_local_id(0) < maxDim2; i += get_local_size(1)){\n";
150 auto target = m(k.expr<cl_uint>(
"(base_row + i)"), k.expr<cl_uint>(
"(base_col + get_local_id(0))"));
151 k << target <<
" = " <<f(target, k.expr<cl_uint>(
"tile[i][get_local_id(0)]"))<<
";\n";
156 boost::compute::kernel kernel = k.compile(m_unreg().queue().get_context(), options);
159 kernel.set_arg(size1_index, m_unreg().size1());
160 kernel.set_arg(size2_index, m_unreg().size2());
161 std::size_t global_work_size[2] = {(m_unreg().size1()+TILE_DIM-1) / TILE_DIM * TILE_DIM, (m_unreg().size2()+TILE_DIM-1) / TILE_DIM * BLOCK_COLS };
162 std::size_t local_work_size[2] = {TILE_DIM, BLOCK_COLS};
163 m_unreg().queue().enqueue_nd_range_kernel(kernel, 2,
nullptr, global_work_size, local_work_size);
170template<
class M,
class E>
172 matrix_expression<M, gpu_tag> &m,
173 matrix_expression<E, gpu_tag>
const& e,
174 row_major o, row_major,dense_tag t, dense_tag
176 matrix_assign_functor(m, e, device_traits<gpu_tag>::right_arg<typename E::value_type>(), o, o, t, t);
180template<
class M,
class E>
182 matrix_expression<M, gpu_tag> &m,
183 matrix_expression<E, gpu_tag>
const& e,
184 row_major o1, column_major o2,dense_tag t, dense_tag
186 matrix_assign_functor(m, e, device_traits<gpu_tag>::right_arg<typename E::value_type>(), o1, o2, t, t);