matrix_assign.hpp
/*!
 * \brief Kernels for matrix-expression assignments on the gpu
 *
 * \author O. Krause
 * \date 2016
 *
 *
 * \par Copyright 1995-2015 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://image.diku.dk/shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
#ifndef REMORA_KERNELS_CLBLAS_MATRIX_ASSIGN_HPP
#define REMORA_KERNELS_CLBLAS_MATRIX_ASSIGN_HPP

#include "../../expression_types.hpp"
#include "../../detail/traits.hpp"

#include <iostream>
namespace remora{namespace bindings{


//////////////////////////////////////////////////////
////Apply function elementwise to Matrix
/////////////////////////////////////////////////////

42// Explicitly iterating row major
43template<class F, class M, class Orientation>
44void matrix_apply(
45 matrix_expression<M, gpu_tag>& m_unreg,
46 F const& f_unreg,
47 Orientation
48){
49 gpu::detail::meta_kernel k("blas_matrix_apply_dense");
50
51 auto m = k.register_args(to_functor(m_unreg));
52 auto f = k.register_args(f_unreg);
53
54 //create source
55 k<<m(k.get_global_id(0),k.get_global_id(1))<<" = " << f(m(k.get_global_id(0),k.get_global_id(1)))<<";";
56 boost::compute::kernel kernel = k.compile(m_unreg().queue().get_context());
57 //enqueue kernel
58 std::size_t global_work_size[2] = {m_unreg().size1(), m_unreg().size2()};
59 m_unreg().queue().enqueue_nd_range_kernel(kernel, 2,nullptr, global_work_size, nullptr);
60}
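
// Usage sketch (illustrative only; `sqrt` as an elementwise functor of
// device_traits is an assumption, not a name confirmed by this file):
//
//   remora::matrix<float, remora::row_major, remora::gpu_tag> m(100, 50);
//   bindings::matrix_apply(m, device_traits<gpu_tag>::sqrt<float>(), remora::row_major());
//
// This compiles and enqueues a kernel with one work-item per element, each
// computing m(i,j) = sqrt(m(i,j)).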

//////////////////////////////////////////////////////
////Scalar Assignment to Matrix
/////////////////////////////////////////////////////

// Binds t as the second argument of F and applies the result elementwise
template<class F, class M, class Orientation>
void matrix_assign(
	matrix_expression<M, gpu_tag>& m,
	typename M::value_type t,
	Orientation o
){
	static_assert(std::is_base_of<dense_tag, typename M::storage_type::storage_tag>::value, "target must have dense storage for assignment");
	auto f = device_traits<gpu_tag>::make_bind_second(F(), t);
	matrix_apply(m,f,o);
}
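
// For example (sketch; `add` as a binary functor of device_traits is an
// assumption in the spirit of make_bind_second, not a name confirmed here):
//
//   matrix_assign<device_traits<gpu_tag>::add<float> >(m, 2.0f, row_major());
//
// binds t = 2 as the second argument, so the generated kernel performs
// m(i,j) = m(i,j) + 2 for every element.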


///////////////////////////////////////////////////////////////////////////////////////////
//////Matrix Assignment With Functor implementing +=,-=...
///////////////////////////////////////////////////////////////////////////////////////////

template<class F, class M, class E>
void matrix_assign_functor(
	matrix_expression<M, gpu_tag>& m_unreg,
	matrix_expression<E, gpu_tag> const& e_unreg,
	F f_unreg,
	row_major, row_major, dense_tag, dense_tag
){
	//create source
	gpu::detail::meta_kernel k("blas_matrix_assign");
	auto m = k.register_args(to_functor(m_unreg));
	auto e = k.register_args(to_functor(e_unreg));
	auto f = k.register_args(f_unreg);

	auto id0 = k.expr<cl_uint>("get_global_id(0)");
	auto id1 = k.expr<cl_uint>("get_global_id(1)");
	k << m(id0,id1) << " = " << f(m(id0,id1),e(id0,id1)) << ";\n";
	//enqueue kernel
	boost::compute::kernel kernel = k.compile(m_unreg().queue().get_context());
	std::size_t global_work_size[2] = {m_unreg().size1(), m_unreg().size2()};
	m_unreg().queue().enqueue_nd_range_kernel(kernel, 2, nullptr, global_work_size, nullptr);
}
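
// The functor determines the compound operation: with f(x,y) = x + y the
// kernel realizes m += e, while right_arg (f(x,y) = y, used by matrix_assign
// below) reduces it to plain assignment m = e. Since both operands are
// row-major, work-item (i,j) reads e(i,j) and writes m(i,j) directly, with no
// reordering needed.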

//dense-dense case row-major, column-major
template<class F,class M, class E>
void matrix_assign_functor(
	matrix_expression<M, gpu_tag>& m_unreg,
	matrix_expression<E, gpu_tag> const& e_unreg,
	F f_unreg,
	row_major, column_major, dense_tag, dense_tag
) {
	typedef typename M::value_type value_type;
	std::size_t TILE_DIM = 32;
	char const* options = "-DTILE_DIM=32ul";
	//There are usually not enough parallel worker threads in a local group
	//to fill a tile, so each thread reads several elements.
	//BLOCK_COLS is the number of parallel threads reading a column
	//and must be a divisor of TILE_DIM.
	std::size_t BLOCK_COLS = 8;

	//create source
	gpu::detail::meta_kernel k("blas_matrix_assign_row_col");
	auto m = k.register_args(to_functor(m_unreg));
	auto e = k.register_args(to_functor(e_unreg));
	auto f = k.register_args(f_unreg);
	//create local memory. we first copy a tile into local
	//memory, which gets the orientation right, and then copy the
	//tile to the target.
	//the padding (TILE_DIM+2) is there to avoid bank conflicts in local memory
	std::size_t size1_index = k.add_arg<std::size_t>("size1");
	std::size_t size2_index = k.add_arg<std::size_t>("size2");
	k << "__local " << k.decl<value_type>("tile") << "[TILE_DIM][TILE_DIM+2];\n";
	k << "uint base_row = get_group_id(0) * TILE_DIM;\n";
	k << "uint base_col = get_group_id(1) * TILE_DIM;\n";
	//copy the tile into local memory; note the change of direction.
	//also note that if we are on the boundary, the tile
	//might be larger than the number of values to read
	k << "uint maxDim1 = min(size1-base_row,TILE_DIM);\n";
	k << "uint maxDim2 = min(size2-base_col,TILE_DIM);\n";
	k << "for(uint i = get_local_id(1); i < maxDim2 && get_local_id(0) < maxDim1; i += get_local_size(1)){\n";
	auto row_exp = k.expr<cl_uint>("(base_row+get_local_id(0))");
	auto col_exp = k.expr<cl_uint>("(base_col+i)");
	k << "    tile[get_local_id(0)][i] = " << e(row_exp, col_exp) << ";\n";
	k << "}\n";
	k << "barrier(CLK_LOCAL_MEM_FENCE);\n";//wait until all threads are done copying
	//write output from local memory, again making sure that
	//we do not write outside the feasible area
	k << "for(uint i = get_local_id(1); i < maxDim1 && get_local_id(0) < maxDim2; i += get_local_size(1)){\n";
	auto target = m(k.expr<cl_uint>("(base_row + i)"), k.expr<cl_uint>("(base_col + get_local_id(0))"));
	k << target << " = " << f(target, k.expr<cl_uint>("tile[i][get_local_id(0)]")) << ";\n";
	k << "}\n";

	//compile kernel
	boost::compute::kernel kernel = k.compile(m_unreg().queue().get_context(), options);

	//enqueue kernel
	kernel.set_arg(size1_index, m_unreg().size1());
	kernel.set_arg(size2_index, m_unreg().size2());
	std::size_t global_work_size[2] = {(m_unreg().size1()+TILE_DIM-1) / TILE_DIM * TILE_DIM, (m_unreg().size2()+TILE_DIM-1) / TILE_DIM * BLOCK_COLS};
	std::size_t local_work_size[2] = {TILE_DIM, BLOCK_COLS};
	m_unreg().queue().enqueue_nd_range_kernel(kernel, 2, nullptr, global_work_size, local_work_size);
}
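
// Launch geometry, worked through for size1 = 100, size2 = 70 with
// TILE_DIM = 32 and BLOCK_COLS = 8:
//
//   global = {(100+31)/32 * 32, (70+31)/32 * 8} = {128, 24}
//   local  = {32, 8}
//
// i.e. a 4 x 3 grid of groups, each transposing one 32 x 32 tile through
// local memory. Every thread covers up to TILE_DIM/BLOCK_COLS = 4 elements
// of its tile, and the maxDim1/maxDim2 guards mask off the partial tiles at
// the right and bottom boundaries.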

/////////////////////////////////////////////////////////////////
//////Matrix Assignment implementing op=
////////////////////////////////////////////////////////////////

//dense-dense case row-major, row-major
template<class M, class E>
void matrix_assign(
	matrix_expression<M, gpu_tag> &m,
	matrix_expression<E, gpu_tag> const& e,
	row_major o, row_major, dense_tag t, dense_tag
) {
	matrix_assign_functor(m, e, device_traits<gpu_tag>::right_arg<typename E::value_type>(), o, o, t, t);
}

//dense-dense case row-major, column-major
template<class M, class E>
void matrix_assign(
	matrix_expression<M, gpu_tag> &m,
	matrix_expression<E, gpu_tag> const& e,
	row_major o1, column_major o2, dense_tag t, dense_tag
) {
	matrix_assign_functor(m, e, device_traits<gpu_tag>::right_arg<typename E::value_type>(), o1, o2, t, t);
}
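
// Both overloads lower plain assignment to matrix_assign_functor with
// right_arg (f(x,y) = y): matching orientations take the direct elementwise
// kernel, mismatched ones the tiled transposing kernel above.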


}}

#endif