fold_rows.hpp
Go to the documentation of this file.
1/*!
2 *
3 *
4 * \brief Folds the rows of a row-major or column-major matrix.
5 *
6 * \author O. Krause
7 * \date 2018
8 *
9 *
10 * \par Copyright 1995-2015 Shark Development Team
11 *
12 * <BR><HR>
13 * This file is part of Shark.
14 * <http://image.diku.dk/shark/>
15 *
16 * Shark is free software: you can redistribute it and/or modify
17 * it under the terms of the GNU Lesser General Public License as published
18 * by the Free Software Foundation, either version 3 of the License, or
19 * (at your option) any later version.
20 *
21 * Shark is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 * GNU Lesser General Public License for more details.
25 *
26 * You should have received a copy of the GNU Lesser General Public License
27 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28 *
29 */
30
31#ifndef REMORA_KERNELS_CLBLAS_FOLD_ROWS_HPP
32#define REMORA_KERNELS_CLBLAS_FOLD_ROWS_HPP
33
34#include "../../expression_types.hpp"
35#include "../../detail/traits.hpp"
36#include <boost/compute/functional/operator.hpp> //for multiplies
37
38namespace remora{namespace bindings{
39
40template<class F, class G, class M, class V, class Orientation>
41void fold_rows(
42 matrix_expression<M, gpu_tag> const& A_unreg,
43 vector_expression<V, gpu_tag>& v_unreg,
44 F f_unreg,
45 G g_unreg,
46 Orientation
47){
48 typedef typename V::value_type value_type;
49 gpu::detail::meta_kernel k("remora_fold_rows");
50 std::size_t size1_index = k.add_arg<std::size_t>("size1");
51 std::size_t size2_index = k.add_arg<std::size_t>("size2");
52 auto A = k.register_args(to_functor(A_unreg));
53 auto v = k.register_args(to_functor(v_unreg));
54 auto f = k.register_args(f_unreg);
55 auto g = k.register_args(g_unreg);
56 //read all tiles in the assigned rows and sum them up
57 k << "__local " <<k.decl<value_type>("folds")<< "[TILE_DIM][TILE_DIM+1];\n";
58 k << "ulong rowid = get_global_id(0);\n";
59 k << "ulong colid = get_global_id(1);\n";
60 k << "if(rowid < size1 && colid < size2){\n"; //can not compute rows/columns that are infeasible
61 //note: we can not simply step out here as we must ensure that all threads get to the barrier(...)
62 auto colid = k.expr<cl_ulong>("colid");
63 auto rowid = k.expr<cl_ulong>("rowid");
64 auto entry = k.expr<cl_ulong>("folds[get_local_id(0)][get_local_id(1)]");
65 k << " "<<entry <<" = "<< A(rowid,colid) <<";\n";
66 k << " colid += TILE_DIM;\n";
67 k << " for(; colid < size2; colid += TILE_DIM){\n";
68 k << " "<< entry << " = " << f(entry, A(rowid,colid))<<";\n";
69 k << " }\n";
70 k << "}\n";
71 k << "barrier(CLK_LOCAL_MEM_FENCE);\n";//wait until all threads are done with folding the columns
72 //final fold, just the threads in the first row compute this
73 k << "if(get_local_id(1) == 0 && rowid < size1){\n";
74 k << " for(uint i = 1 ; i < min(TILE_DIM, size2); ++i){\n";
75 k << " " << entry <<" = "<< f(entry, k.expr<cl_ulong>("folds[get_local_id(0)][i]"))<<";\n";
76 k << " }\n";
77 k << v(rowid) << "+= " <<g(k.expr<value_type>("folds[get_local_id(0)][0]"))<<";\n";
78 k<< "}\n";
79 //create source
80
81 std::size_t TILE_DIM = 8;
82 char const* options ="-DTILE_DIM=8ul";
83 boost::compute::kernel kernel = k.compile(v_unreg().queue().get_context(), options);
84 //enqueue kernel
85 kernel.set_arg(size1_index, A_unreg().size1());
86 kernel.set_arg(size2_index, A_unreg().size2());
87
88 std::size_t global_size[2] = {
89 ((A_unreg().size1()+TILE_DIM-1)/TILE_DIM) * TILE_DIM,
90 TILE_DIM
91 };
92 std::size_t local_size[2] = {TILE_DIM, TILE_DIM};
93 v_unreg().queue().enqueue_nd_range_kernel(kernel, 2,nullptr, global_size, local_size);
94}
95
96
97}}
98
99#endif