gemv.hpp
//===========================================================================
/*!
 *
 *
 * \brief kernels::gemv for the gpu: v <- v + alpha * A * x
 *
 * \author O. Krause
 * \date 2016
 *
 *
 * \par Copyright 1995-2015 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://image.diku.dk/shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================
#ifndef REMORA_KERNELS_GPU_GEMV_HPP
#define REMORA_KERNELS_GPU_GEMV_HPP

#include "../../expression_types.hpp"
#include "../../detail/traits.hpp"
#include <boost/compute/functional/operator.hpp> //for multiplies

namespace remora{ namespace kernels{

// v <- v + alpha * A * x
template <typename MatA, typename VecX, typename VecV>
void gemv(
    matrix_expression<MatA, gpu_tag> const& A_unreg,
    vector_expression<VecX, gpu_tag> const& x_unreg,
    vector_expression<VecV, gpu_tag>& v_unreg,
    typename VecV::value_type const& alpha
) {
    REMORA_SIZE_CHECK(A_unreg().size1() == v_unreg().size());
    REMORA_SIZE_CHECK(A_unreg().size2() == x_unreg().size());

    typedef typename VecV::value_type value_type;
    boost::compute::multiplies<value_type> prod;
    //assemble the kernel source; alpha and the matrix dimensions are passed as kernel arguments
    gpu::detail::meta_kernel k("blas_gemv");
    std::size_t alpha_index = k.add_arg<value_type>("alpha");
    std::size_t size1_index = k.add_arg<std::size_t>("size1");
    std::size_t size2_index = k.add_arg<std::size_t>("size2");
    //register the expressions so that their element access can be used inside the kernel
    auto A = k.register_args(to_functor(A_unreg));
    auto x = k.register_args(to_functor(x_unreg));
    auto v = k.register_args(to_functor(v_unreg));
    //read all tiles in the assigned rows and compute the inner product
    k << "__local " << k.decl<value_type>("results") << "[TILE_DIM][TILE_DIM+2];";
    k << "uint rowid = get_global_id(0);";
    k << "results[get_local_id(0)][get_local_id(1)] = 0.0;";
    k << "for(uint i = get_local_id(1); i < size2 && rowid < size1; i += TILE_DIM){";
    auto exprRow = k.expr<cl_uint>("rowid");
    auto exprCol = k.expr<cl_uint>("i");
    k << "    results[get_local_id(0)][get_local_id(1)] += " << prod(A(exprRow, exprCol), x(exprCol)) << ";";
    k << "}";
    k << "barrier(CLK_LOCAL_MEM_FENCE);";//wait until all threads have finished computing
    //sum up the partial results of each row and update v
    k << "if(get_local_id(1) == 0 && rowid < size1){";
    k << "    for(uint i = 1; i < TILE_DIM; ++i){";
    k << "        results[get_local_id(0)][0] += results[get_local_id(0)][i];";
    k << "    }";
    k << "    " << v(exprRow) << " += alpha * results[get_local_id(0)][0];";
    k << "}";
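    //For reference, the assembled kernel body looks roughly like the sketch below.
    //Here A(rowid,i), x(i) and v(rowid) stand in for the element-access expressions
    //generated by register_args, and float stands in for value_type; the exact emitted
    //source depends on the expression types, so this is only an illustration:
    //
    //  __local float results[TILE_DIM][TILE_DIM+2];
    //  uint rowid = get_global_id(0);
    //  results[get_local_id(0)][get_local_id(1)] = 0.0;
    //  for(uint i = get_local_id(1); i < size2 && rowid < size1; i += TILE_DIM){
    //      results[get_local_id(0)][get_local_id(1)] += A(rowid,i) * x(i);
    //  }
    //  barrier(CLK_LOCAL_MEM_FENCE);
    //  if(get_local_id(1) == 0 && rowid < size1){
    //      for(uint i = 1; i < TILE_DIM; ++i){
    //          results[get_local_id(0)][0] += results[get_local_id(0)][i];
    //      }
    //      v(rowid) += alpha * results[get_local_id(0)][0];
    //  }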
    //compile the kernel; the tile size is fixed at 16 via a preprocessor define
    std::size_t TILE_DIM = 16;
    char const* options = "-DTILE_DIM=16";
    boost::compute::kernel kernel = k.compile(v_unreg().queue().get_context(), options);
    //set the arguments and enqueue the kernel
    kernel.set_arg(alpha_index, alpha);
    kernel.set_arg(size1_index, A_unreg().size1());
    kernel.set_arg(size2_index, A_unreg().size2());

    //one TILE_DIM x TILE_DIM work group covers TILE_DIM rows; size1 is rounded up to a multiple of TILE_DIM
    std::size_t global_work_size[2] = {
        ((A_unreg().size1() + TILE_DIM - 1) / TILE_DIM) * TILE_DIM,
        TILE_DIM
    };
    std::size_t local_work_size[2] = {TILE_DIM, TILE_DIM};
    v_unreg().queue().enqueue_nd_range_kernel(kernel, 2, nullptr, global_work_size, local_work_size);
}
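
// A minimal usage sketch, assuming gpu-backed containers such as
// remora::matrix<float, row_major, gpu_tag> and remora::vector<float, gpu_tag>
// (those containers are defined elsewhere in remora; the names here are illustrative):
//
//   remora::matrix<float, remora::row_major, remora::gpu_tag> A(m, n);
//   remora::vector<float, remora::gpu_tag> x(n);
//   remora::vector<float, remora::gpu_tag> v(m);
//   //... fill A and x, initialise v with the vector to be updated ...
//   remora::kernels::gemv(A, x, v, 2.0f); //v <- v + 2 * A * x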

}}

#endif