//===========================================================================
/*!
 *
 * \brief Dense vector dot product on the GPU via an OpenCL meta-kernel.
 *
 * \author O. Krause
 * \date 2016
 *
 * \par Copyright 1995-2015 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://image.diku.dk/shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================
#ifndef REMORA_KERNELS_CLBLAS_DOT_HPP
#define REMORA_KERNELS_CLBLAS_DOT_HPP

#include "../../expression_types.hpp"
#include "../../detail/traits.hpp"

#include <boost/compute/algorithm/copy_n.hpp>
#include <boost/compute/container/array.hpp>

#include <string>

40namespace remora{namespace bindings {
41
42template <typename VectorX, typename VectorY, class result_type>
43void dot(
44 vector_expression<VectorX, gpu_tag> const& x_unreg,
45 vector_expression<VectorY, gpu_tag> const& y_unreg,
46 result_type& result,
47 dense_tag,
48 dense_tag
49){
50 auto& queue = x_unreg().queue();
51 gpu::detail::meta_kernel k("blas_vector_dot");
52 std::size_t size_index = k.add_arg<std::size_t>("size");
53 auto x = k.register_args(to_functor(x_unreg));
54 auto y = k.register_args(to_functor(y_unreg));
55
56 boost::compute::array<result_type,1> device_result;
57 auto exprSubFold = k.expr<result_type>("subfold[get_local_id(0)]");
58 k << "__local " <<k.decl<result_type>("subfold")<< "[TILE_DIM];\n";
59 k << exprSubFold<<" = 0;\n";
60 k << "for(uint i = get_local_id(0); i < size; i += TILE_DIM){\n ";
61 k << exprSubFold << "+=" << x(k.expr<cl_uint>("i"))<<'*'<<y(k.expr<cl_uint>("i"))<<";\n";
62 k << "}\n";
63 k << "barrier(CLK_LOCAL_MEM_FENCE);\n";//wait until all threads are done with computing
64 //sum up the rows
65 k << "if(get_local_id(0) == 0){\n";
66 k << " for(uint i = 1 ; i < min((uint)size,(uint)TILE_DIM); ++i){\n";
67 k << " subfold[0] +=subfold[i];\n";
68 k << " }\n ";
69 k << device_result.begin()[0]<< "= subfold[0];\n";
70 k << "}\n";
71
72 std::size_t TILE_DIM = 32;
73 boost::compute::kernel kernel = k.compile(queue.get_context(), "-DTILE_DIM=32");
74 kernel.set_arg(size_index, x_unreg().size());
75
76 std::size_t global_work_size[1] = {TILE_DIM};
77 std::size_t local_work_size[1] = {TILE_DIM};
78 queue.enqueue_nd_range_kernel(kernel, 1,nullptr, global_work_size, local_work_size);
79 boost::compute::copy_n(device_result.begin(), 1, &result, queue);
80}
81
82}}
83#endif