vector_max.hpp
Go to the documentation of this file.
1/*!
2 *
3 *
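4 * \brief OpenCL kernel binding that computes the index of the maximum element of a dense GPU vector expression.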
5 *
6 * \author O. Krause
7 * \date 2016
8 *
9 *
10 * \par Copyright 1995-2015 Shark Development Team
11 *
12 * <BR><HR>
13 * This file is part of Shark.
14 * <http://image.diku.dk/shark/>
15 *
16 * Shark is free software: you can redistribute it and/or modify
17 * it under the terms of the GNU Lesser General Public License as published
18 * by the Free Software Foundation, either version 3 of the License, or
19 * (at your option) any later version.
20 *
21 * Shark is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 * GNU Lesser General Public License for more details.
25 *
26 * You should have received a copy of the GNU Lesser General Public License
27 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28 *
29 */
30#ifndef REMORA_KERNELS_CLBLAS_VECTOR_MAX_HPP
31#define REMORA_KERNELS_CLBLAS_VECTOR_MAX_HPP
32
33#include "../../detail/traits.hpp"
34#include "../../expression_types.hpp"
35namespace remora {namespace bindings{
36
37template<class E>
38std::size_t vector_max(vector_expression<E, gpu_tag> const& v_unreg, dense_tag) {
39 if(v_unreg().size() == 0) return 0;
40 auto& queue = v_unreg().queue();
41 typedef typename E::value_type value_type;
42 gpu::detail::meta_kernel k("blas_vector_fold");
43 std::size_t size_index = k.add_arg<std::size_t>("size");
44 auto v = k.register_args(to_functor(v_unreg));
45
46 boost::compute::array<std::size_t,1> device_result;
47 auto exprMax = k.expr<value_type>("maximum[get_local_id(0)]");
48 k << "__local " <<k.decl<value_type>("maximum")<< "[TILE_DIM];\n";
49 k << "__local uint maximum_index[TILE_DIM];\n";
50 k << exprMax<<" = "<<v(k.expr<cl_uint>("min(size-1,get_local_id(0))"))<<";\n";
51 k << "maximum_index[get_local_id(0)] = get_local_id(0);\n";
52 k << "for(uint i = TILE_DIM + get_local_id(0); i < size; i += TILE_DIM){\n";
53 k << " if( " << exprMax << '<' << v(k.expr<cl_uint>("i"))<<"){\n ";
54 k << exprMax << '=' << v(k.expr<cl_uint>("i"))<<";\n";
55 k << " maximum_index[get_local_id(0)] = i;\n";
56 k << " }\n";
57 k << "}\n";
58 k << "barrier(CLK_LOCAL_MEM_FENCE);\n";//wait until all threads are done with computing
59 //sum up the rows
60 k << "if(get_local_id(0) == 0){\n";
61 k << " for(uint i = 1 ; i < min((uint)size,(uint)TILE_DIM); ++i){\n";
62 k << " if( " << exprMax<< '<' << v(k.expr<cl_uint>("i"))<<"){\n";
63 k << " maximum_index[0] = maximum_index[i];\n";
64 k << " maximum[0] = maximum[i];\n";
65 k << " }\n";
66 k << " }\n";
67 k << device_result.begin()[0]<< "= maximum_index[0];\n";
68 k << "}\n";
69
70 std::size_t TILE_DIM = 32;
71 boost::compute::kernel kernel = k.compile(queue.get_context(), "-DTILE_DIM=32");
72 kernel.set_arg(size_index, v_unreg().size());
73
74 std::size_t global_work_size[1] = {TILE_DIM};
75 std::size_t local_work_size[1] = {TILE_DIM};
76 queue.enqueue_nd_range_kernel(kernel, 1,nullptr, global_work_size, local_work_size);
77 std::size_t result;
78 boost::compute::copy_n(device_result.begin(), 1, &result, queue);
79 return result;
80}
81
82
83}}
84#endif