trsv.hpp
Go to the documentation of this file.
1/*!
2 *
3 *
4 * \brief -
5 *
6 * \author O. Krause
7 * \date 2016
8 *
9 *
10 * \par Copyright 1995-2015 Shark Development Team
11 *
12 * <BR><HR>
13 * This file is part of Shark.
14 * <http://image.diku.dk/shark/>
15 *
16 * Shark is free software: you can redistribute it and/or modify
17 * it under the terms of the GNU Lesser General Public License as published
18 * by the Free Software Foundation, either version 3 of the License, or
19 * (at your option) any later version.
20 *
21 * Shark is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 * GNU Lesser General Public License for more details.
25 *
26 * You should have received a copy of the GNU Lesser General Public License
27 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28 *
29 */
30
31#ifndef REMORA_KERNELS_CLBLAS_TRSV_HPP
32#define REMORA_KERNELS_CLBLAS_TRSV_HPP
33
34
#include "../../detail/traits.hpp"
#include "../../proxy_expressions.hpp"
#include "../gemv.hpp"
#include <boost/compute/functional/operator.hpp> //for multiplies
#include <string> //for building kernel compile options
39
///Solves a system of equations with a triangular matrix and a single right-hand side
41
42namespace remora{namespace bindings {
//Bundles the compiled diagonal-block solver kernel with the indices of the
//kernel arguments that have to be set before every enqueue (see trsv_recursive).
struct trsv_kernel{
	boost::compute::kernel kernel;
	std::size_t start_index; //argument index: first row/column of the diagonal block of A
	std::size_t end_index;   //argument index: one past the last row/column of the block
	std::size_t unit_index;  //argument index: flag, nonzero if A is unit triangular
	std::size_t upper_index; //argument index: flag, nonzero if A is upper triangular
};
//Creates the OpenCL kernel that solves one TILE_SIZE x TILE_SIZE diagonal
//block of the triangular system in-place on the device. The same kernel
//handles the lower and upper triangular cases; which one is used, and
//whether the diagonal is treated as unit, is chosen at enqueue time via
//the "upper" and "unit" kernel arguments.
//Returns the compiled kernel together with the indices of its runtime
//arguments (block range and the two flags).
//options must define the TILE_SIZE macro used below.
template<class MatA, class VecB>
trsv_kernel createTRSVDiagBlockKernel(
	matrix_expression<MatA, gpu_tag> const& A_unreg,
	vector_expression<VecB, gpu_tag> &b_unreg,
	char const* options
){
	typedef typename MatA::value_type value_typeA;
	typedef typename VecB::value_type value_typeB;
	boost::compute::multiplies<value_typeB> prod;

	gpu::detail::meta_kernel k("blas_trsv");
	//runtime kernel arguments; their indices are handed back to the caller
	std::size_t start_index = k.add_arg<std::size_t>("start");//start of block of A
	std::size_t end_index = k.add_arg<std::size_t>("end");//end of Block of A
	std::size_t unit_index = k.add_arg<std::size_t>("unit");//whether A is unit triangular
	std::size_t upper_index = k.add_arg<std::size_t>("upper");//whether A is upper triangular
	auto A = k.register_args(to_functor(A_unreg));
	auto b = k.register_args(to_functor(b_unreg));
	// Local memory to fit a tile of A and the vector B
	k << "__local " <<k.decl<value_typeA>("Asub")<< "[TILE_SIZE][TILE_SIZE+2];\n";//+2 to avoid bank conflicts
	k << "__local " <<k.decl<value_typeB>("Bsub")<< "[TILE_SIZE];\n";
	k << "const ulong numWorkers = get_local_size(0);\n";
	//ensure we are not reading out of bounds
	k << "const ulong curTileA = end-start;\n";

	// Load tile of A into local memory.
	// Indices are clamped to end-1 so a partial last tile never reads out of
	// bounds; the duplicated entries are never written back (see final loop).
	k << "for(ulong i = get_local_id(0); i < TILE_SIZE; i += numWorkers){\n";
	k << "	for(ulong j = 0; j < TILE_SIZE; j++){\n";
	k << "		Asub[i][j] ="<< A(k.expr<cl_ulong>("min(end-1, start + i)"),k.expr<cl_ulong>("min(end-1, start + j)"))<<";\n";
	k << "	}\n";
	k << "}\n";


	// Load Tile of B into local memory, store columns of B as rows
	k << "for(ulong i = get_local_id(0); i < TILE_SIZE; i += numWorkers){\n";
	k << "	Bsub[i] ="<< b(k.expr<cl_ulong>("min(end-1,start + i)"))<<";\n";
	k << "}\n";
	// Synchronise to make sure everything is loaded
	k << "barrier(CLK_LOCAL_MEM_FENCE);\n";

	// Loop over the values of a single tile.
	// The substitution is inherently sequential, so only work-item 0 of the
	// group performs it; the other work-items only help with loads/stores.
	//lower-case: forward substitution, top row first
	k << "if(!upper){\n";
	k << "	for(ulong i = 0; i < TILE_SIZE && get_local_id(0) == 0; ++i){\n";
	k << "		if(!unit){Bsub[i] /= Asub[i][i];}\n";
	k << "		for(ulong j = i+1; j < TILE_SIZE; ++j){\n";
	k << "			Bsub[j] -= "<< prod(k.expr<value_typeB>("Bsub[i]"), k.expr<value_typeA>("Asub[j][i]"))<<";\n";
	k << "		}\n";
	k << "	}\n";
	k << "}else{\n";
	//upper case: backward substitution, bottom row of the valid range first
	k << "	for(ulong n = curTileA; n > 0 && get_local_id(0) == 0; --n){\n";
	k << "		ulong i = n-1;\n";
	k << "		if(!unit ){Bsub[i] /= Asub[i][i];}\n";
	k << "		for(ulong j = 0; j < i; j ++){\n";
	k << "			Bsub[j] -= "<< prod(k.expr<value_typeB>("Bsub[i]"), k.expr<value_typeA>("Asub[j][i]"))<<";\n";
	k << "		}\n";
	k << "	}\n";
	k << "}\n";
	// Synchronise before continuing
	k << "barrier(CLK_LOCAL_MEM_FENCE);\n";
	// Store the final results back in B; only the curTileA valid entries are written
	k << "for(ulong i = get_local_id(0); i < curTileA; i += numWorkers){\n";
	k << b(k.expr<cl_ulong>("(start+i)"))<<" = Bsub[i];\n";
	k << "}\n";

	boost::compute::kernel kernel = k.compile(b_unreg().queue().get_context(), options);
	return {kernel,start_index,end_index,unit_index,upper_index};
}
119
//Recursively solves the triangular system restricted to rows/columns
//[start,end) of Afull and the matching entries of bfull, in-place.
//Ranges of at most tileSize rows are solved directly by the precompiled
//diagonal-block kernel; larger ranges are split in two halves which are
//coupled through a gemv update of the not-yet-solved part of b.
//kernel must have been created by createTRSVDiagBlockKernel with a
//TILE_SIZE matching tileSize; numWorkers is the work-group size used
//when enqueueing it.
template <typename MatA, typename VecB, class Triangular>
void trsv_recursive(
	matrix_expression<MatA, gpu_tag> const& Afull,
	vector_expression<VecB, gpu_tag> & bfull,
	trsv_kernel& kernel,
	std::size_t start,
	std::size_t end,
	std::size_t tileSize,
	std::size_t numWorkers,
	Triangular t
){

	std::size_t size = end-start;
	//if the matrix is small enough call the computation kernel directly for the block
	if(size <= tileSize){
		//enqueue kernel with kernel args
		kernel.kernel.set_arg(kernel.start_index, start);
		kernel.kernel.set_arg(kernel.end_index, end);
		kernel.kernel.set_arg(kernel.unit_index, (std::size_t)Triangular::is_unit);
		kernel.kernel.set_arg(kernel.upper_index, (std::size_t)Triangular::is_upper);

		//a single work-group; the kernel's substitution step is sequential
		std::size_t global_work_size[2] = {numWorkers,1};
		std::size_t local_work_size[2] = {numWorkers, 1};
		bfull().queue().enqueue_nd_range_kernel(kernel.kernel, 2,nullptr, global_work_size, local_work_size);
		return;
	}
	//split at a multiple of tileSize so the leaves stay full tiles
	std::size_t numBlocks = (size+tileSize-1)/tileSize;
	std::size_t split = numBlocks/2 * tileSize;
	auto bfront = subrange(bfull,start,start+split);
	auto bback = subrange(bfull,start+split,end);

	//otherwise run the kernel recursively
	if(Triangular::is_upper){ //Upper triangular case
		//solve the bottom-right part first, subtract its contribution from
		//the front of b (bfront -= Aur * bback), then solve the top-left part
		auto Aur = subrange(Afull,start,start+split,start+split,end);
		trsv_recursive(Afull, bfull, kernel, start+split,end, tileSize, numWorkers, t);
		kernels::gemv(Aur, bback, bfront, -1.0);
		trsv_recursive(Afull, bfull, kernel, start,start+split, tileSize, numWorkers, t);
	}else{// Lower triangular case
		//solve the top-left part first, subtract its contribution from
		//the back of b (bback -= All * bfront), then solve the bottom-right part
		auto All = subrange(Afull,start+split,end,start,start+split);
		trsv_recursive(Afull, bfull, kernel, start,start+split, tileSize, numWorkers, t);
		kernels::gemv(All, bfront, bback, -1.0);
		trsv_recursive(Afull, bfull, kernel, start+split,end, tileSize, numWorkers, t);
	}
}
164
165template <typename MatA, typename VecB, class Triangular>
166void trsv_call(
167 matrix_expression<MatA, gpu_tag> const& A,
168 vector_expression<VecB, gpu_tag>& b,
169 Triangular,
170 left
171){
172 std::size_t const TileSize = 32;//size of the diagonal blocks where the single kernel runs
173 std::size_t const numWorkers = TileSize; //number of workers
174 char const* options ="-DTILE_SIZE=32ul";
175 auto kernel = bindings::createTRSVDiagBlockKernel(A,b,options);
176 trsv_recursive(A,b,kernel,0,A().size1(), TileSize, numWorkers, Triangular());
177}
178
179template <typename MatA, typename VecB, class Triangular>
180void trsv_call(
181 matrix_expression<MatA, gpu_tag> const& A,
182 vector_expression<VecB, gpu_tag>& b,
183 Triangular,
184 right
185){
186 trsv_call(trans(A),b,typename Triangular::transposed_orientation(),left());
187}
188}
189namespace kernels{
190//main kernel runs the kernel above recursively and calls gemv
191template <class Triangular,class Side, typename MatA, typename VecB>
192void trsv(
193 matrix_expression<MatA, gpu_tag> const& A,
194 vector_expression<VecB, gpu_tag>& b
195){
196 REMORA_SIZE_CHECK(A().size1() == A().size2());
197 REMORA_SIZE_CHECK(A().size2() == b().size());
198 bindings::trsv_call(A,b,Triangular(), Side());
199}
200}}
201#endif