potrf.hpp
Go to the documentation of this file.
1/*!
2 *
3 *
 4 * \brief Implements the GPU (OpenCL) implementation of the POTRF algorithm
5 *
6 * \author O. Krause
7 * \date 2016
8 *
9 *
10 * \par Copyright 1995-2014 Shark Development Team
11 *
12 * <BR><HR>
13 * This file is part of Shark.
14 * <http://image.diku.dk/shark/>
15 *
16 * Shark is free software: you can redistribute it and/or modify
17 * it under the terms of the GNU Lesser General Public License as published
18 * by the Free Software Foundation, either version 3 of the License, or
19 * (at your option) any later version.
20 *
21 * Shark is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 * GNU Lesser General Public License for more details.
25 *
26 * You should have received a copy of the GNU Lesser General Public License
27 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28 *
29 */
30#ifndef REMORA_KERNELS_GPU_POTRF_HPP
31#define REMORA_KERNELS_GPU_POTRF_HPP
32
33#include "../../proxy_expressions.hpp"
34#include "../trsm.hpp" //trsm kernel
35#include "../syrk.hpp" //syrk kernel
36
37namespace remora{namespace bindings {
38
39
//Bundles a compiled POTRF diagonal-block kernel with the argument slots
//of its two runtime parameters (the [start,end) row/column range of the
//diagonal block inside A). Member order matters: instances are created
//via brace-initialization {kernel, start_index, end_index}.
struct potrf_kernel{
	boost::compute::kernel kernel; // compiled OpenCL kernel object
	std::size_t start_index;       // argument slot of the "start" kernel argument
	std::size_t end_index;         // argument slot of the "end" kernel argument
};
//Lower triangular - matrix(row-major)
//Generates (but does not yet bind arguments of) the OpenCL kernel that
//computes the unblocked Cholesky factorization of a single TILE_SIZE x
//TILE_SIZE diagonal block A[start:end, start:end] in-place.
//The caller later sets "start"/"end" via the returned argument indices and
//launches the kernel with one work-group (see potrf_recursive).
//TILE_SIZE is supplied as a compile-time define through `options`.
template<class MatA>
potrf_kernel createPotrfDiagBlockKernel(
	matrix_expression<MatA, gpu_tag>& A_unreg, //matrix whose diagonal block is factorized
	char const* options                        //OpenCL build options, must define TILE_SIZE
){
	typedef typename MatA::value_type value_type;

	gpu::detail::meta_kernel k("blas_potrf");
	std::size_t start_index = k.add_arg<std::size_t>("start");//start of block of A
	std::size_t end_index = k.add_arg<std::size_t>("end");//end of Block of A
	auto A = k.register_args(to_functor(A_unreg));
	// Local memory to fit a tile of A and B
	// we store B as column major in local memory
	// we also allocate memory to store results of B
	k << "__local " <<k.decl<value_type>("Asub")<< "[TILE_SIZE][TILE_SIZE+2];\n";//+2 to avoid bank conflicts
	k << "const ulong numWorkers = get_local_size(0);\n";
	//ensure we are not reading out of bounds
	k << "const ulong t = get_group_id(1);\n";
	k << "const ulong curTileA = end-start;\n";

	// Load tile of A into local memory.
	// Reads are clamped to end-1 so a partial tile (curTileA < TILE_SIZE)
	// never reads outside the block; the padded entries are never written back.
	k << "for(ulong i = get_local_id(0); i < TILE_SIZE; i += numWorkers){\n";
	k << "	for(ulong j = get_local_id(1); j < TILE_SIZE; j += numWorkers){\n";
	k << "		Asub[i][j] ="<< A(k.expr<cl_ulong>("min(end-1, start + i)"),k.expr<cl_ulong>("min(end-1, start + j)"))<<";\n";
	k << "	}\n";
	k << "}\n";
	k << "barrier(CLK_LOCAL_MEM_FENCE);\n";
	// Loop over the values of a single tile
	//upper-case
	// The factorization itself is performed serially by the (0,0) work-item:
	// classic unblocked Cholesky — sqrt on the diagonal, scale the column
	// below it, then a rank-one update of the trailing submatrix.
	k << "if(get_local_id(0) == 0 && get_local_id(1) == 0){\n";
	k << "	for(ulong j = 0; j < TILE_SIZE; j++) {\n";
	k << k.decl<value_type>("Ajj") <<"= sqrt(Asub[j][j]);\n";
	k << "		Asub[j][j] = Ajj;\n";
	k << "		for(ulong i = j + 1; i < TILE_SIZE; ++i) {\n";
	k << "			Asub[i][j] /= Ajj;\n";
	k << "		}\n";
	//rank-one update
	k << "		for(ulong k = j + 1; k < TILE_SIZE; k++) {\n";
	k << "			for(ulong i = k; i < TILE_SIZE; ++i) {\n";
	k << "				Asub[i][k] -= Asub[i][j] * Asub[k][j];\n";
	k << "			}\n";
	k << "		}\n";
	k << "	}\n";
	k << "}\n";
	// Synchronise before continuing
	k << "barrier(CLK_LOCAL_MEM_FENCE);\n";
	// Store the final results back in A — only the valid curTileA x curTileA
	// part of the tile is written, so padded entries never leave local memory.
	k << "for(ulong i = get_local_id(0); i < curTileA; i += numWorkers){\n";
	k << "	for(ulong j = get_local_id(1); j < curTileA; j += numWorkers){\n";
	k << A(k.expr<cl_ulong>("(start+i)"),k.expr<cl_ulong>("(start+j)"))<<" = Asub[i][j];\n";
	k << "	}\n";
	k << "}\n";

	boost::compute::kernel kernel = k.compile(A_unreg().queue().get_context(), options);
	return {kernel,start_index,end_index};
}
//main kernel for large matrices
//Recursive blocked Cholesky of the sub-matrix Afull[start:end, start:end].
//Base case: a block of at most block_size (16) rows is factorized directly by
//the precompiled diagonal-block kernel, launched as a single 8x8 work-group
//(global size == local size, so exactly one group runs).
//Recursive case: split the range, factorize the upper-left part, update the
//panel below it and the trailing block, then recurse on the trailing block.
template <typename MatA>
void potrf_recursive(
	matrix_expression<MatA, gpu_tag>& Afull, //full matrix being factorized in place
	std::size_t start,                       //first row/column of the current range
	std::size_t end,                         //one past the last row/column
	potrf_kernel& kernel                     //precompiled diagonal-block kernel
){
	std::size_t block_size = 16; //must match TILE_SIZE the kernel was compiled with
	std::size_t num_workers = 8;
	auto A = subrange(Afull,start,end,start,end);
	std::size_t size = A.size1();
	//if the matrix is small enough call the computation kernel directly for the block
	if(size <= block_size){
		kernel.kernel.set_arg(kernel.start_index, start);
		kernel.kernel.set_arg(kernel.end_index, end);

		std::size_t global_work_size[2] = {num_workers,num_workers};
		std::size_t local_work_size[2] = {num_workers, num_workers};
		Afull().queue().enqueue_nd_range_kernel(kernel.kernel, 2,nullptr, global_work_size, local_work_size);
		return;
	}
	//split roughly in half on a block_size boundary so the base case always
	//receives full tiles except possibly the last one
	std::size_t numBlocks = (A.size1()+block_size-1)/block_size;
	std::size_t split = numBlocks/2*block_size;


	//otherwise run the kernel recursively
	potrf_recursive(Afull,start,start+split, kernel);

	auto Aul = subrange(A,0,split,0,split);   //already-factorized upper-left block
	auto All = subrange(A,split,size,0,split);//panel below the factorized block
	auto Alr = subrange(A,split,size,split,size);//trailing submatrix
	//NOTE(review): presumably All <- All * Aul^{-T} (triangular solve from the
	//right with the transposed factor) — confirm against kernels::trsm docs
	kernels::trsm<upper,right>(trans(Aul), All );
	//NOTE(review): presumably Alr <- Alr - All * All^T (symmetric rank-k
	//downdate) — confirm against kernels::syrk docs
	kernels::syrk<false>(All,Alr, -1.0);
	potrf_recursive(Afull,start+split,end, kernel);
}
138
139template <typename MatA>
140void potrf_dispatch(
141 matrix_expression<MatA, gpu_tag>& A,
142 lower
143){
144 char const* options ="-DTILE_SIZE=16ul";
145 auto kernel = bindings::createPotrfDiagBlockKernel(A,options);
146 potrf_recursive(A,0,A().size1(), kernel);
147}
148template <typename MatA>
149void potrf_dispatch(
150 matrix_expression<MatA, gpu_tag>& A,
151 upper
152){
153 auto Atrans = trans(A);
154 char const* options ="-DTILE_SIZE=16ul";
155 auto kernel = bindings::createPotrfDiagBlockKernel(Atrans,options);
156 potrf_recursive(Atrans,0,Atrans().size1(), kernel);
157}
158}
159
160namespace kernels {
161template <class Triangular, typename MatA>
162std::size_t potrf(
163 matrix_container<MatA, gpu_tag>& A
164){
165 REMORA_SIZE_CHECK(A().size1() == A().size2());
166 bindings::potrf_dispatch(A, Triangular());
167 return 0;
168}
169
170}}
171#endif