30#ifndef REMORA_KERNELS_GPU_POTRF_HPP
31#define REMORA_KERNELS_GPU_POTRF_HPP
33#include "../../proxy_expressions.hpp"
37namespace remora{
namespace bindings {
41 boost::compute::kernel kernel;
42 std::size_t start_index;
43 std::size_t end_index;
47potrf_kernel createPotrfDiagBlockKernel(
48 matrix_expression<MatA, gpu_tag>& A_unreg,
51 typedef typename MatA::value_type value_type;
53 gpu::detail::meta_kernel k(
"blas_potrf");
54 std::size_t start_index = k.add_arg<std::size_t>(
"start");
55 std::size_t end_index = k.add_arg<std::size_t>(
"end");
56 auto A = k.register_args(to_functor(A_unreg));
60 k <<
"__local " <<k.decl<value_type>(
"Asub")<<
"[TILE_SIZE][TILE_SIZE+2];\n";
61 k <<
"const ulong numWorkers = get_local_size(0);\n";
63 k <<
"const ulong t = get_group_id(1);\n";
64 k <<
"const ulong curTileA = end-start;\n";
67 k <<
"for(ulong i = get_local_id(0); i < TILE_SIZE; i += numWorkers){\n";
68 k <<
" for(ulong j = get_local_id(1); j < TILE_SIZE; j += numWorkers){\n";
69 k <<
" Asub[i][j] ="<< A(k.expr<cl_ulong>(
"min(end-1, start + i)"),k.expr<cl_ulong>(
"min(end-1, start + j)"))<<
";\n";
72 k <<
"barrier(CLK_LOCAL_MEM_FENCE);\n";
75 k <<
"if(get_local_id(0) == 0 && get_local_id(1) == 0){\n";
76 k <<
" for(ulong j = 0; j < TILE_SIZE; j++) {\n";
77 k << k.decl<value_type>(
"Ajj") <<
"= sqrt(Asub[j][j]);\n";
78 k <<
" Asub[j][j] = Ajj;\n";
79 k <<
" for(ulong i = j + 1; i < TILE_SIZE; ++i) {\n";
80 k <<
" Asub[i][j] /= Ajj;\n";
83 k <<
" for(ulong k = j + 1; k < TILE_SIZE; k++) {\n";
84 k <<
" for(ulong i = k; i < TILE_SIZE; ++i) {\n";
85 k <<
" Asub[i][k] -= Asub[i][j] * Asub[k][j];\n";
91 k <<
"barrier(CLK_LOCAL_MEM_FENCE);\n";
93 k <<
"for(ulong i = get_local_id(0); i < curTileA; i += numWorkers){\n";
94 k <<
" for(ulong j = get_local_id(1); j < curTileA; j += numWorkers){\n";
95 k << A(k.expr<cl_ulong>(
"(start+i)"),k.expr<cl_ulong>(
"(start+j)"))<<
" = Asub[i][j];\n";
99 boost::compute::kernel kernel = k.compile(A_unreg().queue().get_context(), options);
100 return {kernel,start_index,end_index};
103template <
typename MatA>
105 matrix_expression<MatA, gpu_tag>& Afull,
110 std::size_t block_size = 16;
111 std::size_t num_workers = 8;
112 auto A = subrange(Afull,start,end,start,end);
113 std::size_t size = A.size1();
115 if(size <= block_size){
116 kernel.kernel.set_arg(kernel.start_index, start);
117 kernel.kernel.set_arg(kernel.end_index, end);
119 std::size_t global_work_size[2] = {num_workers,num_workers};
120 std::size_t local_work_size[2] = {num_workers, num_workers};
121 Afull().queue().enqueue_nd_range_kernel(kernel.kernel, 2,
nullptr, global_work_size, local_work_size);
124 std::size_t numBlocks = (A.size1()+block_size-1)/block_size;
125 std::size_t split = numBlocks/2*block_size;
129 potrf_recursive(Afull,start,start+split, kernel);
131 auto Aul = subrange(A,0,split,0,split);
132 auto All = subrange(A,split,size,0,split);
133 auto Alr = subrange(A,split,size,split,size);
134 kernels::trsm<upper,right>(trans(Aul), All );
135 kernels::syrk<false>(All,Alr, -1.0);
136 potrf_recursive(Afull,start+split,end, kernel);
139template <
typename MatA>
141 matrix_expression<MatA, gpu_tag>& A,
144 char const* options =
"-DTILE_SIZE=16ul";
145 auto kernel = bindings::createPotrfDiagBlockKernel(A,options);
146 potrf_recursive(A,0,A().size1(), kernel);
148template <
typename MatA>
150 matrix_expression<MatA, gpu_tag>& A,
153 auto Atrans = trans(A);
154 char const* options =
"-DTILE_SIZE=16ul";
155 auto kernel = bindings::createPotrfDiagBlockKernel(Atrans,options);
156 potrf_recursive(Atrans,0,Atrans().size1(), kernel);
161template <
class Triangular,
typename MatA>
163 matrix_container<MatA, gpu_tag>& A
165 REMORA_SIZE_CHECK(A().size1() == A().size2());
166 bindings::potrf_dispatch(A, Triangular());