getrf.hpp
Go to the documentation of this file.
1/*!
2 *
3 *
4 * \brief Implements the default implementation of the getrf algorithm
5 *
6 * \author O. Krause
7 * \date 2016
8 *
9 *
10 * \par Copyright 1995-2014 Shark Development Team
11 *
12 * <BR><HR>
13 * This file is part of Shark.
14 * <http://image.diku.dk/shark/>
15 *
16 * Shark is free software: you can redistribute it and/or modify
17 * it under the terms of the GNU Lesser General Public License as published
18 * by the Free Software Foundation, either version 3 of the License, or
19 * (at your option) any later version.
20 *
21 * Shark is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 * GNU Lesser General Public License for more details.
25 *
26 * You should have received a copy of the GNU Lesser General Public License
27 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28 *
29 */
30#ifndef REMORA_KERNELS_DEFAULT_GETRF_HPP
31#define REMORA_KERNELS_DEFAULT_GETRF_HPP
32
33#include "../../proxy_expressions.hpp" //proxies for recursive blocking
34#include "../trsm.hpp" //trsm kernel
35#include "../gemm.hpp" //gemm kernel
36#include "../../permutation.hpp" //pivoting
37#include <vector>
38
39namespace remora{namespace bindings {
40
41//diagonal block kernels
42template<class MatA, class VecP>
43void getrf_block(
44 matrix_expression<MatA, cpu_tag>& A,
45 vector_expression<VecP, cpu_tag>& P,
46 column_major
47) {
48 for(std::size_t j = 0; j != A().size2(); ++j){
49 //search pivot
50 double pivot_value = A()(j,j);
51 P()(j) = j;
52 for(std::size_t i = j+1; i != A().size1(); ++i){
53 if(std::abs(A()(i,j)) > std::abs(pivot_value)){
54 P()(j) = i;
55 pivot_value = A()(i,j);
56 }
57 }
58 if(pivot_value == 0)
59 throw std::invalid_argument("[getrf] Matrix is rank deficient or numerically unstable");
60 //apply row pivoting if needed
61 if(std::size_t(P()(j)) != j){
62 A().swap_rows(j,P()(j));
63 }
64
65 //by definition, L11= 1 and U11=pivot_value
66 //so we only need to transform the current column
67 //And can skip the current row
68 for(std::size_t i = j+1; i != A().size1(); ++i){
69 A()(i,j) /= pivot_value;
70 }
71
72
73 //but we have to apply the outer product to the
74 //lower right matrix
75 for(std::size_t k = j+1; k != A().size2(); ++k){
76 for(std::size_t i = j+1; i != A().size1(); ++i){
77 A()(i,k) -= A()(i,j) * A()(j,k);
78 }
79 }
80 }
81}
82
83
84template<class MatA, class VecP>
85void getrf_block(
86 matrix_expression<MatA, cpu_tag>& A,
87 vector_expression<VecP, cpu_tag>& P,
88 row_major
89) {
90 //there is no way to do fast row pivoting on row-major format.
91 //so copy the block into column major format, perform the decomposition
92 // and copy back.
93 typedef typename MatA::value_type value_type;
94 std::vector<value_type> storage(A().size1() * A().size2());
95 dense_matrix_adaptor<value_type, column_major> colBlock(storage.data(), A().size1(), A().size2());
96 kernels::assign(colBlock, A);
97 getrf_block(colBlock, P, column_major());
98 kernels::assign(A, colBlock);
99}
100
101//todo: row-major needs to copy the block in temporary storage
102
103//main kernel for large matrices
104//we recursively split the matrix into panels along the columns
105//until a panel is small enough. Then we perform the LU decomposition
106//on that panel and update the remaining matrix accordingly.
107// For a given blocking of A
108// | A11 | A12 |
109// A = | --------- |
110// | A21 | A22 |
111// where A11 and A22 are square matrices, the LU decomposition
112// is computed recursively by applying the LU decomposition on block A11
113// to obtain A11= L11*U11 where L is unit-lower triangular and U
114// upper triangular.
115// Then we compute
116// A12<-L11^{-1}A21
117// A21<-A21 U11^{-1}
118// A22<- A22 - A21 * A12
119// and perform the LU-decomposition on A22.
120// in practice the LU decomposition requires a permutation P to be stable
121// where in each iteration the best pivot along the current column is
122// searched. This leads to a panel-wise computation where the
123// computation of A11 and A21 is performed together in order
124// to correctly apply the permutation.
125// afterwards the permutation has to be applied to A12 and A22 before
126// computing their blocks. When A22 is computed, it contains an additional permutation
127// that has to be applied to A21 as well.
128template <typename MatA, typename VecP>
129void getrf_recursive(
130 matrix_expression<MatA, cpu_tag>& A,
131 vector_expression<VecP, cpu_tag>& P,
132 std::size_t start,
133 std::size_t end
134){
135 std::size_t block_size = 4;
136 std::size_t size = end-start;
137 std::size_t end1=A().size1();
138
139 //if the matrix is small enough, call the computation kernel directly for the block
140 if(size <= block_size){
141 auto Ablock = subrange(A, start, end1, start, end); //recursive getrf needs all columns
142 auto Pblock = subrange(P, start, end);
143 getrf_block(Ablock,Pblock, typename MatA::orientation());
144 return;
145 }
146
147 //otherwise run the kernel recursively
148 std::size_t numBlocks = (size + block_size - 1) / block_size;
149 std::size_t split = start + numBlocks/2 * block_size;
150 auto A_2 = subrange(A, start, end1, split, end);
151 auto A11 = subrange(A, start, split, start, split);
152 auto A12 = subrange(A, start, split, split, end);
153 auto A21 = subrange(A, split, end1, start, split);
154 auto A22 = subrange(A, split, end1, split, end);
155 auto P1 = subrange(P, start, split);
156 auto P2 = subrange(P, split, end);
157
158
159 //run recursively on the first block
160 getrf_recursive(A, P, start, split);
161
162 //block A21 is already transformed
163
164 //apply permutation to block A12 and A22
165 swap_rows(P1, A_2);
166 // solve system in A12
167 kernels::trsm<unit_lower, left>(A11, A12);
168
169 //update block A22
170 kernels::gemm(A21, A12, A22, -1);
171
172 //call recursively getrf on A22
173 getrf_recursive(A, P, split, end);
174
175 //permute A21 and update P2 to reflect the full matrix
176 swap_rows(P2, A21);
177 for(auto& p: P2){
178 p += split-start;
179 }
180}
181
182template <typename MatA, typename VecP>
183void getrf(
184 matrix_expression<MatA, cpu_tag>& A,
185 vector_expression<VecP, cpu_tag>& P
186){
187 for(std::size_t i = 0; i != P().size(); ++i){
188 P()(i) = i;
189 }
190 getrf_recursive(A, P, 0, A().size1());
191}
192}}
193#endif