trmm.hpp
/*!
 *
 *
 * \brief Default (unoptimized) CPU kernel for triangular matrix-matrix multiplication (trmm).
 *
 * \author O. Krause
 * \date 2010
 *
 *
 * \par Copyright 1995-2015 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://image.diku.dk/shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */

#ifndef REMORA_KERNELS_DEFAULT_TRMM_HPP
#define REMORA_KERNELS_DEFAULT_TRMM_HPP

#include "../../expression_types.hpp"//for matrix_expression
#include "../../proxy_expressions.hpp"//proxies for blocking
#include "../../detail/traits.hpp"//triangular_tag
#include "simd.hpp"
#include "mgemm.hpp" //block macro kernel for the dense sub-products
#include <type_traits> //std::false_type marker for unoptimized, std::common_type
namespace remora{ namespace bindings {

template <typename T>
struct trmm_block_size {
	typedef detail::block<T> block;
	static const unsigned mr = 4; // stripe width for lhs
	static const unsigned nr = 3 * block::max_vector_elements; // stripe width for rhs
	static const unsigned lhs_block_size = 32 * mr;
	static const unsigned rhs_column_size = (1024 / nr) * nr;
	static const unsigned align = 64; // align temporary arrays to this boundary
};
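
// Worked example (illustrative only; block::max_vector_elements is hardware dependent):
// for T = double and 256-bit vector registers, max_vector_elements = 4, hence
// nr = 12, lhs_block_size = 32 * 4 = 128 and rhs_column_size = (1024 / 12) * 12 = 1020.
// trmm_impl below then packs A into 128 x 128 panels and B into 128 x 1020 panels.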

//Packs a lower triangular block of A into the panel buffer p used by mgemm:
//the rows are grouped into stripes of MR rows and each stripe is written column by column.
//Elements above the diagonal and padding rows beyond size1() are set to zero;
//for unit triangular matrices the diagonal is stored as 1.
template <class E, class T, class block_size, bool unit>
void pack_A_triangular(matrix_expression<E, cpu_tag> const& A, T* p, block_size, triangular_tag<false,unit>){
	BOOST_ALIGN_ASSUME_ALIGNED(p, block_size::block::align);

	std::size_t const mc = A().size1();
	std::size_t const kc = A().size2();
	static std::size_t const MR = block_size::mr;
	const std::size_t mp = (mc+MR-1) / MR;

	std::size_t nu = 0;
	for (std::size_t l=0; l<mp; ++l) {
		for (std::size_t j=0; j<kc; ++j) {
			for (std::size_t i = l*MR; i < l*MR + MR; ++i,++nu) {
				if(unit && i == j)
					p[nu] = 1.0;
				else
					p[nu] = ((i<mc) && (i >= j)) ? A()(i,j) : T(0);
			}
		}
	}
}

//Same packing for an upper triangular block of A: elements below the diagonal are zeroed instead.
template <class E, class T, class block_size, bool unit>
void pack_A_triangular(matrix_expression<E, cpu_tag> const& A, T* p, block_size, triangular_tag<true,unit>){
	BOOST_ALIGN_ASSUME_ALIGNED(p, block_size::block::align);

	std::size_t const mc = A().size1();
	std::size_t const kc = A().size2();
	static std::size_t const MR = block_size::mr;
	const std::size_t mp = (mc+MR-1) / MR;

	std::size_t nu = 0;
	for (std::size_t l=0; l<mp; ++l) {
		for (std::size_t j=0; j<kc; ++j) {
			for (std::size_t i = l*MR; i < l*MR + MR; ++i,++nu) {
				if(unit && i == j)
					p[nu] = 1.0;
				else
					p[nu] = ((i<mc) && (i <= j)) ? A()(i,j) : T(0);
			}
		}
	}
}
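
// Packing example (illustrative, MR = 4, a single 4 x 4 unit upper triangular block):
// the buffer p receives the block stripe by stripe and column by column, i.e.
//   p = { 1, 0, 0, 0,                    // column 0
//         A(0,1), 1, 0, 0,               // column 1
//         A(0,2), A(1,2), 1, 0,          // column 2
//         A(0,3), A(1,3), A(2,3), 1 };   // column 3
// Stripes whose rows run past size1() are padded with zeros in the same way.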


//Computes B := A * B in place, where A (e1) is triangular and B (e2) is dense,
//using the block sizes above and the mgemm macro kernel for the dense sub-products.
template <class E1, class E2, class Triangular>
void trmm_impl(
	matrix_expression<E1, cpu_tag> const& e1,
	matrix_expression<E2, cpu_tag> & e2,
	Triangular t
){
	typedef typename std::common_type<typename E1::value_type, typename E2::value_type>::type value_type;
	typedef trmm_block_size<value_type> block_size;

	static const std::size_t AC = block_size::lhs_block_size;
	static const std::size_t BC = block_size::rhs_column_size;

	//obtain uninitialized aligned storage
	boost::alignment::aligned_allocator<value_type,block_size::block::align> allocator;
	value_type* A = allocator.allocate(AC * AC);
	value_type* B = allocator.allocate(AC * BC);

	//figure out number of blocks to use
	const std::size_t M = e1().size1();
	const std::size_t N = e2().size2();
	const std::size_t Ab = (M+AC-1) / AC;
	const std::size_t Bb = (N+BC-1) / BC;

	//get access to raw storage of B
	auto storageB = e2().raw_storage();
	auto Bpointer = storageB.values;
	const std::size_t stride1 = E2::orientation::index_M(storageB.leading_dimension,1);
	const std::size_t stride2 = E2::orientation::index_m(storageB.leading_dimension,1);

	for (std::size_t j = 0; j < Bb; ++j) {//column blocks of B
		std::size_t nc = std::min(BC, N - j * BC);
		//We exploit the triangular structure of A and the fact that the rhs and the result share storage:
		//for upper triangular A the row blocks of B are processed from the first one upwards, because row
		//block k of B only contributes to result rows i <= k and may therefore be overwritten afterwards.
		//For lower triangular A the row blocks are processed in reverse order instead.
		for (std::size_t k0 = 0; k0< Ab; ++k0){//row blocks of B = column blocks of A
			std::size_t k = Triangular::is_upper? k0: Ab-k0-1;
			std::size_t kc = std::min(AC, M - k * AC);
			//read and pack block of B
			auto Bs = subrange(e2, k*AC, k*AC + kc, j * BC, j * BC + nc );
			pack_B_dense(Bs, B, block_size());
			Bs.clear();//it is going to be overwritten with the result

			//apply the packed block of B to all blocks of A that need it. This overwrites the
			//corresponding rows of B, which is safe because the block is already stored in the buffer B.
			std::size_t i_start = Triangular::is_upper? 0: k;
			std::size_t i_end = Triangular::is_upper? k+1: Ab;
			for (std::size_t i = i_start; i < i_end; ++i) {
				std::size_t mc = std::min(AC, M - i * AC);
				auto As = subrange(e1, i * AC, i * AC + mc, k * AC, k * AC + kc);
				if(i == k){
					pack_A_triangular(As, A, block_size(), t);
				}else
					pack_A_dense(As, A, block_size());

				mgemm(
					mc, nc, kc, value_type(1.0), A, B,
					&Bpointer[i*AC * stride1 + j*BC * stride2], stride1, stride2, block_size()
				);
			}
		}
	}
	//free the temporary storage
	allocator.deallocate(A,AC * AC);
	allocator.deallocate(B,AC * BC);
}
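
// Block-level view of the loop above (illustrative, upper triangular A split into Ab = 3 blocks):
// for each column block of B the updates are performed in the order
//   k = 0:  B0  = A00 * B0
//   k = 1:  B0 += A01 * B1,  B1  = A11 * B1
//   k = 2:  B0 += A02 * B2,  B1 += A12 * B2,  B2  = A22 * B2
// so every original block Bk is packed exactly once before the row block it occupies is overwritten.
// For lower triangular A the k loop runs backwards and uses the blocks below the diagonal.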



//main entry point: dispatches the unoptimized default case to the blocked kernel above
template <bool Upper,bool Unit,typename MatA, typename MatB>
void trmm(
	matrix_expression<MatA, cpu_tag> const& A,
	matrix_expression<MatB, cpu_tag>& B,
	std::false_type //unoptimized
){
	REMORA_SIZE_CHECK(A().size1() == A().size2());
	REMORA_SIZE_CHECK(A().size2() == B().size1());

	trmm_impl(A,B, triangular_tag<Upper,Unit>());
}
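
// Example call (a sketch; remora::matrix is assumed to be available via the library's top-level
// headers, and this kernel is normally reached through a higher-level dispatcher rather than
// being called directly):
//
//   remora::matrix<double> A(n, n);  // only the upper triangle is read for Upper = true
//   remora::matrix<double> B(n, m);  // dense right-hand side, overwritten with the result
//   // ... fill A and B ...
//   remora::bindings::trmm<true, false>(A, B, std::false_type()); // computes B <- A * B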

}}

#endif