syrk.hpp
Go to the documentation of this file.
1/*!
2 *
3 *
4 * \brief -
5 *
6 * \author O. Krause
7 * \date 2016
8 *
9 *
10 * \par Copyright 1995-2015 Shark Development Team
11 *
12 * <BR><HR>
13 * This file is part of Shark.
14 * <http://image.diku.dk/shark/>
15 *
16 * Shark is free software: you can redistribute it and/or modify
17 * it under the terms of the GNU Lesser General Public License as published
18 * by the Free Software Foundation, either version 3 of the License, or
19 * (at your option) any later version.
20 *
21 * Shark is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 * GNU Lesser General Public License for more details.
25 *
26 * You should have received a copy of the GNU Lesser General Public License
27 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28 *
29 */
30
31#ifndef REMORA_KERNELS_DEFAULT_SYRK_HPP
32#define REMORA_KERNELS_DEFAULT_SYRK_HPP
33
34#include "../../expression_types.hpp"//for matrix_expression
35#include "../../proxy_expressions.hpp"//for matrix range/transpose
36#include "mgemm.hpp" //block macro kernel for dense syrk
37#include <type_traits> //std::false_type marker for unoptimized, std::common_Type
38
39namespace remora { namespace bindings {
40
41
42template <typename T>
43struct syrk_block_size {
44 typedef detail::block<T> block;
45 static const unsigned mr = 4; // stripe width for E_left
46 static const unsigned nr = mr * block::max_vector_elements; // stripe width for E_right
47 static const unsigned lhs_block_size = 3 * mr * nr;//square block size of M to compute
48 static const unsigned rhs_k_size = 1024;//strip of ks to compute
49};
50template <class E, class Mat, class Triangular>
51void syrk_impl(
52 matrix_expression<E, cpu_tag> const& e,
53 matrix_expression<Mat, cpu_tag>& m,
54 typename Mat::value_type& alpha,
55 Triangular t
56){
57 typedef typename E::value_type value_type;
58 typedef syrk_block_size<value_type> block_size;
59
60 static const std::size_t MC = block_size::lhs_block_size;
61 static const std::size_t EC = block_size::rhs_k_size;
62
63 //obtain uninitialized aligned storage
64 boost::alignment::aligned_allocator<value_type,block_size::block::align> allocatorE;
65 boost::alignment::aligned_allocator<typename Mat::value_type,block_size::block::align> allocatorM;
66 value_type* E_left = allocatorE.allocate(MC * EC);
67 value_type* E_right = allocatorE.allocate(MC * EC);
68 auto M_diagonal_block = allocatorM.allocate(MC * MC);
69
70 //figure out number of blocks to use
71 const std::size_t M = e().size1();
72 const std::size_t K = e().size2();
73 const std::size_t Mb = (M+MC-1) / MC;//we split m in Mb x Mb blocks
74 const std::size_t Eb = (K+EC-1) / EC;//we split B in Mb x Eb blocks
75
76 //get access to raw storage of M
77 auto storageM = m().raw_storage();
78 auto Mpointer = storageM.values;
79 const std::size_t stride1 = Mat::orientation::index_M(storageM.leading_dimension,1);
80 const std::size_t stride2 = Mat::orientation::index_m(storageM.leading_dimension,1);
81
82 for (std::size_t k = 0; k < Eb; ++k) {//column blocks of E
83 std::size_t kc = std::min(EC, K - k * EC);
84 for (std::size_t i = 0; i < Mb; ++i){//row-blocks of M
85 std::size_t mc = std::min(MC, M - i * MC);
86 //load block of the left E into memory
87 auto E_lefts = subrange(e, i * MC, i * MC + mc, k*EC, k*EC + kc );
88 pack_A_dense(E_lefts, E_left, block_size());
89
90 std::size_t start_j = Triangular::is_upper? i : 0;
91 std::size_t end_j = Triangular::is_upper? Mb : i+1;
92 for(std::size_t j = start_j; j < end_j; ++j){//traverse over the blocks that are to be computed
93 std::size_t mc2 = std::min(MC, M - j * MC);
94 //load block of the right E into memory
95 auto E_rights = subrange(e, j * MC, j * MC + mc2, k*EC, k*EC + kc );
96 auto E_rights_trans = trans(E_rights);
97 //~ auto E_rights_trans = trans(subrange(e, j * MC, j * MC + mc2, k*EC, k*EC + kc ));
98 pack_B_dense(E_rights_trans, E_right, block_size());
99
100 if(i==j){//diagonal block: we have to ensure that we only access elements on the diagonal
101 for(std::size_t i0 = 0; i0 != mc; ++i0){
102 for(std::size_t j0 = 0; j0 != mc2; ++j0){
103 M_diagonal_block[i0*MC+j0] = 0.0;
104 }
105 }
106 mgemm(
107 mc, mc2, kc, alpha, E_left, E_right,
108 M_diagonal_block, MC, 1, block_size()
109 );
110 auto M_diagonal = Mpointer + i * MC * stride1 + j * MC * stride2;
111 for(std::size_t i0 = 0; i0 != mc; ++i0){
112 std::size_t start_j0 = Triangular::is_upper? i0 : 0;
113 std::size_t end_j0 = Triangular::is_upper? mc2 : i0+1;
114 for(std::size_t j0 = start_j0; j0 < end_j0; ++j0){
115 M_diagonal[i0*stride1+j0*stride2] += M_diagonal_block[i0*MC+j0];
116 }
117 }
118 }else{
119 mgemm(
120 mc, mc2, kc, alpha, E_left, E_right,
121 &Mpointer[i*MC * stride1 + j*MC * stride2], stride1, stride2, block_size()
122 );
123 }
124 }
125 }
126 }
127 //free storage
128 allocatorE.deallocate(E_left,MC * EC);
129 allocatorE.deallocate(E_right,MC * EC);
130 allocatorM.deallocate(M_diagonal_block, MC * MC);
131}
132
133
134
135//main kernel runs the kernel above recursively and calls gemv
136template <bool Upper, typename M, typename E>
137void syrk(
138 matrix_expression<E, cpu_tag> const& e,
139 matrix_expression<M, cpu_tag>& m,
140 typename M::value_type& alpha,
141 std::false_type //unoptimized
142){
143 REMORA_SIZE_CHECK(m().size1() == m().size2());
144 REMORA_SIZE_CHECK(m().size2() == e().size1());
145
146 syrk_impl(e,m, alpha, triangular_tag<Upper,false>());
147}
148
149}}
150
151#endif