31#ifndef REMORA_KERNELS_DEFAULT_SYRK_HPP
32#define REMORA_KERNELS_DEFAULT_SYRK_HPP
34#include "../../expression_types.hpp"
35#include "../../proxy_expressions.hpp"
39namespace remora {
namespace bindings {
43struct syrk_block_size {
44 typedef detail::block<T> block;
45 static const unsigned mr = 4;
46 static const unsigned nr = mr * block::max_vector_elements;
47 static const unsigned lhs_block_size = 3 * mr * nr;
48 static const unsigned rhs_k_size = 1024;
50template <
class E,
class Mat,
class Triangular>
52 matrix_expression<E, cpu_tag>
const& e,
53 matrix_expression<Mat, cpu_tag>& m,
54 typename Mat::value_type& alpha,
57 typedef typename E::value_type value_type;
58 typedef syrk_block_size<value_type> block_size;
60 static const std::size_t MC = block_size::lhs_block_size;
61 static const std::size_t EC = block_size::rhs_k_size;
64 boost::alignment::aligned_allocator<value_type,block_size::block::align> allocatorE;
65 boost::alignment::aligned_allocator<typename Mat::value_type,block_size::block::align> allocatorM;
66 value_type* E_left = allocatorE.allocate(MC * EC);
67 value_type* E_right = allocatorE.allocate(MC * EC);
68 auto M_diagonal_block = allocatorM.allocate(MC * MC);
71 const std::size_t M = e().size1();
72 const std::size_t K = e().size2();
73 const std::size_t Mb = (M+MC-1) / MC;
74 const std::size_t Eb = (K+EC-1) / EC;
77 auto storageM = m().raw_storage();
78 auto Mpointer = storageM.values;
79 const std::size_t stride1 = Mat::orientation::index_M(storageM.leading_dimension,1);
80 const std::size_t stride2 = Mat::orientation::index_m(storageM.leading_dimension,1);
82 for (std::size_t k = 0; k < Eb; ++k) {
83 std::size_t kc = std::min(EC, K - k * EC);
84 for (std::size_t i = 0; i < Mb; ++i){
85 std::size_t mc = std::min(MC, M - i * MC);
87 auto E_lefts = subrange(e, i * MC, i * MC + mc, k*EC, k*EC + kc );
88 pack_A_dense(E_lefts, E_left, block_size());
90 std::size_t start_j = Triangular::is_upper? i : 0;
91 std::size_t end_j = Triangular::is_upper? Mb : i+1;
92 for(std::size_t j = start_j; j < end_j; ++j){
93 std::size_t mc2 = std::min(MC, M - j * MC);
95 auto E_rights = subrange(e, j * MC, j * MC + mc2, k*EC, k*EC + kc );
96 auto E_rights_trans = trans(E_rights);
98 pack_B_dense(E_rights_trans, E_right, block_size());
101 for(std::size_t i0 = 0; i0 != mc; ++i0){
102 for(std::size_t j0 = 0; j0 != mc2; ++j0){
103 M_diagonal_block[i0*MC+j0] = 0.0;
107 mc, mc2, kc, alpha, E_left, E_right,
108 M_diagonal_block, MC, 1, block_size()
110 auto M_diagonal = Mpointer + i * MC * stride1 + j * MC * stride2;
111 for(std::size_t i0 = 0; i0 != mc; ++i0){
112 std::size_t start_j0 = Triangular::is_upper? i0 : 0;
113 std::size_t end_j0 = Triangular::is_upper? mc2 : i0+1;
114 for(std::size_t j0 = start_j0; j0 < end_j0; ++j0){
115 M_diagonal[i0*stride1+j0*stride2] += M_diagonal_block[i0*MC+j0];
120 mc, mc2, kc, alpha, E_left, E_right,
121 &Mpointer[i*MC * stride1 + j*MC * stride2], stride1, stride2, block_size()
128 allocatorE.deallocate(E_left,MC * EC);
129 allocatorE.deallocate(E_right,MC * EC);
130 allocatorM.deallocate(M_diagonal_block, MC * MC);
136template <
bool Upper,
typename M,
typename E>
138 matrix_expression<E, cpu_tag>
const& e,
139 matrix_expression<M, cpu_tag>& m,
140 typename M::value_type& alpha,
143 REMORA_SIZE_CHECK(m().size1() == m().size2());
144 REMORA_SIZE_CHECK(m().size2() == e().size1());
146 syrk_impl(e,m, alpha, triangular_tag<Upper,false>());