31#ifndef REMORA_KERNELS_DEFAULT_TRMM_HPP
32#define REMORA_KERNELS_DEFAULT_TRMM_HPP
34#include "../../expression_types.hpp"
35#include "../../proxy_expressions.hpp"
36#include "../../detail/traits.hpp"
40namespace remora{
namespace bindings {
43struct trmm_block_size {
44 typedef detail::block<T> block;
45 static const unsigned mr = 4;
46 static const unsigned nr = 3 * block::max_vector_elements;
47 static const unsigned lhs_block_size = 32 * mr;
48 static const unsigned rhs_column_size = (1024 / nr) * nr;
49 static const unsigned align = 64;
52template <
class E,
class T,
class block_size,
bool unit>
53void pack_A_triangular(matrix_expression<E, cpu_tag>
const& A, T* p, block_size, triangular_tag<false,unit>){
54 BOOST_ALIGN_ASSUME_ALIGNED(p, block_size::block::align);
56 std::size_t
const mc = A().size1();
57 std::size_t
const kc = A().size2();
58 static std::size_t
const MR = block_size::mr;
59 const std::size_t mp = (mc+MR-1) / MR;
62 for (std::size_t l=0; l<mp; ++l) {
63 for (std::size_t j=0; j<kc; ++j) {
64 for (std::size_t i = l*MR; i < l*MR + MR; ++i,++nu) {
68 p[nu] = ((i<mc) && (i >= j)) ? A()(i,j) : T(0);
74template <
class E,
class T,
class block_size,
bool unit>
75void pack_A_triangular(matrix_expression<E, cpu_tag>
const& A, T* p, block_size, triangular_tag<true,unit>){
76 BOOST_ALIGN_ASSUME_ALIGNED(p, block_size::block::align);
78 std::size_t
const mc = A().size1();
79 std::size_t
const kc = A().size2();
80 static std::size_t
const MR = block_size::mr;
81 const std::size_t mp = (mc+MR-1) / MR;
84 for (std::size_t l=0; l<mp; ++l) {
85 for (std::size_t j=0; j<kc; ++j) {
86 for (std::size_t i = l*MR; i < l*MR + MR; ++i,++nu) {
90 p[nu] = ((i<mc) && (i <= j)) ? A()(i,j) : T(0);
97template <
class E1,
class E2,
class Triangular>
99 matrix_expression<E1, cpu_tag>
const& e1,
100 matrix_expression<E2, cpu_tag> & e2,
103 typedef typename std::common_type<typename E1::value_type, typename E2::value_type>::type value_type;
104 typedef trmm_block_size<value_type> block_size;
106 static const std::size_t AC = block_size::lhs_block_size;
107 static const std::size_t BC = block_size::rhs_column_size;
110 boost::alignment::aligned_allocator<value_type,block_size::block::align> allocator;
111 value_type* A = allocator.allocate(AC * AC);
112 value_type* B = allocator.allocate(AC * BC);
115 const std::size_t M = e1().size1();
116 const std::size_t N = e2().size2();
117 const std::size_t Ab = (M+AC-1) / AC;
118 const std::size_t Bb = (N+BC-1) / BC;
121 auto storageB = e2().raw_storage();
122 auto Bpointer = storageB.values;
123 const std::size_t stride1 = E2::orientation::index_M(storageB.leading_dimension,1);
124 const std::size_t stride2 = E2::orientation::index_m(storageB.leading_dimension,1);
126 for (std::size_t j = 0; j < Bb; ++j) {
127 std::size_t nc = std::min(BC, N - j * BC);
132 for (std::size_t k0 = 0; k0< Ab; ++k0){
133 std::size_t k = Triangular::is_upper? k0: Ab-k0-1;
134 std::size_t kc = std::min(AC, M - k * AC);
136 auto Bs = subrange(e2, k*AC, k*AC + kc, j * BC, j * BC + nc );
137 pack_B_dense(Bs, B, block_size());
142 std::size_t i_start = Triangular::is_upper? 0: k;
143 std::size_t i_end = Triangular::is_upper? k+1: Ab;
144 for (std::size_t i = i_start; i < i_end; ++i) {
145 std::size_t mc = std::min(AC, M - i * AC);
146 auto As = subrange(e1, i * AC, i * AC + mc, k * AC, k * AC + kc);
148 pack_A_triangular(As, A, block_size(), t);
150 pack_A_dense(As, A, block_size());
153 mc, nc, kc, value_type(1.0), A, B,
154 &Bpointer[i*AC * stride1 + j*BC * stride2], stride1, stride2, block_size()
160 allocator.deallocate(A,AC * AC);
161 allocator.deallocate(B,AC * BC);
167template <
bool Upper,
bool Unit,
typename MatA,
typename MatB>
169 matrix_expression<MatA, cpu_tag>
const& A,
170 matrix_expression<MatB, cpu_tag>& B,
173 REMORA_SIZE_CHECK(A().size1() == A().size2());
174 REMORA_SIZE_CHECK(A().size2() == B().size1());
176 trmm_impl(A,B, triangular_tag<Upper,Unit>());