31#ifndef REMORA_KERNELS_DEFAULT_DENSE_GEMM_HPP
32#define REMORA_KERNELS_DEFAULT_DENSE_GEMM_HPP
35#include "../../assignment.hpp"
36#include "../../proxy_expressions.hpp"
41namespace remora{
namespace bindings {
53struct gemm_block_size {
54 typedef detail::block<T> block;
55 static const unsigned mr = 4;
56 static const unsigned nr = 3 * block::max_vector_elements;
57 static const unsigned mc = 128;
58 static const unsigned kc = 512;
59 static const unsigned nc = (1024/nr) * nr;
63struct gemm_block_size<float> {
64 typedef detail::block<float> block;
65 static const unsigned mc = 256;
66 static const unsigned kc = 512;
67 static const unsigned nc = 4096;
68 static const unsigned mr = 4;
69 static const unsigned nr = 16;
73struct gemm_block_size<long double> {
74 typedef detail::block<long double> block;
75 static const unsigned mc = 256;
76 static const unsigned kc = 512;
77 static const unsigned nc = 4096;
78 static const unsigned mr = 1;
79 static const unsigned nr = 4;
83template <
class E1,
class E2,
class Mat>
85 matrix_expression<E1, cpu_tag>
const& e1,
86 matrix_expression<E2, cpu_tag>
const& e2,
87 matrix_expression<Mat, cpu_tag>& m,
88 typename Mat::value_type alpha
90 static_assert(std::is_same<typename Mat::orientation,row_major>::value,
"target matrix must be row major");
91 typedef typename std::common_type<
92 typename E1::value_type,
typename E2::value_type,
typename Mat::value_type
95 typedef gemm_block_size<
96 typename std::common_type<typename E1::value_type, typename E2::value_type>::type
99 static const std::size_t MC = block_size::mc;
100 static const std::size_t NC = block_size::nc;
101 static const std::size_t KC = block_size::kc;
104 boost::alignment::aligned_allocator<value_type,block_size::block::align> allocator;
105 value_type* A = allocator.allocate(MC * KC);
106 value_type* B = allocator.allocate(NC * KC);
108 const std::size_t M = m().size1();
109 const std::size_t N = m().size2();
110 const std::size_t K = e1().size2 ();
111 const std::size_t mb = (M+MC-1) / MC;
112 const std::size_t nb = (N+NC-1) / NC;
113 const std::size_t kb = (K+KC-1) / KC;
115 auto storageM = m().raw_storage();
116 auto C_ = storageM.values;
117 const std::size_t ldc = storageM.leading_dimension;
118 for (std::size_t j=0; j<nb; ++j) {
119 std::size_t nc = std::min(NC, N - j*NC);
121 for (std::size_t l=0; l<kb; ++l) {
122 std::size_t kc = std::min(KC, K - l*KC);
123 auto Bs = subrange(e2, l*KC, l*KC+kc, j*NC, j*NC+nc);
124 pack_B_dense(Bs, B, block_size());
126 for (std::size_t i=0; i<mb; ++i) {
127 std::size_t mc = std::min(MC, M - i*MC);
128 auto As = subrange(e1, i*MC, i*MC+mc, l*KC, l*KC+kc);
129 pack_A_dense(As, A, block_size());
132 mc, nc, kc, alpha, A, B,
133 &C_[i*MC*ldc+j*NC], ldc , 1, block_size()
139 allocator.deallocate(A,MC * KC);
140 allocator.deallocate(B,NC * KC);