dense_gemm.hpp
Go to the documentation of this file.
1/*!
2 *
3 *
4 * \brief dense matrix matrix multiplication implementation
5 *
6 * \author O. Krause
7 * \date 2016
8 *
9 *
10 * \par Copyright 1995-2015 Shark Development Team
11 *
12 * <BR><HR>
13 * This file is part of Shark.
14 * <http://image.diku.dk/shark/>
15 *
16 * Shark is free software: you can redistribute it and/or modify
17 * it under the terms of the GNU Lesser General Public License as published
18 * by the Free Software Foundation, either version 3 of the License, or
19 * (at your option) any later version.
20 *
21 * Shark is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 * GNU Lesser General Public License for more details.
25 *
26 * You should have received a copy of the GNU Lesser General Public License
27 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28 *
29 */
30
31#ifndef REMORA_KERNELS_DEFAULT_DENSE_GEMM_HPP
32#define REMORA_KERNELS_DEFAULT_DENSE_GEMM_HPP
33
34#include "../gemv.hpp"//for dispatching to gemv
35#include "../../assignment.hpp"//plus_assign
36#include "../../proxy_expressions.hpp"//matrix row,column,transpose,range
37#include "mgemm.hpp" //block macro kernel for dense gemm
38#include <type_traits> //std::common_type
39
40
41namespace remora{namespace bindings {
42
43// Dense Block-GEMM implementation based on boost.ublas
44// written by:
45// Copyright (c) 2016
46// Michael Lehn, Imre Palik
47//
48// Distributed under the Boost Software License, Version 1.0. (See
49// accompanying file LICENSE_1_0.txt or copy at
50// http://www.boost.org/LICENSE_1_0.txt)
51
52template <typename T>
53struct gemm_block_size {
54 typedef detail::block<T> block;
55 static const unsigned mr = 4; // stripe width for lhs
56 static const unsigned nr = 3 * block::max_vector_elements; // stripe width for rhs
57 static const unsigned mc = 128;
58 static const unsigned kc = 512; // stripe length
59 static const unsigned nc = (1024/nr) * nr;
60};
61
62template <>
63struct gemm_block_size<float> {
64 typedef detail::block<float> block;
65 static const unsigned mc = 256;
66 static const unsigned kc = 512; // stripe length
67 static const unsigned nc = 4096;
68 static const unsigned mr = 4; // stripe width for lhs
69 static const unsigned nr = 16; // stripe width for rhs
70};
71
72template <>
73struct gemm_block_size<long double> {
74 typedef detail::block<long double> block;
75 static const unsigned mc = 256;
76 static const unsigned kc = 512; // stripe length
77 static const unsigned nc = 4096;
78 static const unsigned mr = 1; // stripe width for lhs
79 static const unsigned nr = 4; // stripe width for rhs
80};
81
82//-- Dense gemm
83template <class E1, class E2, class Mat>
84void dense_gemm(
85 matrix_expression<E1, cpu_tag> const& e1,
86 matrix_expression<E2, cpu_tag> const& e2,
87 matrix_expression<Mat, cpu_tag>& m,
88 typename Mat::value_type alpha
89){
90 static_assert(std::is_same<typename Mat::orientation,row_major>::value,"target matrix must be row major");
91 typedef typename std::common_type<
92 typename E1::value_type, typename E2::value_type, typename Mat::value_type
93 >::type value_type;
94
95 typedef gemm_block_size<
96 typename std::common_type<typename E1::value_type, typename E2::value_type>::type
97 > block_size;
98
99 static const std::size_t MC = block_size::mc;
100 static const std::size_t NC = block_size::nc;
101 static const std::size_t KC = block_size::kc;
102
103 //obtain uninitialized aligned storage
104 boost::alignment::aligned_allocator<value_type,block_size::block::align> allocator;
105 value_type* A = allocator.allocate(MC * KC);
106 value_type* B = allocator.allocate(NC * KC);
107
108 const std::size_t M = m().size1();
109 const std::size_t N = m().size2();
110 const std::size_t K = e1().size2 ();
111 const std::size_t mb = (M+MC-1) / MC;
112 const std::size_t nb = (N+NC-1) / NC;
113 const std::size_t kb = (K+KC-1) / KC;
114
115 auto storageM = m().raw_storage();
116 auto C_ = storageM.values;
117 const std::size_t ldc = storageM.leading_dimension;
118 for (std::size_t j=0; j<nb; ++j) {
119 std::size_t nc = std::min(NC, N - j*NC);
120
121 for (std::size_t l=0; l<kb; ++l) {
122 std::size_t kc = std::min(KC, K - l*KC);
123 auto Bs = subrange(e2, l*KC, l*KC+kc, j*NC, j*NC+nc);
124 pack_B_dense(Bs, B, block_size());
125
126 for (std::size_t i=0; i<mb; ++i) {
127 std::size_t mc = std::min(MC, M - i*MC);
128 auto As = subrange(e1, i*MC, i*MC+mc, l*KC, l*KC+kc);
129 pack_A_dense(As, A, block_size());
130
131 mgemm(
132 mc, nc, kc, alpha, A, B,
133 &C_[i*MC*ldc+j*NC], ldc , 1, block_size()
134 );
135 }
136 }
137 }
138 //free storage
139 allocator.deallocate(A,MC * KC);
140 allocator.deallocate(B,NC * KC);
141}
142
143}}
144#endif