mgemm.hpp
Go to the documentation of this file.
1/*!
2 *
3 *
4 * \brief The mgemm macro kernel used for implementing gemm
5 *
6 * \author O. Krause
7 * \date 2016
8 *
9 *
10 * \par Copyright 1995-2015 Shark Development Team
11 *
12 * <BR><HR>
13 * This file is part of Shark.
14 * <http://image.diku.dk/shark/>
15 *
16 * Shark is free software: you can redistribute it and/or modify
17 * it under the terms of the GNU Lesser General Public License as published
18 * by the Free Software Foundation, either version 3 of the License, or
19 * (at your option) any later version.
20 *
21 * Shark is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 * GNU Lesser General Public License for more details.
25 *
26 * You should have received a copy of the GNU Lesser General Public License
27 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28 *
29 */
30
31#ifndef REMORA_KERNELS_DEFAULT_MGEMM_HPP
32#define REMORA_KERNELS_DEFAULT_MGEMM_HPP
33
34#include "simd.hpp"
35#include <algorithm>//std::fill
36
37
38namespace remora{namespace bindings {
39
40// Block-GEMM implementation based on boost.ublas
41// written by:
42// Copyright (c) 2016
43// Michael Lehn, Imre Palik
44//
45// Distributed under the Boost Software License, Version 1.0. (See
46// accompanying file LICENSE_1_0.txt or copy at
47// http://www.boost.org/LICENSE_1_0.txt)
48
49//-- Micro Kernel For Dense operations----------------------------------------------------------
50template <class block_size, class T, class TC>
51void ugemm(
52 std::size_t kc, TC alpha, T const* A, T const* B,
53 TC* C, std::size_t stride1, std::size_t stride2
54){
55 BOOST_ALIGN_ASSUME_ALIGNED(A, block_size::block::align);
56 BOOST_ALIGN_ASSUME_ALIGNED(B, block_size::block::align);
57
58 typedef typename block_size::block::type vx;
59 static const std::size_t vector_length = block_size::block::vector_elements;
60 static const std::size_t vecNR = block_size::nr/vector_length;
61#ifdef REMORA_USE_SIMD
62 vx P[block_size::mr * vecNR] = {};
63#else
64 typename std::aligned_storage<sizeof(vx[block_size::mr*vecNR]),block_size::block::align>::type Pa;
65 T* P = reinterpret_cast<T*>(&Pa);
66 for (std::size_t c = 0; c < block_size::mr*vecNR; c++)
67 P[c] = 0;
68#endif
69
70
71 // perform the matrix-matrix product as outer product
72 // of rows of A and B
73 vx const* b = (vx const*)B;
74 for (std::size_t l=0; l<kc; ++l) {
75 for (std::size_t i=0; i<block_size::mr; ++i) {
76 for (std::size_t j=0; j<vecNR; ++j) {
77 P[i * vecNR+j] += A[i]*b[j];
78 }
79 }
80 A += block_size::mr;
81 b += vecNR;
82 }
83 //multiply with alpha if necessary
84 if (alpha!=TC(1)) {
85 for (std::size_t i=0; i<block_size::mr; ++i) {
86 for (std::size_t j=0; j< vecNR; ++j) {
87 P[i*vecNR+j] *= alpha;
88 }
89 }
90 }
91
92 //add result to C
93 T const* p = (T const*) P;
94 for (std::size_t i=0; i<block_size::mr; ++i) {
95 for (std::size_t j=0; j<block_size::nr; ++j) {
96 C[i * stride1+j * stride2] += p[i*block_size::nr+j];
97 }
98 }
99}
100
101
102// Macro Kernel for two densly packed Blocks
103template <class T, class TC, class block_size>
104void mgemm(
105 std::size_t mc, std::size_t nc, std::size_t kc, TC alpha,
106 T const* A, T const* B, TC *C,
107 std::size_t stride1, std::size_t stride2, block_size
108){
109 static std::size_t const MR = block_size::mr;
110 static std::size_t const NR = block_size::nr;
111 std::size_t const mp = (mc+MR-1) / MR;
112 std::size_t const np = (nc+NR-1) / NR;
113
114 for (std::size_t j=0; j<np; ++j) {
115 std::size_t const nr = std::min(NR, nc - j*NR);
116
117 for (std::size_t i=0; i<mp; ++i) {
118 std::size_t const mr = std::min(MR, mc - i*MR);
119 auto CBlockStart = C+i*MR*stride1+j*NR*stride2;
120 if (mr==MR && nr==NR) {
121 ugemm<block_size>(
122 kc, alpha,
123 &A[i*kc*MR], &B[j*kc*NR],
124 CBlockStart, stride1, stride2
125 );
126 } else {
127 TC CTempBlock[MR*NR];
128 std::fill_n(CTempBlock, MR*NR, T(0));
129 ugemm<block_size>(
130 kc, alpha,
131 &A[i*kc*MR], &B[j*kc*NR],
132 CTempBlock, NR, 1
133 );
134
135 for (std::size_t i0=0; i0<mr; ++i0){
136 for (std::size_t j0=0; j0<nr; ++j0) {
137 CBlockStart[i0*stride1+j0 * stride2] += CTempBlock[i0*NR+j0];
138 }
139 }
140 }
141 }
142 }
143}
144
145
146//-- Packing blocks ------------------------------------------------------------
147template <class E, class T, class block_size>
148void pack_A_dense(matrix_expression<E, cpu_tag> const& A, T* p, block_size)
149{
150 BOOST_ALIGN_ASSUME_ALIGNED(p, block_size::block::align);
151
152 std::size_t const mc = A().size1();
153 std::size_t const kc = A().size2();
154 static std::size_t const MR = block_size::mr;
155 const std::size_t mp = (mc+MR-1) / MR;
156
157 std::size_t nu = 0;
158 for (std::size_t l=0; l<mp; ++l) {
159 for (std::size_t j=0; j<kc; ++j) {
160 for (std::size_t i = l*MR; i < l*MR + MR; ++i,++nu) {
161 p[nu] = (i<mc) ? A()(i,j) : T(0);
162 }
163 }
164 }
165}
166
167
168template <class E, class T, class block_size>
169void pack_B_dense(matrix_expression<E, cpu_tag> const& B, T* p, block_size)
170{
171 BOOST_ALIGN_ASSUME_ALIGNED(p, block_size::block::align);
172
173 std::size_t const kc = B ().size1();
174 std::size_t const nc = B ().size2();
175 static std::size_t const NR = block_size::nr;
176 std::size_t const np = (nc+NR-1) / NR;
177
178 std::size_t nu = 0;
179 for (std::size_t l=0; l<np; ++l) {
180 for (std::size_t i=0; i<kc; ++i) {
181 for (std::size_t j = l*NR; j < l*NR + NR; ++j,++nu){
182 p[nu] = (j<nc) ? B()(i,j) : T(0);
183 }
184 }
185 }
186}
187
188}}
189
190#endif