//===========================================================================
/*!
 *
 * \brief Dense matrix-matrix multiplication (gemm) kernel for the gpu backend
 *
 * \author O. Krause
 * \date 2016
 *
 *
 * \par Copyright 1995-2015 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://image.diku.dk/shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */
//===========================================================================
#ifndef REMORA_KERNELS_GPU_GEMM_HPP
#define REMORA_KERNELS_GPU_GEMM_HPP

#include "../../expression_types.hpp"
#include "../../detail/traits.hpp"
#include <boost/compute/functional/operator.hpp> //for multiplies

namespace remora{ namespace kernels{

// C <- alpha * A * B + C
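// (beta is fixed to 1: the kernel accumulates into C. If a different scaling of C
// is required, scale C before calling this kernel.)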
template <typename MatA, typename MatB, typename MatC>
void gemm(
	matrix_expression<MatA, gpu_tag> const& A_unreg,
	matrix_expression<MatB, gpu_tag> const& B_unreg,
	matrix_expression<MatC, gpu_tag>& C_unreg,
	typename MatC::value_type const& alpha
) {
	REMORA_SIZE_CHECK(A_unreg().size1() == C_unreg().size1());
	REMORA_SIZE_CHECK(B_unreg().size2() == C_unreg().size2());
	REMORA_SIZE_CHECK(A_unreg().size2() == B_unreg().size1());

	// TUNING VARIABLES:
	// TILE_SIZE: width and height of a tile computed in C
	// TILE_SIZE_K: length of the tiles loaded from A and B, i.e. A is loaded as a
	// (TILE_SIZE, TILE_SIZE_K) matrix and B as a (TILE_SIZE_K, TILE_SIZE) matrix
	// BLOCK_SIZE: size of a subblock computed by a single work item (must divide TILE_SIZE)
	// it holds TILE_SIZE = get_local_size(i) * BLOCK_SIZE for i=0,1
	// note that BLOCK_SIZE increases the number of registers needed per work item:
	// roughly O(BLOCK_SIZE^2) registers are needed (+ overhead), and register
	// spilling can be deadly for performance.
	// local memory usage is 2*TILE_SIZE_K*(TILE_SIZE+2)*sizeof(MatC::value_type),
	// where the +2 is padding against bank conflicts.
	// increasing TILE_SIZE improves the ratio of computation to memory accesses.
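	// for the defaults below (TILE_SIZE=32, TILE_SIZE_K=16, BLOCK_SIZE=4) this gives,
	// per work group: 8x8 = 64 work items, a 4x4 block of accumulators per work item,
	// and 2*16*(32+2)*sizeof(value_type) bytes of local memory (4.25 KiB for float).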
	std::size_t BLOCK_SIZE = 4;
	std::size_t TILE_SIZE = 32;
	std::size_t NUM_WORKERS = TILE_SIZE / BLOCK_SIZE;
	//~ std::size_t TILE_SIZE_K = 16;
	char const* options = "-DTILE_SIZE=32ul -DBLOCK_SIZE=4ul -DTILE_SIZE_K=16ul";
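	// note: the -D values above must stay in sync with the host-side constants;
	// TILE_SIZE_K is only used inside the kernel source, hence it is defined here only.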
	typedef typename MatC::value_type value_type;

	gpu::detail::meta_kernel k("blas_gemm");
	std::size_t M_index = k.add_arg<std::size_t>("M");
	std::size_t N_index = k.add_arg<std::size_t>("N");
	std::size_t K_index = k.add_arg<std::size_t>("K");
	std::size_t alpha_index = k.add_arg<value_type>("alpha");
	auto A = k.register_args(to_functor(A_unreg));
	auto B = k.register_args(to_functor(B_unreg));
	auto C = k.register_args(to_functor(C_unreg));

	// Local memory to fit a tile of A and a tile of B;
	// we transpose A locally in memory
	k << "__local " << k.decl<value_type>("Asub") << "[TILE_SIZE_K][TILE_SIZE+2];\n";//+2 to avoid bank conflicts
	k << "__local " << k.decl<value_type>("Bsub") << "[TILE_SIZE_K][TILE_SIZE+2];\n";//+2 to avoid bank conflicts
	k << " const ulong numWorkers = get_local_size(0);\n";
	// Initialise the accumulation registers;
	// here the subblock of C for this work item is stored.
	// blocks are not contiguous but strided so that
	// coalesced writes to C are possible,
	// e.g. with 8x8 work items and a 2x2 BLOCK_SIZE, the first work item (local id (0,0))
	// owns the local tile elements (0,0), (0,8), (8,0), (8,8); all other work items'
	// elements are obtained by adding (get_local_id(0), get_local_id(1)) to these coordinates.
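	// in general, acc[wm][wn] holds the tile element
	// (get_local_id(0) + wm * numWorkers, get_local_id(1) + wn * numWorkers),
	// which is also the mapping used by the write-back loop at the end of the kernel.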
	k << k.decl<value_type>("acc") << "[BLOCK_SIZE][BLOCK_SIZE];\n";
	k << "for (ulong wm=0; wm<BLOCK_SIZE; wm++){\n";
	k << " for (ulong wn=0; wn<BLOCK_SIZE; wn++){\n";
	k << " acc[wm][wn] = 0.0f;\n";
	k << " }\n";
	k << "}\n";

	// Loop over all tiles
	k << "ulong numTiles = (K+TILE_SIZE_K-1)/TILE_SIZE_K;\n";
	k << "for (ulong t=0; t<numTiles; t++){\n";

	// ensure we are not reading out of bounds in K
	k << " const ulong curTileK = min(TILE_SIZE_K, K - t*TILE_SIZE_K);\n";
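	// e.g. for K=100 and TILE_SIZE_K=16 we get numTiles = 7, and the last tile
	// covers only curTileK = 100 - 6*16 = 4 columns of A (rows of B).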

	// Load one tile of A (transposed) and one tile of B into local memory, using padding
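	// row/column indices are clamped via min() so that reads past M or N stay in bounds;
	// the duplicated values loaded this way are never written back to C.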
	k << " for(ulong i = get_local_id(0); i < TILE_SIZE; i += numWorkers){\n";
	k << " for(ulong k = get_local_id(1); k < curTileK; k += numWorkers){\n";
	k << " ulong ktile = t * TILE_SIZE_K + k;\n";
	k << " Asub[k][i] = " << A(k.expr<cl_ulong>("min(M-1,TILE_SIZE * get_group_id(0)+i)"), k.expr<cl_ulong>("ktile")) << ";\n";
	k << " Bsub[k][i] = " << B(k.expr<cl_ulong>("ktile"), k.expr<cl_ulong>("min(N-1,TILE_SIZE * get_group_id(1)+i)")) << ";\n";
	k << " }\n";
	k << " }\n";

	// Synchronise to make sure the tile is loaded
	k << " barrier(CLK_LOCAL_MEM_FENCE);\n";

	// Loop over the values of a single tile
	// by computing outer products into the local accumulation registers acc
	k << " for (ulong k=0; k<curTileK; k++){\n";
	// Cache the values of Bsub in registers to save local memory lookups
	k << k.decl<value_type>("Breg") << "[BLOCK_SIZE];\n";
	k << " for (ulong wn=0; wn<BLOCK_SIZE; wn++){\n";
	k << " Breg[wn] = Bsub[k][get_local_id(1) + wn * numWorkers];\n";
	k << " }\n";

	// Perform the computation
	k << " for (ulong wm = 0; wm<BLOCK_SIZE; wm++){\n";
	k << k.decl<value_type>("Areg") << " = Asub[k][get_local_id(0) + wm * numWorkers];\n";
	k << " for (ulong wn=0; wn<BLOCK_SIZE; wn++){\n";
	k << " acc[wm][wn] += Areg * Breg[wn];\n";
	k << " }\n";
	k << " }\n";
	k << " }\n";

	// Synchronise before loading the next tile
	k << " barrier(CLK_LOCAL_MEM_FENCE);\n";
	k << "}\n";

	// Store the final results in C
	k << "const ulong maxCi = min(TILE_SIZE, M - get_group_id(0) * TILE_SIZE);\n";
	k << "const ulong maxCj = min(TILE_SIZE, N - get_group_id(1) * TILE_SIZE);\n";
	k << "const ulong offTileCi = TILE_SIZE * get_group_id(0);\n";
	k << "const ulong offTileCj = TILE_SIZE * get_group_id(1);\n";
	k << "ulong wm = 0;\n";
	k << "for (ulong i = get_local_id(0); i < maxCi; i += numWorkers, wm++){\n";
	k << " ulong wn = 0;\n";
	k << " for (ulong j = get_local_id(1); j < maxCj; j += numWorkers, wn++){\n";
	k << C(k.expr<cl_ulong>("(offTileCi + i)"), k.expr<cl_ulong>("(offTileCj + j)")) << " += alpha * acc[wm][wn];\n";
	k << " }\n";
	k << "}\n";


	boost::compute::kernel kernel = k.compile(C_unreg().queue().get_context(), options);

	// enqueue kernel with kernel args
	kernel.set_arg(M_index, C_unreg().size1());
	kernel.set_arg(N_index, C_unreg().size2());
	kernel.set_arg(K_index, A_unreg().size2());
	kernel.set_arg(alpha_index, alpha);

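	// each NUM_WORKERS x NUM_WORKERS work group computes one TILE_SIZE x TILE_SIZE tile of C,
	// so ceil(size/TILE_SIZE) groups are launched per dimension, e.g. for a 100x100 C:
	// ceil(100/32) = 4 groups, i.e. a global size of 4 * 8 = 32 work items per dimension.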
	std::size_t global_work_size[2] = {
		(C_unreg().size1() + TILE_SIZE - 1) / TILE_SIZE * NUM_WORKERS,
		(C_unreg().size2() + TILE_SIZE - 1) / TILE_SIZE * NUM_WORKERS
	};
	std::size_t local_work_size[2] = {NUM_WORKERS, NUM_WORKERS};
	C_unreg().queue().enqueue_nd_range_kernel(kernel, 2, nullptr, global_work_size, local_work_size);
}
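
// Usage sketch (hypothetical setup; any remora gpu matrix expressions with matching
// dimensions can be passed):
//   remora::matrix<float, remora::row_major, remora::gpu_tag> A(m, k), B(k, n), C(m, n);
//   // ... fill A and B, initialise C (e.g. to zero) ...
//   kernels::gemm(A, B, C, 1.0f); // C += 1.0f * A * B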

}}

#endif