conv2d.hpp
/*!
 *
 *
 * \brief Implements the 2D convolution kernel for gpus via clBLAST
 *
 * \author O. Krause
 * \date 2016
 *
 *
 * \par Copyright 1995-2015 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://image.diku.dk/shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark. If not, see <http://www.gnu.org/licenses/>.
 *
 */

#ifndef REMORA_KERNELS_CLBLAST_CONV2D_HPP
#define REMORA_KERNELS_CLBLAST_CONV2D_HPP

#include "../../expression_types.hpp"
#include "../../detail/traits.hpp"
#include <clblast.h>
#include <cassert>
#include <vector>
namespace remora{namespace bindings {

template<class E1, class E2, class M>
void conv2d(
	matrix_expression<E1, gpu_tag> const& images,
	vector_expression<E2, gpu_tag> const& filter,
	matrix_expression<M, gpu_tag>& outputs,
	std::size_t num_channels,
	std::size_t num_filters,
	std::size_t image_height,
	std::size_t image_width,
	std::size_t filter_height,
	std::size_t filter_width,
	std::size_t padding_height,
	std::size_t padding_width
){
	static_assert(std::is_same<typename E1::orientation, row_major>::value, "[conv2d] Column major not implemented");
	static_assert(std::is_same<typename E1::storage_type::storage_tag, continuous_dense_tag>::value, "[conv2d] Subranges not implemented");
	static_assert(std::is_same<typename M::orientation, row_major>::value, "[conv2d] Column major not implemented");

	static_assert(std::is_same<typename E1::value_type, typename E2::value_type>::value, "[conv2d] Arguments do not have the same value type");
	static_assert(std::is_same<typename E1::value_type, typename M::value_type>::value, "[conv2d] Arguments do not have the same value type");

	static_assert(std::is_base_of<dense_tag, typename E1::evaluation_category::tag>::value, "[conv2d] images is not dense");
	static_assert(std::is_base_of<continuous_dense_tag, typename E2::storage_type::storage_tag>::value, "[conv2d] filter does not have continuous dense storage layout");
	static_assert(std::is_base_of<continuous_dense_tag, typename M::storage_type::storage_tag>::value, "[conv2d] outputs does not have continuous dense storage layout");


	typedef typename E1::value_type value_type;

	//pre-evaluate images into a temporary if necessary
	auto const& images_eval = eval_expression(images);

	//~ if(padding_height != 0 || padding_width != 0){
		//~ //note: the below is simply creating a temporary matrix and copying the image into the subranges.
		//~ //However we cannot use matrix proxy expressions directly and instead need to rely on this slightly
		//~ //nasty loop
		//~ auto context = outputs().queue().get_context();
		//~ std::size_t padded_width = image_width+padding_width;
		//~ std::size_t paddedSize = padded_width * (image_height + padding_height) * num_channels;
		//~ //create storage for the padded images
		//~ gpu::dense_matrix_storage<value_type, dense_tag> storage = {{context, images().size1() * paddedSize},0,paddedSize};
		//~ dense_matrix_adaptor<value_type, row_major, gpu_tag> padded_images(storage,images().size1(), paddedSize);
		//~ padded_images.clear();//initialize padding to 0.
		//~ for(std::size_t im = 0; im != images().size1(); ++im){
			//~ //cut out the im-th image starting at the first non-zero element,
			//~ //i.e. we skip the initial 0-rows and the first few zero elements of the first real row
			//~ std::size_t offset = im * paddedSize + ((padding_height/2) * padded_width + padding_width/2) * num_channels;
			//~ gpu::dense_matrix_storage<value_type, dense_tag> sub_storage = {storage.context, offset, padded_width * num_channels};
			//~ dense_matrix_adaptor<value_type, row_major, gpu_tag> sub_image(sub_storage,image_height, image_width * num_channels);
			//~ noalias(sub_image) = row(images,im);
		//~ }

		//~ //recurse on the padded images; the remaining padding is zero
		//~ conv2d(
			//~ padded_images, filter, outputs,
			//~ num_channels, num_filters,
			//~ image_height + padding_height, image_width + padding_width,
			//~ filter_height, filter_width, 0, 0
		//~ );
		//~ return;
	//~ }



	std::size_t output_height = (image_height - filter_height + 1 + padding_height);
	std::size_t output_width = (image_width - filter_width + 1 + padding_width);
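	//for example (hypothetical sizes, not from the original code): a 32x32 image and a
	//5x5 filter with no padding give output_height = output_width = 32 - 5 + 1 = 28.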
	std::size_t filter_size = filter_width * filter_height * num_channels;
	std::size_t num_images = images().size1();

	REMORA_SIZE_CHECK(outputs().size1() == images().size1());
	REMORA_SIZE_CHECK(outputs().size2() == num_filters * output_width * output_height);
	REMORA_SIZE_CHECK(images().size2() == num_channels * image_width * image_height);
	REMORA_SIZE_CHECK(filter().size() == num_filters * filter_size);

	// For this implementation, we use the GemmBatched extension to BLAS implemented by clBLAST.
	// GemmBatched performs a set of matrix-matrix multiplications
	// C_i = alpha_i * A_i * B_i + beta_i * C_i
	// in parallel.
	// To use it, note that the filter is stored in memory as
	// num_filters x filter_height x filter_width x num_channels,
	// where num_channels is the fastest index (i.e. continuous in memory)
	// and num_filters the slowest. The dense layout allows us to
	// take matrix-sized slices of this tensor.
	// The (filter_width x num_channels)-block is continuous in memory in both
	// filter and image, so we can linearize it and treat it as a
	// (filter_width * num_channels)-dimensional vector.
	// Fixing one value of filter_height then yields a 3-tensor, which is linearized
	// as a matrix of size num_filters x (filter_width * num_channels).
	// This gives one such matrix for each value of filter_height, which is usually not too large.
	// In the image we can extract matching slices of size (filter_width * num_channels)
	// simply by moving this vector element by element over the matrix.
	// We additionally parallelize over the images to obtain sub-matrices of size
	// num_images x (filter_width * num_channels).
	// Thus, for each filter slice, we obtain output_width * output_height
	// matrix-matrix multiplications, which are repeated for every slice of the filter.
	// We compute the slices one after another and accumulate them in the outputs matrix.
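	// A small worked example (hypothetical sizes, not part of the original code):
	// with num_channels = 1, num_filters = 2, a 4x4 image and a 2x2 filter, we get
	// output_height = output_width = 3, i.e. 9 batched multiplications per filter row.
	// Each multiplication is (num_images x 2) * (2 x 2): a length-2 image slice
	// (filter_width * num_channels) against the two filters, accumulated into the
	// 2 output values of one output pixel. For the second filter row (k = 1) the image
	// offsets shift down by one image row and the filter offsets advance by
	// filter_width * num_channels = 2.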

	//obtain matrix storage
	auto storage_images = images_eval.raw_storage();
	auto storage_filter = filter().raw_storage();
	auto storage_outputs = outputs().raw_storage();

	//setup required constants for offsets
	std::size_t num_multiplications = output_width * output_height;
	//alphas and betas are all 1, so every GemmBatched call accumulates onto the current contents of outputs
	std::vector<value_type> alphas(num_multiplications,value_type(1));
	std::vector<value_type> const& betas = alphas;
	//outputs offsets are constant over all filter rows
	std::vector<std::size_t> outputs_offsets(num_multiplications);
	std::vector<std::size_t> im_offsets(num_multiplications,0);
	std::vector<std::size_t> filter_offsets(num_multiplications,0);
	for(std::size_t i = 0; i != output_height; ++i){
		for(std::size_t j = 0; j != output_width; ++j){
			std::size_t index = i * output_width + j;
			outputs_offsets[index] = index * num_filters;
		}
	}

	for(std::size_t k = 0; k != filter_height; ++k){

		//compute the image offset of the k-th filter row for every output pixel
		for(std::size_t i = 0; i != output_height; ++i){
			for(std::size_t j = 0; j != output_width; ++j){
				std::size_t index = i * output_width + j;
				im_offsets[index] = ((i+k) * image_width + j) * num_channels;
			}
		}
		//call GemmBatched: each batch entry multiplies a num_images x (num_channels * filter_width)
		//image slice with the transposed num_filters x (num_channels * filter_width) filter slice
		using namespace clblast;
		cl_event* event = nullptr;//todo: store events for out-of-order queues
		auto status = GemmBatched<value_type>(
			Layout::kRowMajor, Transpose::kNo, Transpose::kYes,
			num_images, num_filters, num_channels * filter_width,
			alphas.data(),
			storage_images.buffer.get(), im_offsets.data(), storage_images.leading_dimension,
			storage_filter.buffer.get(), filter_offsets.data(), filter_size,
			betas.data(),
			storage_outputs.buffer.get(), outputs_offsets.data(), storage_outputs.leading_dimension,
			num_multiplications,
			&outputs().queue().get(), event
		);
		//~ std::cout<<(int)status<<std::endl;
		assert(status == StatusCode::kSuccess);
		//advance the filter offsets to the next filter row
		if(k < filter_height -1){
			for(auto& offset: filter_offsets){
				offset += num_channels * filter_width;
			}
		}
	}


}

}}

#endif
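
// Usage sketch (not part of the original header): the caller must pass flattened, row-major
// GPU matrices/vectors whose sizes match the REMORA_SIZE_CHECKs above. The container
// spellings below (remora::matrix<float, row_major, gpu_tag>, remora::vector<float, gpu_tag>)
// are assumptions for illustration only.
//
//	std::size_t channels = 3, filters = 8;
//	std::size_t im_h = 32, im_w = 32, f_h = 5, f_w = 5, num_images = 16;
//	std::size_t out_h = im_h - f_h + 1, out_w = im_w - f_w + 1; //no padding
//	remora::matrix<float, remora::row_major, remora::gpu_tag> images(num_images, channels * im_h * im_w);
//	remora::vector<float, remora::gpu_tag> filter(filters * f_h * f_w * channels);
//	remora::matrix<float, remora::row_major, remora::gpu_tag> outputs(num_images, filters * out_h * out_w);
//	remora::bindings::conv2d(
//		images, filter, outputs,
//		channels, filters, im_h, im_w, f_h, f_w, 0, 0
//	);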