31#ifndef REMORA_KERNELS_DEFAULT_Conv2D_HPP
32#define REMORA_KERNELS_DEFAULT_Conv2D_HPP
37namespace remora{
namespace bindings {
44template<
class E1,
class E2>
46 matrix_expression<E1, cpu_tag>
const& images,
47 matrix_expression<E2, cpu_tag>& output,
48 std::size_t num_channels,
49 std::size_t image_height,
50 std::size_t image_width,
51 std::size_t filter_height,
52 std::size_t filter_width
54 static_assert(std::is_same<typename E1::orientation, row_major>::value,
"Column major not implemented");
55 static_assert(std::is_same<typename E2::orientation, row_major>::value,
"Column major not implemented");
56 std::size_t rows_per_image = (image_height - filter_height +1) * (image_width - filter_width +1);
58 for(std::size_t im = 0; im != images().size1(); ++im){
59 for(std::size_t i = 0; i != image_height - filter_height +1; ++i){
60 for(std::size_t i1 = 0; i1 != filter_height; ++i1){
61 for(std::size_t j = 0; j != image_width - filter_width +1; ++j){
62 std::size_t row_start = im * rows_per_image + i * (image_width - filter_width +1) + j;
63 for(std::size_t j1 = 0; j1 != filter_width; ++j1){
64 std::size_t col_start = (i1 * filter_width + j1) * num_channels;
65 std::size_t image_start = ((i+i1) * image_width + j+j1) * num_channels;
66 for(std::size_t c = 0; c != num_channels; ++c){
67 output()(row_start, col_start + c) = images()(im,image_start + c);
76template<
class E1,
class E2>
78 matrix_expression<E1, cpu_tag>
const& images,
79 matrix_expression<E2, cpu_tag>& output,
80 std::size_t num_channels,
81 std::size_t image_height,
82 std::size_t image_width,
83 std::size_t filter_height,
84 std::size_t filter_width,
85 std::size_t padding_height,
86 std::size_t padding_width
88 static_assert(std::is_same<typename E1::orientation, row_major>::value,
"Column major not implemented");
89 static_assert(std::is_same<typename E2::orientation, row_major>::value,
"Column major not implemented");
90 std::size_t image_start1 = padding_height/2;
91 std::size_t image_end1 = image_height + image_start1;
92 std::size_t image_start2 = padding_width/2;
93 std::size_t image_end2 = image_width + image_start2;
94 std::size_t output_width = image_width - filter_width + 1 + padding_width;
95 std::size_t output_height = image_height - filter_height + 1 + padding_height;
96 std::size_t rows_per_image =output_width * output_height;
98 for(std::size_t im = 0; im != images().size1(); ++im){
99 for(std::size_t i = 0; i != output_height; ++i){
100 for(std::size_t i1 = 0; i1 != filter_height; ++i1){
101 if(i1+i < image_start1 || i1+i >= image_end1){
102 for(std::size_t j = 0; j != output_width; ++j){
103 std::size_t row_start = im * rows_per_image + i * output_width +j;
104 std::size_t col_start = i1 * filter_width * num_channels;
105 for(std::size_t c = 0; c != num_channels * filter_width; ++c){
106 output()(row_start, col_start + c) = 0;
111 for(std::size_t j = 0; j != output_width; ++j){
112 std::size_t row_start = im * rows_per_image + i * output_width + j;
113 for(std::size_t j1 = 0; j1 != filter_width; ++j1){
114 std::size_t col_start = (i1 * filter_width + j1) * num_channels;
116 if(j+j1 < image_start2 || j+j1 >= image_end2){
117 for(std::size_t c = 0; c != num_channels; ++c){
118 output()(row_start, col_start + c) = 0;
121 std::size_t image_start = ((i+i1-image_start1) * image_width + j+j1-image_start2) * num_channels;
122 for(std::size_t c = 0; c != num_channels; ++c){
123 output()(row_start, col_start + c) = images()(im,image_start + c);
134template<
class E1,
class E2,
class M>
136 matrix_expression<E1, cpu_tag>
const& images,
137 vector_expression<E2, cpu_tag>
const& filter,
138 matrix_expression<M, cpu_tag>& outputs,
139 std::size_t num_channels,
140 std::size_t num_filters,
141 std::size_t image_height,
142 std::size_t image_width,
143 std::size_t filter_height,
144 std::size_t filter_width,
145 std::size_t padding_height,
146 std::size_t padding_width
148 static_assert(std::is_same<typename E1::orientation, row_major>::value,
"Column major not implemented");
149 static_assert(std::is_same<typename E1::storage_type::storage_tag, continuous_dense_tag>::value,
"Subranges not implemented");
150 static_assert(std::is_same<typename M::orientation, row_major>::value,
"Column major not implemented");
151 typedef typename std::common_type<
152 typename E1::value_type,
typename E2::value_type,
typename M::value_type
155 std::size_t output_rows_per_filter = (image_height - filter_height +1 + padding_height) * (image_width - filter_width +1 + padding_width);
156 std::size_t filter_size = filter_width * filter_height * num_channels;
157 std::size_t num_images = images().size1();
159 REMORA_SIZE_CHECK(outputs().size1() == images().size1());
160 REMORA_SIZE_CHECK(outputs().size2() == num_filters * output_rows_per_filter);
161 REMORA_SIZE_CHECK(images().size2() == num_channels * image_width * image_height);
162 REMORA_SIZE_CHECK(filter().size() == num_filters * filter_size);
165 boost::alignment::aligned_allocator<value_type,64> allocator;
166 value_type* image_storage = allocator.allocate( num_images * output_rows_per_filter * filter_size);
167 value_type* filter_storage = allocator.allocate(num_filters * filter_size);
168 dense_matrix_adaptor<value_type, row_major, cpu_tag> image_transformed(image_storage,num_images * output_rows_per_filter, filter_size);
169 dense_matrix_adaptor<value_type, row_major, cpu_tag> filter_transformed(filter_storage, num_filters, filter_size);
170 dense_matrix_adaptor<value_type, row_major, cpu_tag> output_transformed(outputs().raw_storage().values, num_images * output_rows_per_filter, num_filters);
172 if(padding_height == 0 && padding_width == 0){
173 im2mat(images,image_transformed, num_channels, image_height, image_width, filter_height, filter_width);
175 im2mat_pad(images,image_transformed, num_channels, image_height, image_width, filter_height, filter_width, padding_height, padding_width);
178 for(std::size_t f = 0; f != num_filters; ++f){
179 for(std::size_t i = 0; i != filter_size; ++i){
180 filter_transformed(f,i) = filter()(f * filter_size + i);
185 kernels::gemm(image_transformed, trans(filter_transformed), output_transformed, value_type(1.0));
188 allocator.deallocate(image_storage,num_images * output_rows_per_filter * filter_size);
189 allocator.deallocate(filter_storage, num_filters * filter_size);