conv2d.cpp
Go to the documentation of this file.
1#define SHARK_USE_SIMD
2#include <shark/LinAlg/BLAS/blas.h>
4#include <shark/Core/Timer.h>
5#include <iostream>
6using namespace shark;
7using namespace std;
8
9template<class E1, class E2>
11 blas::matrix_expression<E1, blas::cpu_tag> const& image,
12 blas::matrix_expression<E2, blas::cpu_tag> const& filter,
13 std::size_t num_channels,
14 std::size_t num_filters
15){
16 std::size_t filter_size = filter().size2();
17 std::size_t image_size1 = image().size1()/num_channels;
18 std::size_t image_size2 = image().size2();
19 std::size_t output_size1 = image_size1 - filter_size +1;
20 std::size_t output_size2 = image_size2 - filter_size +1;
21 typedef typename E1::value_type value_type;
22
23 blas::matrix<value_type> out(output_size1 * num_filters, output_size2 ,0.0);
24 double minOptTime = std::numeric_limits<double>::max();
25 for(std::size_t i = 0; i != 20; ++i){
26 Timer time;
27 blas::kernels::conv2d(image,filter,out, num_channels, num_filters);
28 minOptTime = min(minOptTime,time.stop());
29 }
30
31 double mults = output_size1 * output_size2 * filter_size * filter_size * num_filters * num_channels;
32 double flops = mults /1024/1024/minOptTime;
33
34 std::cout<<output_size1<<"\t"<<filter_size<<"\t"<<num_channels<<"\t"<< num_filters<<"\t";
35 std::cout<<"\t"<<flops<< std::endl;
36}
37
38
39int main(int argc, char **argv) {
40 std::cout<<"Flops"<<std::endl;
41 std::size_t num_channels = 8;
42 std::size_t num_outputs = 16;
43 std::cout<<"performance float"<<std::endl;
44 for(std::size_t filterSize = 4; filterSize != 32; filterSize *= 2){
45 for(std::size_t iter = 0; iter != 6; ++iter){
46 std::size_t sizeOut1 = (3+16 * 2<<iter);
47 std::size_t sizeOut2 = (3+16 * 2<<iter);
48 std::size_t sizeIm1 = sizeOut1 + filterSize-1;
49 std::size_t sizeIm2 = sizeOut2 + filterSize-1;
50
51 blas::matrix<float> image(num_channels * sizeIm1 , sizeIm2);
52 blas::matrix<float> filter(num_channels * num_outputs * filterSize, filterSize);
53
54 for(std::size_t i = 0; i != num_channels * sizeIm1; ++i){
55 for(std::size_t j = 0; j != sizeIm2; ++j){
56 image(i,j) = 1.0/(num_channels * sizeOut1)*i + 0.1 - (0.1/sizeOut2)*j;
57 }
58 }
59 for(std::size_t i = 0; i != num_channels * num_outputs * filterSize; ++i){
60 for(std::size_t j = 0; j != filterSize; ++j){
61 filter(i,j) = 1.0/(num_channels * filterSize)*i + 0.1 - (0.1/filterSize)*j;
62 }
63 }
64
65 benchmark(image,filter,num_channels,num_outputs);
66 }
67 }
68 num_outputs = 8;
69 std::cout<<"performance double"<<std::endl;
70 for(std::size_t filterSize = 4; filterSize != 32; filterSize *= 2){
71 for(std::size_t iter = 0; iter != 6; ++iter){
72 std::size_t sizeOut1 = (3+16 * 2<<iter);
73 std::size_t sizeOut2 = (3+16 * 2<<iter);
74 std::size_t sizeIm1 = sizeOut1 + filterSize-1;
75 std::size_t sizeIm2 = sizeOut2 + filterSize-1;
76
77 blas::matrix<double> image(num_channels * sizeIm1 , sizeIm2);
78 blas::matrix<double> filter(num_channels * num_outputs * filterSize, filterSize);
79
80 for(std::size_t i = 0; i != num_channels * sizeIm1; ++i){
81 for(std::size_t j = 0; j != sizeIm2; ++j){
82 image(i,j) = 1.0/(num_channels * sizeOut1)*i + 0.1 - (0.1/sizeOut2)*j;
83 }
84 }
85 for(std::size_t i = 0; i != num_channels * num_outputs * filterSize; ++i){
86 for(std::size_t j = 0; j != filterSize; ++j){
87 filter(i,j) = 1.0/(num_channels * filterSize)*i + 0.1 - (0.1/filterSize)*j;
88 }
89 }
90
91 benchmark(image,filter,num_channels,num_outputs);
92 }
93 }
94}