11    blas::dense_matrix_adaptor<T const, blas::row_major, blas::continuous_dense_tag, blas::gpu_tag> inputs_unreg, 
 
   13    Shape const& patchSize,
 
   14    blas::dense_matrix_adaptor<T, blas::row_major, blas::continuous_dense_tag, blas::gpu_tag> outputs_unreg
 
   17    std::size_t depth = shape[2];
 
   18    std::size_t outputHeight = shape[0]/patchSize[0];
 
   19    std::size_t outputWidth = shape[1]/patchSize[1];
 
   20    std::size_t outputPixels = outputWidth * outputHeight;
 
   21    SIZE_CHECK(inputs_unreg.size2() == shape[0] * shape[1] * depth);
 
   22    SIZE_CHECK(outputs_unreg.size2() == outputPixels * depth);
 
   23    SIZE_CHECK(outputs_unreg.size1() == inputs_unreg.size1());
 
   25    blas::gpu::detail::meta_kernel k(
"shark_max_pooling");
 
   26    std::size_t width_index = k.add_arg<std::size_t>(
"width");
 
   27    std::size_t height_index = k.add_arg<std::size_t>(
"height");
 
   28    std::size_t depth_index = k.add_arg<std::size_t>(
"depth");
 
   29    std::size_t sizeH_index = k.add_arg<std::size_t>(
"sizeH");
 
   30    std::size_t sizeW_index = k.add_arg<std::size_t>(
"sizeW");
 
   31    std::size_t numImages_index = k.add_arg<std::size_t>(
"numImages");
 
   32    auto inputs = k.register_args(to_functor(inputs_unreg));
 
   33    auto outputs = k.register_args(to_functor(outputs_unreg));
 
   35    k << 
"const ulong outputWidth = width / sizeW;\n";
 
   36    k << 
"const ulong outputHeight = height / sizeH;\n";
 
   37    k << 
"const ulong numOutputs = outputWidth * outputHeight;\n";
 
   38    k << 
"const ulong id = get_global_id(0);\n";
 
   39    k << 
"if(id >= numImages * numOutputs) return;\n"; 
 
   41    k << 
"const ulong im = id / numOutputs;\n";
 
   42    k << 
"const ulong p = id % numOutputs;\n";
 
   45    k << 
"const ulong starti = (p / outputWidth) * sizeH;\n";
 
   46    k << 
"const ulong startj = (p % outputWidth) * sizeW;\n";
 
   47    k << 
"const ulong endi = starti + sizeH;\n";
 
   48    k << 
"const ulong endj = startj + sizeW;\n";
 
   49    k << 
"for(ulong c = get_local_id(1); c < depth; c += get_local_size(1)){\n";
 
   50    k << 
"  ulong index = (starti * width + startj) * depth +c;\n";
 
   51    auto im = k.expr<cl_ulong>(
"im");
 
   52    auto index = k.expr<cl_ulong>(
"index");
 
   54    k << 
"  " << k.decl<T>(
"val") <<
" = "<< inputs(im, index) << 
";\n";
 
   55    k << 
"  for(ulong i = starti; i != endi; ++i){\n";
 
   56    k << 
"      for(ulong j = startj; j != endj; ++j){\n";
 
   57    k << 
"          index =  (i * width + j) * depth + c;\n";
 
   58    k << 
"          val = max(val,"<<inputs(im, index)<<
");\n";
 
   61    k << 
"  " << outputs(im, k.expr<cl_ulong>(
"(p * depth + c)"))<<
" = val;\n";
 
   65    boost::compute::kernel kernel = k.compile(outputs_unreg.queue().get_context());
 
   68    kernel.set_arg(height_index, shape[0]);
 
   69    kernel.set_arg(width_index, shape[1]);
 
   70    kernel.set_arg(depth_index, shape[2]);
 
   71    kernel.set_arg(sizeH_index, patchSize[0]);
 
   72    kernel.set_arg(sizeW_index, patchSize[1]);
 
   73    kernel.set_arg(numImages_index, inputs_unreg.size1());
 
   76    std::size_t local_work_size[2] = {8, 4};
 
   78    std::size_t global_work_size[2] = {
 
   79        ((inputs_unreg.size1() * outputPixels + local_work_size[0] - 1) / local_work_size[0]) * local_work_size[0] , 
 
   80        ((depth + local_work_size[1] - 1) / local_work_size[1]) * local_work_size[1] 
 
   82    outputs_unreg.queue().enqueue_nd_range_kernel(kernel, 2, 
nullptr, global_work_size, local_work_size);
 
 
   87    blas::dense_matrix_adaptor<T const, blas::row_major, blas::continuous_dense_tag, blas::gpu_tag> inputs_unreg, 
 
   88    blas::dense_matrix_adaptor<T const, blas::row_major, blas::continuous_dense_tag, blas::gpu_tag> coefficients_unreg, 
 
   90    Shape const& patchSize,
 
   91    blas::dense_matrix_adaptor<T, blas::row_major, blas::continuous_dense_tag, blas::gpu_tag> derivatives_unreg
 
   93    derivatives_unreg.clear();
 
   94    std::size_t depth = shape[2];
 
   95    std::size_t outputHeight = shape[0]/patchSize[0];
 
   96    std::size_t outputWidth = shape[1]/patchSize[1];
 
   97    std::size_t outputPixels = outputWidth * outputHeight;
 
   98    SIZE_CHECK(derivatives_unreg.size2() == shape[0] * shape[1] * depth);
 
   99    SIZE_CHECK(inputs_unreg.size2() == shape[0] * shape[1] * depth);
 
  100    SIZE_CHECK(coefficients_unreg.size2() == outputPixels * depth);
 
  101    SIZE_CHECK(derivatives_unreg.size1() == inputs_unreg.size1());
 
  102    SIZE_CHECK(derivatives_unreg.size1() == coefficients_unreg.size1());
 
  104    blas::gpu::detail::meta_kernel k(
"shark_max_pooling_derivative");
 
  105    std::size_t width_index = k.add_arg<std::size_t>(
"width");
 
  106    std::size_t height_index = k.add_arg<std::size_t>(
"height");
 
  107    std::size_t depth_index = k.add_arg<std::size_t>(
"depth");
 
  108    std::size_t sizeH_index = k.add_arg<std::size_t>(
"sizeH");
 
  109    std::size_t sizeW_index = k.add_arg<std::size_t>(
"sizeW");
 
  110    std::size_t numImages_index = k.add_arg<std::size_t>(
"numImages");
 
  111    auto inputs = k.register_args(to_functor(inputs_unreg));
 
  112    auto coefficients = k.register_args(to_functor(coefficients_unreg));
 
  113    auto derivatives = k.register_args(to_functor(derivatives_unreg));
 
  115    k << 
"const ulong outputWidth = width / sizeW;\n";
 
  116    k << 
"const ulong outputHeight = height / sizeH;\n";
 
  117    k << 
"const ulong numOutputs = outputWidth * outputHeight;\n";
 
  118    k << 
"const ulong id = get_global_id(0);\n";
 
  119    k << 
"if(id >= numImages * numOutputs) return;\n"; 
 
  121    k << 
"const ulong im = id / numOutputs;\n";
 
  122    k << 
"const ulong p = id % numOutputs;\n";
 
  125    k << 
"const ulong starti = (p / outputWidth) * sizeH;\n";
 
  126    k << 
"const ulong startj = (p % outputWidth) * sizeW;\n";
 
  127    k << 
"const ulong endi = starti + sizeH;\n";
 
  128    k << 
"const ulong endj = startj + sizeW;\n";
 
  129    k << 
"for(ulong c = get_local_id(1); c < depth; c += get_local_size(1)){\n";
 
  130    k << 
"  ulong index = (starti * width + startj) * depth +c;\n";
 
  131    auto im = k.expr<cl_ulong>(
"im");
 
  132    auto index = k.expr<cl_ulong>(
"index");
 
  134    k << 
"  " << k.decl<T>(
"maxVal")<<
" = " <<inputs(im, index) << 
";\n";
 
  135    k << 
"  ulong maxIndex = index;\n";
 
  136    k << 
"  for(ulong i = starti; i != endi; ++i){\n";
 
  137    k << 
"      for(ulong j = startj; j != endj; ++j){\n";
 
  138    k << 
"          index =  (i * width + j) * depth + c;\n";
 
  139    k << 
"          if("<<inputs(im, index)<<
" > maxVal){\n";
 
  140    k << 
"              maxVal = "<<inputs(im, index)<<
";\n";
 
  141    k << 
"              maxIndex = index;\n";
 
  145    k << 
"  " << derivatives(im, k.expr<cl_ulong>(
"maxIndex")) << 
" = " << coefficients(im,k.expr<cl_ulong>(
"(p * depth +c)")) << 
";\n";
 
  149    boost::compute::kernel kernel = k.compile(derivatives_unreg.queue().get_context());
 
  152    kernel.set_arg(height_index, shape[0]);
 
  153    kernel.set_arg(width_index, shape[1]);
 
  154    kernel.set_arg(depth_index, shape[2]);
 
  155    kernel.set_arg(sizeH_index, patchSize[0]);
 
  156    kernel.set_arg(sizeW_index, patchSize[1]);
 
  157    kernel.set_arg(numImages_index, inputs_unreg.size1());
 
  160    std::size_t local_work_size[2] = {8, 4};
 
  162    std::size_t global_work_size[2] = {
 
  163        ((inputs_unreg.size1() * outputPixels + local_work_size[0] - 1) / local_work_size[0]) * local_work_size[0] , 
 
  164        ((depth + local_work_size[1] - 1) / local_work_size[1]) * local_work_size[1]
 
  166    derivatives_unreg.queue().enqueue_nd_range_kernel(kernel, 2, 
nullptr, global_work_size, local_work_size);