33#ifndef REMORA_GPU_TRAITS_HPP
34#define REMORA_GPU_TRAITS_HPP
36#include <boost/compute/command_queue.hpp>
37#include <boost/compute/core.hpp>
38#include <boost/compute/container/vector.hpp>
39#include <boost/compute/functional/operator.hpp>
40#include <boost/compute/functional.hpp>
42namespace remora{
namespace gpu{
/// \brief Describes where a dense vector lives inside a boost::compute buffer.
///
/// Built from a buffer, an offset and a stride. sub_region() yields a
/// descriptor over the same buffer that starts further in.
// NOTE(review): this listing seems to have dropped lines (the declarations of
// the `offset` and `stride` members used below, and the closing braces) —
// confirm against the real header.
44template<
class T,
class Tag>
45struct dense_vector_storage{
46 typedef Tag storage_tag;
48 boost::compute::buffer buffer;
// Default constructor: gives no member an explicit initial value.
52 dense_vector_storage(){}
// Stores the three arguments unchanged.
53 dense_vector_storage(boost::compute::buffer
const& buffer, std::size_t offset, std::size_t stride)
54 :buffer(buffer), offset(offset), stride(stride){}
// Converting constructor from storage with another value type / tag.
// Compile-time checks: U& must convert to T&, and plain dense storage must
// not be turned into continuous dense storage.
55 template<
class U,
class Tag2>
56 dense_vector_storage(dense_vector_storage<U, Tag2>
const& storage):
57 buffer(storage.buffer), offset(storage.offset), stride(storage.stride){
58 static_assert(std::is_convertible<U&, T&>::value,
"incompatible storage");
59 static_assert(!(std::is_same<Tag,continuous_dense_tag>::value && std::is_same<Tag2,dense_tag>::value),
"Trying to assign dense to continuous dense storage");
// Same buffer and stride; the start moves forward by `offset` strides.
62 dense_vector_storage<T,Tag> sub_region(std::size_t offset)
const{
63 return {buffer, this->offset+offset * stride, stride};
/// \brief Describes where a dense matrix lives inside a boost::compute buffer.
///
/// Holds a buffer, an offset and a leading dimension, and derives descriptors
/// for sub-regions, single rows, row ranges and a linear view.
// NOTE(review): lines are missing from this listing (e.g. the `offset` member
// declaration, the template header introducing `O`, closing braces and the
// signature of the function returning at original line 120) — confirm against
// the real header.
67template<
class T,
class Tag>
68struct dense_matrix_storage{
69 typedef Tag storage_tag;
// Vector storage type for a row: keeps Tag when O is row_major, otherwise
// falls back to dense_tag.
71 struct row_storage:
public std::conditional<
72 std::is_same<O,row_major>::value,
73 dense_vector_storage<T, Tag>,
74 dense_vector_storage<T, dense_tag>
// Matrix storage type for a range of rows, chosen by the same rule.
77 struct rows_storage:
public std::conditional<
78 std::is_same<O,row_major>::value,
79 dense_matrix_storage<T, Tag>,
80 dense_matrix_storage<T, dense_tag>
83 typedef dense_vector_storage<T,Tag> diag_storage;
84 typedef dense_matrix_storage<T,dense_tag> sub_region_storage;
86 boost::compute::buffer buffer;
88 std::size_t leading_dimension;
// Default constructor: gives no member an explicit initial value.
90 dense_matrix_storage(){}
// Stores the three arguments unchanged.
91 dense_matrix_storage(boost::compute::buffer
const& buffer, std::size_t offset, std::size_t leading_dimension)
92 :buffer(buffer), offset(offset), leading_dimension(leading_dimension){}
// Converting constructor from storage with another value type / tag.
// Compile-time checks: U& must convert to T&, and plain dense storage must
// not be turned into continuous dense storage.
94 template<
class U,
class Tag2>
95 dense_matrix_storage(dense_matrix_storage<U, Tag2>
const& storage):
96 buffer(storage.buffer), offset(storage.offset), leading_dimension(storage.leading_dimension){
97 static_assert(std::is_convertible<U&, T&>::value,
"incompatible storage");
98 static_assert(!(std::is_same<Tag,continuous_dense_tag>::value && std::is_same<Tag2,dense_tag>::value),
"Trying to assign dense to continuous dense storage");
// Sub-matrix starting at (offset1, offset2). Orientation maps the pair onto
// a major and a minor index; the leading dimension is kept.
101 template<
class Orientation>
102 sub_region_storage sub_region(std::size_t offset1, std::size_t offset2, Orientation)
const{
103 std::size_t offset_major = Orientation::index_M(offset1,offset2);
104 std::size_t offset_minor = Orientation::index_m(offset1,offset2);
105 return {buffer, offset + offset_major*leading_dimension+offset_minor, leading_dimension};
// Row i as vector storage. Orientation picks, out of (leading_dimension, 1),
// the step between rows (index_M) and the stride inside a row (index_m).
108 template<
class Orientation>
109 typename row_storage<Orientation>::type row(std::size_t i, Orientation)
const{
110 return {buffer, offset + i * Orientation::index_M(leading_dimension,std::size_t(1)), Orientation::index_m(leading_dimension,std::size_t(1))};
// Storage for the rows starting at row i; leading dimension is unchanged.
113 template<
class Orientation>
114 typename rows_storage<Orientation>::type sub_rows(std::size_t i, Orientation)
const{
115 std::size_t stride = Orientation::index_M(leading_dimension,(std::size_t)1);
116 return {buffer,offset + i * stride, leading_dimension};
// Storage starting at `offset` with step leading_dimension+1.
// NOTE(review): the signature is missing here; presumably the diagonal
// accessor returning diag_storage — confirm.
120 return {buffer, offset, leading_dimension+1};
// Views the same memory as a vector with stride 1 starting at `offset`.
123 dense_vector_storage<T, continuous_dense_tag> linear()
const{
124 return {buffer, offset, 1};
/// \brief Expression object for a constant with result type T and stored value type Stored.
// The stream operator for this type in this file writes its `m_value` member.
// NOTE(review): the struct body is not present in this listing.
134template<
class T,
class Stored = T>
135struct invoked_constant{
/// \brief Expression object for "argument Op scalar", where Op is a single character.
// The stream operator for this type in this file reads `arg1` and `m_scalar`.
// NOTE(review): the member declarations are not present in this listing.
139template<
class Arg1,
class T,
char Op,
class Stored>
140struct invoked_operator_scalar{
141 typedef T result_type;
/// \brief Expression object with result type T; by its name, adds a scalar to an argument.
// NOTE(review): neither the members nor a stream operator for this type are
// visible in this part of the file — confirm how (and whether) it is used.
146template<
class Arg1,
class T,
class Stored = T>
147struct invoked_add_scalar{
148 typedef T result_type;
/// \brief Expression object for "arg1 + scalar * arg2".
// The stream operator for this type in this file reads `arg1`, `m_scalar`
// and `arg2`.
// NOTE(review): the member declarations are not present in this listing.
153template<
class Arg1,
class Arg2,
class T,
class Stored=T>
154struct invoked_multiply_and_add{
155 typedef T result_type;
/// \brief Expression object for the soft-plus of an argument.
// The stream operator for this type in this file reads `arg1`.
// NOTE(review): the member declarations are not present in this listing.
161template<
class Arg1,
class T>
162struct invoked_soft_plus{
163 typedef T result_type;
/// \brief Expression object for the logistic sigmoid of an argument.
// The stream operator for this type in this file reads `arg1`.
// NOTE(review): the member declarations are not present in this listing.
166template<
class Arg1,
class T>
167struct invoked_sigmoid{
168 typedef T result_type;
/// \brief Expression object with one argument and result type T.
// NOTE(review): the `struct` line naming this type is missing from the
// listing; judging by position and by the stream operators further down it is
// presumably invoked_sqr — confirm against the real header.
172template<
class Arg1,
class T>
174 typedef T result_type;
/// \brief Expression object with one argument and result type T.
// NOTE(review): the `struct` line naming this type is missing from the
// listing; judging by position and by the stream operators further down it is
// presumably invoked_inv — confirm against the real header.
178template<
class Arg1,
class T>
180 typedef T result_type;
/// \brief Expression object for a division that yields a default value when the divisor is zero.
// The stream operator for this type in this file reads `arg1`, `arg2` and
// `default_value`.
// NOTE(review): the member declarations are not present in this listing.
184template<
class Arg1,
class Arg2,
class T,
class S>
185struct invoked_safe_div{
186 typedef T result_type;
/// \brief Writes an invoked_operator_scalar into the kernel source.
///
/// Output has the form `(<arg1><Op><scalar>)`.
/// \return the kernel that was written to
193template<
class Arg1,
class T,
char Op,
class S>
194boost::compute::detail::meta_kernel&
operator<<(boost::compute::detail::meta_kernel& k, invoked_operator_scalar<Arg1,T, Op, S>
const& e){
195 return k <<
'('<<e.arg1 << Op << e.m_scalar<<
')';
/// \brief Writes an invoked_multiply_and_add into the kernel source.
///
/// Output has the form `(<arg1>+<scalar>*<arg2>)`.
/// \return the kernel that was written to
197template<
class Arg1,
class Arg2,
class T,
class S>
198boost::compute::detail::meta_kernel&
operator<<(boost::compute::detail::meta_kernel& k, invoked_multiply_and_add<Arg1,Arg2,T, S>
const& e){
199 return k <<
'('<<e.arg1<<
'+'<<e.m_scalar <<
'*'<< e.arg2<<
')';
/// \brief Writes an invoked_soft_plus into the kernel source.
///
/// Output has the form `(log(1+exp(<arg1>)))`.
// NOTE(review): this direct formula presumably overflows in exp() for large
// arguments — confirm whether a numerically safer form is wanted.
/// \return the kernel that was written to
201template<
class Arg1,
class T>
202boost::compute::detail::meta_kernel&
operator<<(boost::compute::detail::meta_kernel& k, invoked_soft_plus<Arg1,T>
const& e){
203 return k <<
"(log(1+exp("<< e.arg1<<
")))";
/// \brief Writes an invoked_sigmoid into the kernel source.
///
/// Output has the form `(1/(1+exp(-<arg1>)))`; the minus sign is placed
/// directly in front of the streamed argument.
/// \return the kernel that was written to
205template<
class Arg1,
class T>
206boost::compute::detail::meta_kernel&
operator<<(boost::compute::detail::meta_kernel& k, invoked_sigmoid<Arg1,T>
const& e){
207 return k <<
"(1/(1+exp(-"<< e.arg1<<
")))";
/// \brief Writes an invoked_sqr into the kernel source.
///
/// Output has the form `(<arg1>*<arg1>)`; the argument expression is
/// streamed twice.
/// \return the kernel that was written to
209template<
class Arg1,
class T>
210boost::compute::detail::meta_kernel&
operator<<(boost::compute::detail::meta_kernel& k, invoked_sqr<Arg1,T>
const& e){
211 return k <<
'('<<e.arg1<<
'*'<<e.arg1<<
')';
213template<
class Arg1,
class T>
214boost::compute::detail::meta_kernel&
operator<<(boost::compute::detail::meta_kernel& k, invoked_inv<Arg1,T>
const& e){
215 return k <<
"1/("<<e.arg1<<
')';
/// \brief Writes an invoked_constant into the kernel source.
///
/// Streams the stored `m_value` as-is, with no surrounding parentheses.
/// \return the kernel that was written to
218template<
class T,
class S>
219boost::compute::detail::meta_kernel&
operator<<(boost::compute::detail::meta_kernel& k, invoked_constant<T, S>
const& e){
220 return k << e.m_value;
/// \brief Writes an invoked_safe_div into the kernel source.
///
/// Output has the form `((<arg2>!=0)?<arg1>/<arg2>:<default_value>)`.
/// The divisor expression is streamed twice.
// NOTE(review): the operands are inserted without parentheses of their own,
// so this relies on each operand already printing as a self-contained
// expression — confirm for every operand type that can reach here.
/// \return the kernel that was written to
224template<
class Arg1,
class Arg2,
class T,
class S>
225boost::compute::detail::meta_kernel&
operator<<(boost::compute::detail::meta_kernel& k, invoked_safe_div<Arg1,Arg2,T, S>
const& e){
226 return k <<
"(("<<e.arg2<<
"!=0)?"<<e.arg1<<
'/'<<e.arg2<<
':'<<e.default_value<<
')';
/// \brief Specialisation of device_traits for gpu_tag.
// NOTE(review): the `template<>` line that would normally precede an explicit
// specialisation is not present in this listing.
233struct device_traits<gpu_tag>{
// Queue type used by this backend: a boost::compute command queue.
234 typedef boost::compute::command_queue queue_type;
// Returns the queue reported by boost::compute::system::default_queue().
236 static queue_type& default_queue(){
237 return boost::compute::system::default_queue();
// Iterator adaptor slot for a unary transform; for this backend its `type`
// is the placeholder no_iterator.
242 template <
class Iterator,
class Functor>
243 struct transform_iterator{
244 typedef no_iterator type;
// Iterator adaptor slot for a binary transform; for this backend its `type`
// is the placeholder no_iterator.
247 template <
class Iterator1,
class Iterator2,
class Functor>
248 struct binary_transform_iterator{
249 typedef no_iterator type;
// Constant-iterator slot; for this backend its `type` is no_iterator.
// NOTE(review): a template header, if there is one, is not shown in this listing.
253 struct constant_iterator{
254 typedef no_iterator type;
// One-hot-iterator slot; for this backend its `type` is no_iterator.
// NOTE(review): a template header, if there is one, is not shown in this listing.
258 struct one_hot_iterator{
259 typedef no_iterator type;
// Indexed-iterator slot parameterised on a closure; for this backend its
// `type` is no_iterator.
262 template<
class Closure>
263 struct indexed_iterator{
264 typedef no_iterator type;
// Functor composition g(f(...)): F is applied first, G to its result.
// result_type is taken from G.
// NOTE(review): the `struct compose{` line, the template header of the unary
// call operator, its body and the member declarations are missing from this
// listing.
270 template<
class F,
class G>
272 typedef typename G::result_type result_type;
// Stores copies of both functors.
273 compose(F
const& f, G
const& g): m_f(f), m_g(g){ }
// Unary call; the declared return type is that of g(f(x)).
276 auto operator()( Arg1
const& x)
const ->
decltype(std::declval<G const&>()(std::declval<F const&>()(x))){
// Binary call: f receives both arguments, g receives f's result.
279 template<
class Arg1,
class Arg2>
280 auto operator()( Arg1
const& x, Arg2
const& y)
const ->
decltype(std::declval<G const&>()(std::declval<F const&>()(x,y))){
281 return m_g(m_f(x,y));
// Combines two functors through a binary one: g(f1(...), f2(...)), where f1
// and f2 receive the same argument(s). result_type is taken from G.
// NOTE(review): the template header of the unary call operator and the member
// declarations are missing from this listing.
289 template<
class F1,
class F2,
class G>
290 struct compose_binary{
291 typedef typename G::result_type result_type;
// Stores copies of the three functors.
292 compose_binary(F1
const& f1, F2
const& f2, G
const& g): m_f1(f1), m_f2(f2), m_g(g){ }
// Unary call: g(f1(x), f2(x)).
295 auto operator()( Arg1
const& x)
const ->
decltype(std::declval<G const&>()(std::declval<F1 const&>()(x),std::declval<F2 const&>()(x))){
296 return m_g(m_f1(x), m_f2(x));
// Binary call: g(f1(x,y), f2(x,y)).
298 template<
class Arg1,
class Arg2>
299 auto operator()( Arg1
const& x, Arg2
const& y)
const ->
decltype(std::declval<G const&>()(std::declval<F1 const&>()(x,y),std::declval<F2 const&>()(x,y))){
300 return m_g(m_f1(x,y), m_f2(x,y));
// Applies f1 to the first argument and f2 to the second, then combines the
// two results with g: g(f1(x), f2(y)). result_type is taken from G.
// NOTE(review): the member declarations are missing from this listing.
310 template<
class F1,
class F2,
class G>
311 struct transform_arguments{
312 typedef typename G::result_type result_type;
// Stores copies of the three functors.
313 transform_arguments(F1
const& f1, F2
const& f2, G
const& g): m_f1(f1), m_f2(f2), m_g(g){ }
315 template<
class Arg1,
class Arg2>
316 auto operator()( Arg1
const& x, Arg2
const& y)
const ->
decltype(std::declval<G const&>()(std::declval<F1 const&>()(x),std::declval<F2 const&>()(y))){
317 return m_g(m_f1(x),m_f2(y));
// Fixes the second argument of a binary functor: calling with arg1 evaluates
// f(arg1, stored_arg2). result_type is taken from F.
// NOTE(review): the `struct bind_second{` line, the call operator's template
// header and opening brace, and the member declarations are missing from this
// listing.
325 template<
class F,
class Arg2>
327 typedef typename F::result_type result_type;
// Stores copies of the functor and the bound argument.
328 bind_second(F
const& f, Arg2
const& arg2) : m_function(f), m_arg2(arg2){ }
331 auto operator()(Arg1
const& arg1)
const ->
decltype(std::declval<F const&>()(arg1,std::declval<Arg2 const&>()))
333 return m_function(arg1, m_arg2);
// Factory that deduces F and G and returns compose<F,G>(f, g).
342 template<
class F,
class G>
343 static compose<F,G> make_compose(F
const& f, G
const&g){
344 return compose<F,G>(f,g);
// Factory that deduces the functor types and returns
// compose_binary<F1,F2,G>(f1, f2, g).
347 template<
class F1,
class F2,
class G>
348 static compose_binary<F1, F2, G> make_compose_binary(F1
const& f1, F2
const& f2, G
const&g){
349 return compose_binary<F1, F2, G>(f1, f2, g);
// Factory that deduces the functor types and returns
// transform_arguments<F1,F2,G>(f1, f2, g).
352 template<
class F1,
class F2,
class G>
353 static transform_arguments<F1, F2, G> make_transform_arguments(F1
const& f1, F2
const& f2, G
const& g){
354 return transform_arguments<F1, F2, G>(f1, f2, g);
// Factory that deduces F and Arg2 and returns bind_second<F,Arg2>(f, arg2).
357 template<
class F,
class Arg2>
358 static bind_second<F,Arg2> make_bind_second(F
const& f, Arg2
const& arg2){
359 return bind_second<F,Arg2>(f,arg2);
// Binary arithmetic functors, each an alias of the matching boost::compute
// functor.
// NOTE(review): the `template<class T>` headers that these alias declarations
// need are not present in this listing.
367 using add = boost::compute::plus<T>;
369 using subtract = boost::compute::minus<T>;
371 using multiply = boost::compute::multiplies<T>;
373 using divide = boost::compute::divides<T>;
375 using modulo = boost::compute::modulus<T>;
377 using pow = boost::compute::pow<T>;
// Division functor with a fallback: invoking it builds an invoked_safe_div
// from (x, y, default_value). The stream operator for that type in this file
// emits a conditional that yields the default when the divisor equals 0.
// NOTE(review): the `struct safe_divide{` line, the opening brace of the call
// operator and the `default_value` member declaration are missing from this
// listing.
378 template<
class T,
class S=T>
380 typedef T result_type;
// Stores the value to use when the divisor is zero.
381 safe_divide(
S const& default_value) : default_value(default_value) { }
383 template<
class Arg1,
class Arg2>
384 gpu::detail::invoked_safe_div<Arg1,Arg2, T,S> operator()(
const Arg1 &x,
const Arg2& y)
const
386 return {x,y,default_value};
// Functor for x + scalar * y: invoking it builds an invoked_multiply_and_add
// from (x, y, m_scalar).
// NOTE(review): the opening brace of the call operator and the `m_scalar`
// member declaration are missing from this listing.
390 template<
class T,
class S= T>
391 struct multiply_and_add{
392 typedef T result_type;
// Stores the scalar factor.
393 multiply_and_add(
S const& scalar) :m_scalar(scalar) { }
395 template<
class Arg1,
class Arg2>
396 gpu::detail::invoked_multiply_and_add<Arg1,Arg2,T,S> operator()(
const Arg1 &x,
const Arg2& y)
const
398 return {x,y, m_scalar};
// Functor for "x Op scalar", with Op given as a character template
// parameter: invoking it builds an invoked_operator_scalar from (x, m_scalar).
// NOTE(review): the call operator's template header and opening brace and the
// `m_scalar` member declaration are missing from this listing.
404 template<
class T,
char Op,
class S>
405 struct operator_scalar{
406 typedef T result_type;
// Stores the scalar operand.
407 operator_scalar(
S const& scalar) : m_scalar(scalar) { }
410 gpu::detail::invoked_operator_scalar<Arg1,T, Op, S> operator()(Arg1
const& x)
const
412 return {x, m_scalar};
// Scalar-operand functors: operator_scalar instantiated with '*', '+', '/'
// and '%', storing the scalar as T.
// NOTE(review): the `template<class T>` headers that these alias declarations
// need are not present in this listing.
418 using multiply_scalar = operator_scalar<T,
'*', T>;
420 using add_scalar = operator_scalar<T,
'+', T>;
422 using divide_scalar = operator_scalar<T,
'/', T>;
424 using modulo_scalar = operator_scalar<T,
'%', T>;
// Binary functor that ignores its first argument and yields "y * scalar":
// invoking it builds an invoked_operator_scalar<Arg2, T, '*', S> from
// (y, m_scalar).
// NOTE(review): the opening brace of the call operator and the `m_scalar`
// member declaration are missing from this listing.
426 template<
class T,
class S=T>
427 struct multiply_assign{
428 typedef T result_type;
// Stores the scalar factor.
429 multiply_assign(
S const& scalar): m_scalar(scalar) { }
431 template<
class Arg1,
class Arg2>
432 gpu::detail::invoked_operator_scalar<Arg2,T,
'*',
S> operator()(
const Arg1&,
const Arg2& y)
const
434 return {y, m_scalar};
// Unary functor whose call operator takes an argument by const reference and
// returns a const reference of the same type.
// NOTE(review): the struct header (and so its name), the template header of
// the call operator and its body are missing from this listing; presumably it
// hands the argument back unchanged — confirm.
440 typedef T result_type;
443 Arg
const& operator()(Arg
const& arg)
const{
// Binary functor whose call operator names only its first argument and
// returns a const reference of the first argument's type.
// NOTE(review): the struct header (and so its name) and the body are missing
// from this listing; presumably it returns arg1 — confirm.
450 typedef T result_type;
452 template<
class Arg1,
class Arg2>
453 Arg1
const& operator()(Arg1
const& arg1, Arg2
const&)
const{
// Binary functor whose call operator names only its second argument and
// returns a const reference of the second argument's type.
// NOTE(review): the struct header (and so its name) and the body are missing
// from this listing; presumably it returns arg2 — confirm.
459 typedef T result_type;
461 template<
class Arg1,
class Arg2>
462 Arg2
const& operator()(Arg1
const&, Arg2
const& arg2)
const{
// Functor whose unary and binary call operators both ignore their arguments
// and return an invoked_constant<T,S>.
// NOTE(review): the `struct constant{` line, the unary operator's template
// header, both operator bodies and the `m_value` member declaration are
// missing from this listing; presumably the result wraps m_value — confirm.
467 template<
class T,
class S=T>
469 typedef T result_type;
// Stores the constant.
470 constant(
S const& value): m_value(value){}
473 gpu::detail::invoked_constant<T,S> operator()(Arg
const&)
const
477 template<
class Arg1,
class Arg2>
478 gpu::detail::invoked_constant<T,S> operator()(Arg1
const&, Arg2
const&)
const
// Unary math functors, each an alias of the boost::compute functor of the
// same name, except `abs`, which maps to boost::compute::fabs.
// NOTE(review): the `template<class T>` headers that these alias declarations
// need are not present in this listing. `abs` using fabs presumably limits it
// to floating-point T — confirm.
489 using log = boost::compute::log<T>;
491 using exp = boost::compute::exp<T>;
493 using sin = boost::compute::sin<T>;
495 using cos = boost::compute::cos<T>;
497 using tan = boost::compute::tan<T>;
499 using asin = boost::compute::asin<T>;
501 using acos = boost::compute::acos<T>;
503 using atan = boost::compute::atan<T>;
505 using tanh = boost::compute::tanh<T>;
507 using sqrt = boost::compute::sqrt<T>;
509 using cbrt = boost::compute::cbrt<T>;
511 using abs = boost::compute::fabs<T>;
514 using erf = boost::compute::erf<T>;
516 using erfc = boost::compute::erfc<T>;
// Unary functor whose call operator returns an invoked_sqr<Arg1,T>.
// NOTE(review): the struct header (and so its name), the call operator's
// template header and its body are missing from this listing.
520 typedef T result_type;
523 gpu::detail::invoked_sqr<Arg1,T> operator()(
const Arg1 &x)
const{
// Unary functor whose call operator returns an invoked_soft_plus<Arg1,T>.
// NOTE(review): the struct header (and so its name), the call operator's
// template header and its body are missing from this listing.
529 typedef T result_type;
532 gpu::detail::invoked_soft_plus<Arg1,T> operator()(
const Arg1 &x)
const{
// Unary functor whose call operator returns an invoked_sigmoid<Arg1,T>.
// NOTE(review): the struct header (and so its name), the call operator's
// template header and its body are missing from this listing.
538 typedef T result_type;
541 gpu::detail::invoked_sigmoid<Arg1,T> operator()(
const Arg1 &x)
const{
// Unary functor whose call operator returns an invoked_inv<Arg1,T>.
// NOTE(review): the struct header (and so its name), the call operator's
// template header and its body are missing from this listing.
547 typedef T result_type;
550 gpu::detail::invoked_inv<Arg1,T> operator()(
const Arg1 &x)
const{
// Minimum / maximum map to boost::compute::fmin / fmax; the comparison
// functors map to the boost::compute functors of the corresponding name.
// NOTE(review): the `template<class T>` headers that these alias declarations
// need are not present in this listing. fmin/fmax presumably limit min/max to
// floating-point T — confirm.
557 using min = boost::compute::fmin<T>;
559 using max = boost::compute::fmax<T>;
563 using less = boost::compute::less<T>;
565 using less_equal = boost::compute::less_equal<T>;
567 using greater = boost::compute::greater<T>;
569 using greater_equal = boost::compute::greater_equal<T>;
571 using equal = boost::compute::equal_to<T>;
573 using not_equal = boost::compute::not_equal_to<T>;
576namespace gpu{
namespace detail{
/// \brief Customisation point that registers an entity's runtime values with a kernel.
///
/// reg() takes the kernel and the entity and returns a `type`. The
/// specialisations in this file turn stored scalars into named kernel
/// arguments.
// NOTE(review): the `type` typedef and the body of reg() are missing from
// this listing; the primary template presumably hands the entity back
// unchanged — confirm.
581template<
class Entity>
582struct register_with_compute_kernel{
584 static type
const& reg(meta_kernel&, Entity
const& e){
/// \brief boost::compute meta_kernel extended with automatically named kernel arguments.
// NOTE(review): the template header of register_kernel_arg, the end of its
// body and the `m_id` member declaration are missing from this listing.
589struct meta_kernel:
public boost::compute::detail::meta_kernel{
// Forwards the kernel name to the base class and starts the id counter at 0.
590 meta_kernel(std::string
const& name):
boost::compute::detail::meta_kernel(name), m_id(0){}
// Adds `value` as a kernel argument under the generated name
// "rem_var<m_id>".
// NOTE(review): presumably the missing lines advance m_id and return the
// name — confirm.
593 std::string register_kernel_arg(T
const& value){
595 std::string name =
"rem_var"+std::to_string(m_id);
596 this->add_set_arg<T>(name,value);
// Registers an entity's values by dispatching to
// register_with_compute_kernel<Entity>::reg with this kernel.
600 template<
class Entity>
601 typename register_with_compute_kernel<Entity>::type
602 register_args(Entity
const& e){
603 return register_with_compute_kernel<Entity>::reg(*
this,e);
/// \brief Registration for bind_second: the bound argument becomes a kernel argument.
///
/// The wrapped functor is registered recursively. The result is a
/// bind_second whose second argument is the name of the new kernel argument.
// NOTE(review): the first line of reg()'s signature and its opening brace are
// missing from this listing.
609template<
class F,
class Arg2>
610struct register_with_compute_kernel<device_traits<gpu_tag>::template bind_second<F,Arg2> >{
611 typedef typename register_with_compute_kernel<F>::type f_type;
612 typedef device_traits<gpu_tag>::template bind_second<f_type,std::string> type;
615 device_traits<gpu_tag>::template bind_second<F,Arg2>
const& f
617 std::string arg2_name = k.register_kernel_arg(f.m_arg2);
618 return type(register_with_compute_kernel<F>::reg(k,f.m_function),arg2_name);
/// \brief Registration for compose: registers both wrapped functors and recomposes the results.
// NOTE(review): the first line of reg()'s signature and its opening brace are
// missing from this listing.
622template<
class F,
class G>
623struct register_with_compute_kernel<device_traits<gpu_tag>::template compose<F, G> >{
624 typedef typename register_with_compute_kernel<F>::type f_type;
625 typedef typename register_with_compute_kernel<G>::type g_type;
626 typedef typename device_traits<gpu_tag>::template compose<f_type, g_type> type;
629 device_traits<gpu_tag>::compose<F, G>
const& composed
// f is registered before g.
631 auto f_reg = register_with_compute_kernel<F>::reg(k,composed.m_f);
632 auto g_reg = register_with_compute_kernel<G>::reg(k,composed.m_g);
633 return type(f_reg, g_reg);
/// \brief Registration for compose_binary: registers the three wrapped functors and recombines the results.
// NOTE(review): the first line of reg()'s signature and its opening brace are
// missing from this listing.
637template<
class F1,
class F2,
class G>
638struct register_with_compute_kernel<device_traits<gpu_tag>::template compose_binary<F1, F2, G> >{
639 typedef typename register_with_compute_kernel<F1>::type f1_type;
640 typedef typename register_with_compute_kernel<F2>::type f2_type;
641 typedef typename register_with_compute_kernel<G>::type g_type;
642 typedef typename device_traits<gpu_tag>::template compose_binary<f1_type, f2_type, g_type> type;
645 device_traits<gpu_tag>::compose_binary<F1, F2, G>
const& composed
// Registration order: f1, then f2, then g.
647 auto f1_reg = register_with_compute_kernel<F1>::reg(k,composed.m_f1);
648 auto f2_reg = register_with_compute_kernel<F2>::reg(k,composed.m_f2);
649 auto g_reg = register_with_compute_kernel<G>::reg(k,composed.m_g);
650 return type(f1_reg, f2_reg, g_reg);
/// \brief Registration for transform_arguments: registers the three wrapped functors and recombines the results.
// NOTE(review): the first line of reg()'s signature and its opening brace are
// missing from this listing.
655template<
class F1,
class F2,
class G>
656struct register_with_compute_kernel<device_traits<gpu_tag>::template transform_arguments<F1, F2, G> >{
657 typedef typename register_with_compute_kernel<F1>::type f1_type;
658 typedef typename register_with_compute_kernel<F2>::type f2_type;
659 typedef typename register_with_compute_kernel<G>::type g_type;
660 typedef typename device_traits<gpu_tag>::template transform_arguments<f1_type, f2_type, g_type> type;
663 device_traits<gpu_tag>::transform_arguments<F1, F2, G>
const& composed
// Registration order: f1, then f2, then g.
665 auto f1_reg = register_with_compute_kernel<F1>::reg(k,composed.m_f1);
666 auto f2_reg = register_with_compute_kernel<F2>::reg(k,composed.m_f2);
667 auto g_reg = register_with_compute_kernel<G>::reg(k,composed.m_g);
668 return type(f1_reg, f2_reg, g_reg);
/// \brief Registration for constant<T,T>: the value becomes a kernel argument.
///
/// The result is a constant<T,std::string> holding the argument's name.
// NOTE(review): the template header, the first line of reg()'s signature and
// its opening brace are missing from this listing.
673struct register_with_compute_kernel<device_traits<gpu_tag>::template constant<T,T> >{
674 typedef typename device_traits<gpu_tag>::template constant<T,std::string> type;
677 device_traits<gpu_tag>::constant<T,T>
const& f
679 return type(k.register_kernel_arg(f.m_value));
/// \brief Registration for safe_divide<T,T>: the default value becomes a kernel argument.
///
/// The result is a safe_divide<T,std::string> holding the argument's name.
// NOTE(review): the template header, the first line of reg()'s signature and
// its opening brace are missing from this listing.
684struct register_with_compute_kernel<device_traits<gpu_tag>::template safe_divide<T,T> >{
685 typedef typename device_traits<gpu_tag>::template safe_divide<T,std::string> type;
688 device_traits<gpu_tag>::safe_divide<T,T>
const& f
690 return type(k.register_kernel_arg(f.default_value));
/// \brief Registration for multiply_and_add<T,T>: the scalar becomes a kernel argument.
///
/// The result is a multiply_and_add<T,std::string> holding the argument's name.
// NOTE(review): the template header, the first line of reg()'s signature and
// its opening brace are missing from this listing.
695struct register_with_compute_kernel<device_traits<gpu_tag>::template multiply_and_add<T,T> >{
696 typedef typename device_traits<gpu_tag>::template multiply_and_add<T,std::string> type;
699 device_traits<gpu_tag>::multiply_and_add<T,T>
const& f
701 return type(k.register_kernel_arg(f.m_scalar));
/// \brief Registration for operator_scalar<T,Op,T>: the scalar becomes a kernel argument.
///
/// The result is an operator_scalar<T,Op,std::string> holding the argument's
/// name; Op is carried over unchanged.
// NOTE(review): the first line of reg()'s signature and its opening brace are
// missing from this listing.
705template<
class T,
char Op>
706struct register_with_compute_kernel<device_traits<gpu_tag>::template operator_scalar<T, Op, T> >{
707 typedef typename device_traits<gpu_tag>::template operator_scalar<T, Op, std::string> type;
710 device_traits<gpu_tag>::operator_scalar<T, Op, T>
const& f
712 return type(k.register_kernel_arg(f.m_scalar));
/// \brief Registration for multiply_assign<T,T>: the scalar becomes a kernel argument.
///
/// The result is a multiply_assign<T,std::string> holding the argument's name.
// NOTE(review): the template header, the first line of reg()'s signature and
// its opening brace are missing from this listing.
717struct register_with_compute_kernel<device_traits<gpu_tag>::template multiply_assign<T,T> >{
718 typedef typename device_traits<gpu_tag>::template multiply_assign<T,std::string> type;
721 device_traits<gpu_tag>::multiply_assign<T,T>
const& f
723 return type(k.register_kernel_arg(f.m_scalar));
/// \brief Expression object for reading one element of a dense vector held in a buffer.
// The stream operator for this type in this file reads `buffer`, `offset`,
// `arg` and `stride`.
// NOTE(review): only the `buffer` member is declared in this listing; the
// other members are missing.
729template<
class Arg,
class T,
class S>
730struct invoked_dense_vector_element{
731 typedef T result_type;
735 boost::compute::buffer buffer;
/// \brief Writes a dense vector element access into the kernel source.
///
/// Requests an identifier of element type T for the buffer in global memory,
/// then emits the index expression `[ <offset>+(<arg>) *<stride>]`.
// NOTE(review): the closing parenthesis of the parameter list and the opening
// brace are missing from this listing.
/// \return the kernel that was written to
738template<
class Arg,
class T,
class S>
739boost::compute::detail::meta_kernel& operator<< (
740 boost::compute::detail::meta_kernel& k,
741 invoked_dense_vector_element<Arg, T, S>
const& e
743 return k<< k.get_buffer_identifier<T>(e.buffer, boost::compute::memory_object::global_memory)
744 <<
" [ "<<e.offset <<
"+("<<e.arg <<
") *"<<e.stride<<
']';
/// \brief Functor mapping an index expression to the matching dense vector element.
///
/// Invoking it builds an invoked_dense_vector_element from the index and the
/// stored stride, offset and buffer. S is the type used for stride and offset
/// and defaults to std::size_t.
// NOTE(review): the call operator's template header and the `m_stride` /
// `m_offset` member declarations are missing from this listing.
747template<
class T,
class S=std::
size_t>
748struct dense_vector_element{
749 typedef T result_type;
752 gpu::detail::invoked_dense_vector_element<Arg,T, S> operator()(Arg
const& x)
const{
753 return {x, m_stride, m_offset, m_buffer};
755 boost::compute::buffer m_buffer;
/// \brief Registration for dense_vector_element: stride and offset become kernel arguments.
///
/// The result is a dense_vector_element<T,std::string> built from the buffer
/// and the two argument names, stride first and offset second.
// NOTE(review): the template header, the first line of reg()'s signature and
// its opening brace are missing from this listing. The two registrations sit
// inside one braced initialiser list.
761struct register_with_compute_kernel<dense_vector_element<T,std::size_t> >{
762 typedef dense_vector_element<T,std::string> type;
765 dense_vector_element<T,std::size_t>
const& e
767 return {e.m_buffer, k.register_kernel_arg(e.m_stride),k.register_kernel_arg(e.m_offset)};
/// \brief Expression object for reading one element of a dense matrix held in a buffer.
// The stream operator for this type in this file reads `buffer`, `offset`,
// `arg1`, `stride1`, `arg2` and `stride2`.
// NOTE(review): only the `buffer` member is declared in this listing; the
// other members are missing.
772template<
class Arg1,
class Arg2,
class T,
class S>
773struct invoked_matrix_element{
774 typedef T result_type;
780 boost::compute::buffer buffer;
/// \brief Writes a dense matrix element access into the kernel source.
///
/// Requests an identifier of element type T for the buffer in global memory,
/// then emits the index expression
/// `[<offset>+ (<arg1>) * <stride1> + (<arg2>) * <stride2>]`.
// NOTE(review): the closing parenthesis of the parameter list and the opening
// brace are missing from this listing.
/// \return the kernel that was written to
784template<
class Arg1,
class Arg2,
class T,
class S>
785boost::compute::detail::meta_kernel& operator<< (
786 boost::compute::detail::meta_kernel& k,
787 invoked_matrix_element<Arg1, Arg2, T, S>
const& e
789 return k << k.get_buffer_identifier<T>(e.buffer, boost::compute::memory_object::global_memory)
790 <<
'['<<e.offset<<
"+ ("<<e.arg1 <<
") * "<<e.stride1<<
" + ("<<e.arg2 <<
") * "<<e.stride2<<
']';
/// \brief Functor mapping a pair of index expressions to the matching dense matrix element.
///
/// Invoking it builds an invoked_matrix_element from the two indices and the
/// stored strides, offset and buffer. S is the type used for strides and
/// offset and defaults to std::size_t.
// NOTE(review): the `m_stride1`, `m_stride2` and `m_offset` member
// declarations are missing from this listing.
793template<
class T,
class S=std::
size_t>
794struct dense_matrix_element{
795 typedef T result_type;
797 template<
class Arg1,
class Arg2>
798 gpu::detail::invoked_matrix_element<Arg1, Arg2, T, S> operator()(Arg1
const& x, Arg2
const& y)
const{
799 return {x, y, m_stride1, m_stride2, m_offset, m_buffer};
802 boost::compute::buffer m_buffer;
/// \brief Registration for dense_matrix_element: both strides and the offset become kernel arguments.
///
/// The result is a dense_matrix_element<T,std::string> built from the buffer
/// and the three argument names.
// NOTE(review): the template header, the first line of reg()'s signature, its
// opening brace and everything after the return statement are missing from
// this listing.
809struct register_with_compute_kernel<dense_matrix_element<T,std::size_t> >{
810 typedef dense_matrix_element<T,std::string> type;
813 dense_matrix_element<T,std::size_t>
const& e
// Registered as three separate statements, in the order stride1, stride2,
// offset.
815 auto const& stride1 = k.register_kernel_arg(e.m_stride1);
816 auto const& stride2 = k.register_kernel_arg(e.m_stride2);
817 auto const& offset = k.register_kernel_arg(e.m_offset);
818 return {e.m_buffer, stride1, stride2, offset};