33#ifndef REMORA_GPU_TRAITS_HPP 
   34#define REMORA_GPU_TRAITS_HPP 
   36#include <boost/compute/command_queue.hpp> 
   37#include <boost/compute/core.hpp> 
   38#include <boost/compute/container/vector.hpp> 
   39#include <boost/compute/functional/operator.hpp> 
   40#include <boost/compute/functional.hpp> 
   42namespace remora{
namespace gpu{
 
   44template<
class T, 
class Tag>
 
   45struct dense_vector_storage{
 
   46    typedef Tag storage_tag;
 
   48    boost::compute::buffer buffer;
 
   52    dense_vector_storage(){}
 
   53    dense_vector_storage(boost::compute::buffer 
const& buffer, std::size_t offset, std::size_t stride)
 
   54    :buffer(buffer), offset(offset), stride(stride){}
 
   55    template<
class U, 
class Tag2>
 
   56    dense_vector_storage(dense_vector_storage<U, Tag2> 
const& storage):
 
   57    buffer(storage.buffer), offset(storage.offset), stride(storage.stride){
 
   58        static_assert(std::is_convertible<U&, T&>::value, 
"incompatible storage");
 
   59        static_assert(!(std::is_same<Tag,continuous_dense_tag>::value && std::is_same<Tag2,dense_tag>::value), 
"Trying to assign dense to continuous dense storage");
 
   62    dense_vector_storage<T,Tag> sub_region(std::size_t offset)
 const{
 
   63        return {buffer, this->offset+offset * stride, stride};
 
   67template<
class T, 
class Tag>
 
   68struct dense_matrix_storage{
 
   69    typedef Tag storage_tag;
 
   71    struct row_storage: 
public std::conditional<
 
   72        std::is_same<O,row_major>::value,
 
   73        dense_vector_storage<T, Tag>,
 
   74        dense_vector_storage<T, dense_tag>
 
   77    struct rows_storage: 
public std::conditional<
 
   78        std::is_same<O,row_major>::value,
 
   79        dense_matrix_storage<T, Tag>,
 
   80        dense_matrix_storage<T, dense_tag>
 
   83    typedef dense_vector_storage<T,Tag> diag_storage;
 
   84    typedef dense_matrix_storage<T,dense_tag> sub_region_storage;
 
   86    boost::compute::buffer buffer;
 
   88    std::size_t leading_dimension;
 
   90    dense_matrix_storage(){}
 
   91    dense_matrix_storage(boost::compute::buffer 
const& buffer, std::size_t offset, std::size_t leading_dimension)
 
   92    :buffer(buffer), offset(offset), leading_dimension(leading_dimension){}
 
   94    template<
class U, 
class Tag2>
 
   95    dense_matrix_storage(dense_matrix_storage<U, Tag2> 
const& storage):
 
   96    buffer(storage.buffer), offset(storage.offset), leading_dimension(storage.leading_dimension){
 
   97        static_assert(std::is_convertible<U&, T&>::value, 
"incompatible storage");
 
   98        static_assert(!(std::is_same<Tag,continuous_dense_tag>::value && std::is_same<Tag2,dense_tag>::value), 
"Trying to assign dense to continuous dense storage");
 
  101    template<
class Orientation>
 
  102    sub_region_storage sub_region(std::size_t offset1, std::size_t offset2, Orientation)
 const{
 
  103        std::size_t offset_major = Orientation::index_M(offset1,offset2);
 
  104        std::size_t offset_minor = Orientation::index_m(offset1,offset2);
 
  105        return {buffer, offset + offset_major*leading_dimension+offset_minor, leading_dimension};
 
  108    template<
class Orientation>
 
  109    typename row_storage<Orientation>::type row(std::size_t i, Orientation)
 const{
 
  110        return {buffer, offset + i * Orientation::index_M(leading_dimension,std::size_t(1)), Orientation::index_m(leading_dimension,std::size_t(1))};
 
  113    template<
class Orientation>
 
  114    typename rows_storage<Orientation>::type sub_rows(std::size_t i, Orientation)
 const{
 
  115        std::size_t stride = Orientation::index_M(leading_dimension,(std::size_t)1);
 
  116        return {buffer,offset + i * stride, leading_dimension};
 
  120        return {buffer, offset, leading_dimension+1};
 
  123    dense_vector_storage<T, continuous_dense_tag> linear()
 const{
 
  124        return {buffer, offset, 1};
 
  134template<
class T, 
class Stored = T>
 
  135struct invoked_constant{
 
  139template<
class Arg1, 
class T, 
char Op, 
class Stored>
 
  140struct invoked_operator_scalar{
 
  141    typedef T result_type;
 
  146template<
class Arg1, 
class T, 
class Stored = T>
 
  147struct invoked_add_scalar{
 
  148    typedef T result_type;
 
  153template<
class Arg1, 
class Arg2, 
class T, 
class Stored=T>
 
  154struct invoked_multiply_and_add{
 
  155    typedef T result_type;
 
  161template<
class Arg1, 
class T>
 
  162struct invoked_soft_plus{
 
  163    typedef T result_type;
 
  166template<
class Arg1, 
class T>
 
  167struct invoked_sigmoid{
 
  168    typedef T result_type;
 
  172template<
class Arg1, 
class T>
 
  174    typedef T result_type;
 
  178template<
class Arg1, 
class T>
 
  180    typedef T result_type;
 
  184template<
class Arg1, 
class Arg2, 
class T, 
class S>
 
  185struct invoked_safe_div{
 
  186    typedef T result_type;
 
  193template<
class Arg1, 
class T, 
char Op, 
class S>
 
  194boost::compute::detail::meta_kernel& 
operator<<(boost::compute::detail::meta_kernel& k, invoked_operator_scalar<Arg1,T, Op, S> 
const& e){
 
  195    return k << 
'('<<e.arg1 << Op << e.m_scalar<<
')';
 
  197template<
class Arg1, 
class Arg2, 
class T, 
class S>
 
  198boost::compute::detail::meta_kernel& 
operator<<(boost::compute::detail::meta_kernel& k, invoked_multiply_and_add<Arg1,Arg2,T, S> 
const& e){
 
  199    return k << 
'('<<e.arg1<<
'+'<<e.m_scalar << 
'*'<< e.arg2<<
')';
 
  201template<
class Arg1, 
class T>
 
  202boost::compute::detail::meta_kernel& 
operator<<(boost::compute::detail::meta_kernel& k, invoked_soft_plus<Arg1,T> 
const& e){
 
  203    return k << 
"(log(1+exp("<< e.arg1<<
")))";
 
  205template<
class Arg1, 
class T>
 
  206boost::compute::detail::meta_kernel& 
operator<<(boost::compute::detail::meta_kernel& k, invoked_sigmoid<Arg1,T> 
const& e){
 
  207    return k << 
"(1/(1+exp(-"<< e.arg1<<
")))";
 
  209template<
class Arg1, 
class T>
 
  210boost::compute::detail::meta_kernel& 
operator<<(boost::compute::detail::meta_kernel& k, invoked_sqr<Arg1,T> 
const& e){
 
  211    return k << 
'('<<e.arg1<<
'*'<<e.arg1<<
')';
 
  213template<
class Arg1, 
class T>
 
  214boost::compute::detail::meta_kernel& 
operator<<(boost::compute::detail::meta_kernel& k, invoked_inv<Arg1,T> 
const& e){
 
  215    return k << 
"1/("<<e.arg1<<
')';
 
  218template<
class T, 
class S>
 
  219boost::compute::detail::meta_kernel& 
operator<<(boost::compute::detail::meta_kernel& k, invoked_constant<T, S> 
const& e){
 
  220    return k << e.m_value;
 
  224template<
class Arg1, 
class Arg2, 
class T, 
class S>
 
  225boost::compute::detail::meta_kernel& 
operator<<(boost::compute::detail::meta_kernel& k, invoked_safe_div<Arg1,Arg2,T, S> 
const& e){
 
  226    return k << 
"(("<<e.arg2<<
"!=0)?"<<e.arg1<<
'/'<<e.arg2<<
':'<<e.default_value<<
')';
 
  233struct device_traits<gpu_tag>{
 
  234    typedef boost::compute::command_queue queue_type;
 
  236    static queue_type& default_queue(){
 
  237        return boost::compute::system::default_queue();
 
  242    template <
class Iterator, 
class Functor>
 
  243    struct transform_iterator{
 
  244        typedef no_iterator type;
 
  247    template <
class Iterator1, 
class Iterator2, 
class Functor>
 
  248    struct binary_transform_iterator{
 
  249        typedef no_iterator type;
 
  253    struct constant_iterator{
 
  254        typedef no_iterator type;
 
  258    struct one_hot_iterator{
 
  259        typedef no_iterator type;
 
  262    template<
class Closure>
 
  263    struct indexed_iterator{
 
  264        typedef no_iterator type;
 
  270    template<
class F, 
class G>
 
  272        typedef typename G::result_type result_type;
 
  273        compose(F 
const& f, G 
const& g): m_f(f), m_g(g){ }
 
  276        auto operator()( Arg1 
const& x) 
const -> 
decltype(std::declval<G const&>()(std::declval<F const&>()(x))){
 
  279        template<
class Arg1, 
class Arg2>
 
  280        auto operator()( Arg1 
const& x, Arg2 
const& y) 
const -> 
decltype(std::declval<G const&>()(std::declval<F const&>()(x,y))){
 
  281            return m_g(m_f(x,y));
 
  289    template<
class F1, 
class F2, 
class G>
 
  290    struct compose_binary{
 
  291        typedef typename G::result_type result_type;
 
  292        compose_binary(F1 
const& f1, F2 
const& f2, G 
const& g): m_f1(f1), m_f2(f2), m_g(g){ }
 
  295        auto operator()( Arg1 
const& x) 
const -> 
decltype(std::declval<G const&>()(std::declval<F1 const&>()(x),std::declval<F2 const&>()(x))){
 
  296            return m_g(m_f1(x), m_f2(x));
 
  298        template<
class Arg1, 
class Arg2>
 
  299        auto operator()( Arg1 
const& x, Arg2 
const& y) 
const -> 
decltype(std::declval<G const&>()(std::declval<F1 const&>()(x,y),std::declval<F2 const&>()(x,y))){
 
  300            return m_g(m_f1(x,y), m_f2(x,y));
 
  310    template<
class F1, 
class F2, 
class G>
 
  311    struct transform_arguments{
 
  312        typedef typename G::result_type result_type;
 
  313        transform_arguments(F1 
const& f1, F2 
const& f2, G 
const& g): m_f1(f1), m_f2(f2), m_g(g){ }
 
  315        template<
class Arg1, 
class Arg2>
 
  316        auto operator()( Arg1 
const& x, Arg2 
const& y) 
const -> 
decltype(std::declval<G const&>()(std::declval<F1 const&>()(x),std::declval<F2 const&>()(y))){
 
  317            return m_g(m_f1(x),m_f2(y));
 
  325    template<
class F, 
class Arg2>
 
  327        typedef typename F::result_type result_type;
 
  328        bind_second(F 
const& f, Arg2 
const& arg2) : m_function(f), m_arg2(arg2){ }
 
  331        auto operator()(Arg1 
const& arg1) 
const -> 
decltype(std::declval<F const&>()(arg1,std::declval<Arg2 const&>()))
 
  333            return m_function(arg1, m_arg2);
 
  342    template<
class F, 
class G>
 
  343    static compose<F,G> make_compose(F 
const& f, G 
const&g){
 
  344        return compose<F,G>(f,g);
 
  347    template<
class F1, 
class F2, 
class G>
 
  348    static compose_binary<F1, F2, G> make_compose_binary(F1 
const& f1, F2 
const& f2, G 
const&g){
 
  349        return compose_binary<F1, F2, G>(f1, f2, g);
 
  352    template<
class F1, 
class F2, 
class G>
 
  353    static transform_arguments<F1, F2, G> make_transform_arguments(F1 
const& f1, F2 
const& f2, G 
const& g){
 
  354        return transform_arguments<F1, F2, G>(f1, f2, g);
 
  357    template<
class F, 
class Arg2>
 
  358    static bind_second<F,Arg2> make_bind_second(F 
const& f, Arg2 
const& arg2){
 
  359        return bind_second<F,Arg2>(f,arg2);
 
  367    using add = boost::compute::plus<T>;
 
  369    using subtract = boost::compute::minus<T>;
 
  371    using multiply = boost::compute::multiplies<T>;
 
  373    using divide = boost::compute::divides<T>;
 
  375    using modulo = boost::compute::modulus<T>;
 
  377    using pow = boost::compute::pow<T>;
 
  378    template<
class T, 
class S=T>
 
  380        typedef T result_type;
 
  381        safe_divide(
S const& default_value) : default_value(default_value) { }
 
  383        template<
class Arg1, 
class Arg2>
 
  384        gpu::detail::invoked_safe_div<Arg1,Arg2, T,S> operator()(
const Arg1 &x, 
const Arg2& y)
 const 
  386            return {x,y,default_value};
 
  390    template<
class T, 
class S= T>
 
  391    struct multiply_and_add{
 
  392        typedef T result_type;
 
  393        multiply_and_add(
S const& scalar) :m_scalar(scalar) { }
 
  395        template<
class Arg1, 
class Arg2>
 
  396        gpu::detail::invoked_multiply_and_add<Arg1,Arg2,T,S> operator()(
const Arg1 &x, 
const Arg2& y)
 const 
  398            return {x,y, m_scalar};
 
  404    template<
class T, 
char Op, 
class S>
 
  405    struct operator_scalar{
 
  406        typedef T result_type;
 
  407        operator_scalar(
S const& scalar) : m_scalar(scalar) { }
 
  410        gpu::detail::invoked_operator_scalar<Arg1,T, Op, S> operator()(Arg1 
const& x)
 const 
  412            return {x, m_scalar};
 
  418    using multiply_scalar = operator_scalar<T, 
'*', T>;
 
  420    using add_scalar = operator_scalar<T, 
'+', T>;
 
  422    using divide_scalar = operator_scalar<T, 
'/', T>;
 
  424    using modulo_scalar = operator_scalar<T, 
'%', T>;
 
  426    template<
class T, 
class S=T>
 
  427    struct multiply_assign{
 
  428        typedef T result_type;
 
  429        multiply_assign(
S const& scalar): m_scalar(scalar) { }
 
  431        template<
class Arg1, 
class Arg2>
 
  432        gpu::detail::invoked_operator_scalar<Arg2,T,
'*',
S> operator()(
const Arg1&, 
const Arg2& y)
 const 
  434            return {y, m_scalar};
 
  440        typedef T result_type;
 
  443        Arg 
const& operator()(Arg 
const& arg)
 const{
 
  450        typedef T result_type;
 
  452        template<
class Arg1, 
class Arg2>
 
  453        Arg1 
const& operator()(Arg1 
const& arg1, Arg2 
const&)
 const{
 
  459        typedef T result_type;
 
  461        template<
class Arg1, 
class Arg2>
 
  462        Arg2 
const& operator()(Arg1 
const&, Arg2 
const& arg2)
 const{
 
  467    template<
class T, 
class S=T>
 
  469        typedef T result_type;
 
  470        constant(
S const& value): m_value(value){}
 
  473        gpu::detail::invoked_constant<T,S> operator()(Arg 
const&)
 const 
  477        template<
class Arg1, 
class Arg2>
 
  478        gpu::detail::invoked_constant<T,S> operator()(Arg1 
const&, Arg2 
const&)
 const 
  489    using log = boost::compute::log<T>;
 
  491    using exp = boost::compute::exp<T>;
 
  493    using sin = boost::compute::sin<T>;
 
  495    using cos = boost::compute::cos<T>;
 
  497    using tan = boost::compute::tan<T>;
 
  499    using asin = boost::compute::asin<T>;
 
  501    using acos = boost::compute::acos<T>;
 
  503    using atan = boost::compute::atan<T>;
 
  505    using tanh = boost::compute::tanh<T>;
 
  507    using sqrt = boost::compute::sqrt<T>;
 
  509    using cbrt = boost::compute::cbrt<T>;
 
  511    using abs = boost::compute::fabs<T>;
 
  514    using erf = boost::compute::erf<T>;
 
  516    using erfc = boost::compute::erfc<T>;
 
  520        typedef T result_type;
 
  523        gpu::detail::invoked_sqr<Arg1,T> operator()(
const Arg1 &x)
 const{
 
  529        typedef T result_type;
 
  532        gpu::detail::invoked_soft_plus<Arg1,T> operator()(
const Arg1 &x)
 const{
 
  538        typedef T result_type;
 
  541        gpu::detail::invoked_sigmoid<Arg1,T> operator()(
const Arg1 &x)
 const{
 
  547        typedef T result_type;
 
  550        gpu::detail::invoked_inv<Arg1,T> operator()(
const Arg1 &x)
 const{
 
  557    using min = boost::compute::fmin<T>;
 
  559    using max = boost::compute::fmax<T>;
 
  563    using less = boost::compute::less<T>;
 
  565    using less_equal  = boost::compute::less_equal<T>;
 
  567    using greater = boost::compute::greater<T>;
 
  569    using greater_equal  = boost::compute::greater_equal<T>;
 
  571    using equal = boost::compute::equal_to<T>;
 
  573    using not_equal  = boost::compute::not_equal_to<T>;
 
  576namespace gpu{
namespace detail{
 
  581template<
class Entity>
 
  582struct register_with_compute_kernel{
 
  584    static type 
const& reg(meta_kernel&, Entity 
const& e){
 
  589struct meta_kernel: 
public boost::compute::detail::meta_kernel{
 
  590    meta_kernel(std::string 
const& name):
boost::compute::detail::meta_kernel(name), m_id(0){}
 
  593    std::string register_kernel_arg(T 
const& value){
 
  595        std::string name = 
"rem_var"+std::to_string(m_id);
 
  596        this->add_set_arg<T>(name,value);
 
  600    template<
class Entity>
 
  601    typename register_with_compute_kernel<Entity>::type
 
  602    register_args(Entity 
const& e){
 
  603        return register_with_compute_kernel<Entity>::reg(*
this,e);
 
  609template<
class F, 
class Arg2>
 
  610struct register_with_compute_kernel<device_traits<gpu_tag>::template bind_second<F,Arg2> >{
 
  611    typedef typename register_with_compute_kernel<F>::type f_type;
 
  612    typedef device_traits<gpu_tag>::template bind_second<f_type,std::string> type;
 
  615        device_traits<gpu_tag>::template bind_second<F,Arg2> 
const& f
 
  617        std::string arg2_name = k.register_kernel_arg(f.m_arg2);
 
  618        return type(register_with_compute_kernel<F>::reg(k,f.m_function),arg2_name);
 
  622template<
class F, 
class G>
 
  623struct register_with_compute_kernel<device_traits<gpu_tag>::template compose<F, G> >{
 
  624    typedef typename register_with_compute_kernel<F>::type f_type;
 
  625    typedef typename register_with_compute_kernel<G>::type g_type;
 
  626    typedef typename device_traits<gpu_tag>::template compose<f_type, g_type> type;
 
  629        device_traits<gpu_tag>::compose<F, G> 
const& composed
 
  631        auto f_reg = register_with_compute_kernel<F>::reg(k,composed.m_f);
 
  632        auto g_reg = register_with_compute_kernel<G>::reg(k,composed.m_g);
 
  633        return type(f_reg, g_reg);
 
  637template<
class F1, 
class F2, 
class G>
 
  638struct register_with_compute_kernel<device_traits<gpu_tag>::template compose_binary<F1, F2, G> >{
 
  639    typedef typename register_with_compute_kernel<F1>::type f1_type;
 
  640    typedef typename register_with_compute_kernel<F2>::type f2_type;
 
  641    typedef typename register_with_compute_kernel<G>::type g_type;
 
  642    typedef typename device_traits<gpu_tag>::template compose_binary<f1_type, f2_type, g_type> type;
 
  645        device_traits<gpu_tag>::compose_binary<F1, F2, G> 
const& composed
 
  647        auto f1_reg = register_with_compute_kernel<F1>::reg(k,composed.m_f1);
 
  648        auto f2_reg = register_with_compute_kernel<F2>::reg(k,composed.m_f2);
 
  649        auto g_reg = register_with_compute_kernel<G>::reg(k,composed.m_g);
 
  650        return type(f1_reg, f2_reg, g_reg);
 
  655template<
class F1, 
class F2, 
class G>
 
  656struct register_with_compute_kernel<device_traits<gpu_tag>::template transform_arguments<F1, F2, G> >{
 
  657    typedef typename register_with_compute_kernel<F1>::type f1_type;
 
  658    typedef typename register_with_compute_kernel<F2>::type f2_type;
 
  659    typedef typename register_with_compute_kernel<G>::type g_type;
 
  660    typedef typename device_traits<gpu_tag>::template transform_arguments<f1_type, f2_type, g_type> type;
 
  663        device_traits<gpu_tag>::transform_arguments<F1, F2, G> 
const& composed
 
  665        auto f1_reg = register_with_compute_kernel<F1>::reg(k,composed.m_f1);
 
  666        auto f2_reg = register_with_compute_kernel<F2>::reg(k,composed.m_f2);
 
  667        auto g_reg = register_with_compute_kernel<G>::reg(k,composed.m_g);
 
  668        return type(f1_reg, f2_reg, g_reg);
 
  673struct register_with_compute_kernel<device_traits<gpu_tag>::template constant<T,T> >{
 
  674    typedef typename device_traits<gpu_tag>::template constant<T,std::string> type;
 
  677        device_traits<gpu_tag>::constant<T,T> 
const& f
 
  679        return type(k.register_kernel_arg(f.m_value));
 
  684struct register_with_compute_kernel<device_traits<gpu_tag>::template safe_divide<T,T> >{
 
  685    typedef typename device_traits<gpu_tag>::template safe_divide<T,std::string> type;
 
  688        device_traits<gpu_tag>::safe_divide<T,T> 
const& f
 
  690        return type(k.register_kernel_arg(f.default_value));
 
  695struct register_with_compute_kernel<device_traits<gpu_tag>::template multiply_and_add<T,T> >{
 
  696    typedef typename device_traits<gpu_tag>::template multiply_and_add<T,std::string> type;
 
  699        device_traits<gpu_tag>::multiply_and_add<T,T> 
const& f
 
  701        return type(k.register_kernel_arg(f.m_scalar));
 
  705template<
class T, 
char Op>
 
  706struct register_with_compute_kernel<device_traits<gpu_tag>::template operator_scalar<T, Op, T> >{
 
  707    typedef typename device_traits<gpu_tag>::template operator_scalar<T, Op, std::string> type;
 
  710        device_traits<gpu_tag>::operator_scalar<T, Op, T> 
const& f
 
  712        return type(k.register_kernel_arg(f.m_scalar));
 
  717struct register_with_compute_kernel<device_traits<gpu_tag>::template multiply_assign<T,T> >{
 
  718    typedef typename device_traits<gpu_tag>::template multiply_assign<T,std::string> type;
 
  721        device_traits<gpu_tag>::multiply_assign<T,T> 
const& f
 
  723        return type(k.register_kernel_arg(f.m_scalar));
 
  729template<
class Arg, 
class T, 
class S>
 
  730struct invoked_dense_vector_element{
 
  731    typedef T result_type;
 
  735    boost::compute::buffer buffer;
 
  738template<
class Arg,
class T, 
class S>
 
  739boost::compute::detail::meta_kernel& operator<< (
 
  740    boost::compute::detail::meta_kernel& k, 
 
  741    invoked_dense_vector_element<Arg, T, S> 
const& e
 
  743    return k<< k.get_buffer_identifier<T>(e.buffer, boost::compute::memory_object::global_memory)
 
  744        <<
" [ "<<e.offset <<
"+("<<e.arg <<
") *"<<e.stride<<
']';
 
  747template<
class T, 
class S=std::
size_t>
 
  748struct dense_vector_element{
 
  749    typedef T result_type;
 
  752    gpu::detail::invoked_dense_vector_element<Arg,T, S> operator()(Arg 
const& x)
 const{
 
  753        return {x, m_stride, m_offset, m_buffer};
 
  755    boost::compute::buffer m_buffer;
 
  761struct register_with_compute_kernel<dense_vector_element<T,std::size_t> >{
 
  762    typedef dense_vector_element<T,std::string> type;
 
  765        dense_vector_element<T,std::size_t> 
const& e
 
  767        return {e.m_buffer, k.register_kernel_arg(e.m_stride),k.register_kernel_arg(e.m_offset)};
 
  772template<
class Arg1, 
class Arg2,  
class T, 
class S>
 
  773struct invoked_matrix_element{
 
  774    typedef T result_type;
 
  780    boost::compute::buffer buffer;
 
  784template<
class Arg1, 
class Arg2, 
class T, 
class S>
 
  785boost::compute::detail::meta_kernel& operator<< (
 
  786    boost::compute::detail::meta_kernel& k, 
 
  787    invoked_matrix_element<Arg1, Arg2, T, S> 
const& e
 
  789    return k << k.get_buffer_identifier<T>(e.buffer, boost::compute::memory_object::global_memory)
 
  790                 <<
'['<<e.offset<<
"+ ("<<e.arg1 <<
") * "<<e.stride1<<
" + ("<<e.arg2 <<
") * "<<e.stride2<<
']';
 
  793template<
class T, 
class S=std::
size_t>
 
  794struct dense_matrix_element{
 
  795    typedef T result_type;
 
  797    template<
class Arg1, 
class Arg2>
 
  798    gpu::detail::invoked_matrix_element<Arg1, Arg2, T, S> operator()(Arg1 
const& x, Arg2 
const& y)
 const{
 
  799        return {x, y, m_stride1, m_stride2, m_offset, m_buffer};
 
  802    boost::compute::buffer m_buffer;
 
  809struct register_with_compute_kernel<dense_matrix_element<T,std::size_t> >{
 
  810    typedef dense_matrix_element<T,std::string> type;
 
  813        dense_matrix_element<T,std::size_t> 
const& e
 
  815        auto const& stride1 = k.register_kernel_arg(e.m_stride1); 
 
  816        auto const& stride2 = k.register_kernel_arg(e.m_stride2); 
 
  817        auto const& offset = k.register_kernel_arg(e.m_offset); 
 
  818        return {e.m_buffer, stride1, stride2, offset};