include/shark/LinAlg/BLAS/kernels/default/dense

Go to the documentation of this file.
/*!
 *
 *
 * \brief       dense matrix matrix multiplication implementation
 *
 * \author      O. Krause
 * \date        2016
 *
 *
 * \par Copyright 1995-2015 Shark Development Team
 *
 * <BR><HR>
 * This file is part of Shark.
 * <http://image.diku.dk/shark/>
 *
 * Shark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Shark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Shark.  If not, see <http://www.gnu.org/licenses/>.
 *
 */
 
#ifndef REMORA_KERNELS_DEFAULT_DENSE_GEMM_HPP
#define REMORA_KERNELS_DEFAULT_DENSE_GEMM_HPP
 
#include "../gemv.hpp"//for dispatching to gemv
#include "../../assignment.hpp"//plus_assign
#include "../../proxy_expressions.hpp"//matrix row,column,transpose,range
#include "mgemm.hpp" //block macro kernel for dense gemm
#include <type_traits> //std::common_type
 
 
namespace remora{namespace bindings {
 
//  Dense Block-GEMM implementation based on boost.ublas
//  written by:
//  Copyright (c) 2016
//  Michael Lehn, Imre Palik
//
//  Distributed under the Boost Software License, Version 1.0. (See
//  accompanying file LICENSE_1_0.txt or copy at
//  http://www.boost.org/LICENSE_1_0.txt)
 
/// \brief Compile-time blocking parameters of the dense gemm kernel for value type T.
///
/// dense_gemm reads mc, nc and kc to size its packing buffers and to tile the
/// operands; the whole struct is additionally handed as a tag object to the
/// packing routines and to mgemm.
template <typename T>
struct gemm_block_size {
    using block = detail::block<T>;

    // Extents of one packed tile: mc rows of the lhs, kc entries along the
    // dimension shared by both operands.
    static constexpr unsigned mc = 128;
    static constexpr unsigned kc = 512; // stripe length

    // Stripe widths used inside a packed tile.
    static constexpr unsigned mr = 4; // stripe width for lhs
    static constexpr unsigned nr = 3 * block::max_vector_elements; // stripe width for rhs

    // Columns of the rhs per packed tile: the largest multiple of nr that
    // does not exceed 1024.
    static constexpr unsigned nc = (1024/nr) * nr;
};
 
/// \brief Blocking parameters used when the operand value type is float.
template <>
struct gemm_block_size<float> {
    using block = detail::block<float>;

    // Stripe widths used inside a packed tile.
    static constexpr unsigned mr = 4;  // stripe width for lhs
    static constexpr unsigned nr = 16; // stripe width for rhs

    // Extents of one packed tile: mc rows of the lhs, nc columns of the rhs,
    // kc entries along the dimension shared by both operands.
    static constexpr unsigned mc = 256;
    static constexpr unsigned nc = 4096;
    static constexpr unsigned kc = 512; // stripe length
};
 
/// \brief Blocking parameters used when the operand value type is long double.
template <>
struct gemm_block_size<long double> {
    using block = detail::block<long double>;

    // Stripe widths used inside a packed tile.
    static constexpr unsigned mr = 1; // stripe width for lhs
    static constexpr unsigned nr = 4; // stripe width for rhs

    // Extents of one packed tile: mc rows of the lhs, nc columns of the rhs,
    // kc entries along the dimension shared by both operands.
    static constexpr unsigned mc = 256;
    static constexpr unsigned nc = 4096;
    static constexpr unsigned kc = 512; // stripe length
};
 
//-- Dense gemm
/*!
 * \brief Blocked dense matrix-matrix product of e1 and e2, scaled by alpha, written into m.
 *
 * The operands are processed in tiles of at most MC x KC (lhs) and KC x NC (rhs)
 * elements. Every tile is first copied into an aligned scratch buffer by
 * pack_A_dense / pack_B_dense and then passed to mgemm together with a pointer
 * to the matching tile of m.
 *
 * mgemm is invoked once per tile along the shared dimension on the same target
 * tile, so it is expected to accumulate onto the existing contents of m rather
 * than overwrite them -- NOTE(review): confirm against mgemm.
 *
 * \param e1    left operand; its size2() is taken as the shared dimension K
 * \param e2    right operand
 * \param m     target matrix, must be row major (enforced at compile time);
 *              its size1()/size2() define M and N and it is written through
 *              raw_storage()
 * \param alpha scalar factor forwarded unchanged to mgemm
 *
 * The shapes of e1 and e2 are not validated here; callers are assumed to pass
 * operands that are compatible with m.
 *
 * If any of M, N or K is zero the function returns without allocating scratch
 * memory. The scratch buffers are released on every exit path, including when
 * packing, mgemm or the second allocation throws.
 */
template <class E1, class E2, class Mat>
void dense_gemm(
    matrix_expression<E1, cpu_tag> const& e1,
    matrix_expression<E2, cpu_tag> const& e2,
    matrix_expression<Mat, cpu_tag>& m,
    typename Mat::value_type alpha
){
    static_assert(std::is_same<typename Mat::orientation,row_major>::value,"target matrix must be row major");

    // Element type of the packed buffers: wide enough for both operands and the target.
    typedef typename std::common_type<
        typename E1::value_type, typename E2::value_type, typename Mat::value_type
    >::type value_type;

    // Blocking parameters are selected from the operand types only.
    typedef gemm_block_size<
        typename std::common_type<typename E1::value_type, typename E2::value_type>::type
    > block_size;

    typedef boost::alignment::aligned_allocator<value_type,block_size::block::align> allocator_type;

    // Owns one block of uninitialized aligned storage and returns it to the
    // allocator on scope exit, so the memory is not leaked when an exception
    // propagates out of the packing routines or mgemm.
    struct packing_buffer {
        allocator_type allocator;
        value_type* data;
        std::size_t size;

        explicit packing_buffer(std::size_t count)
        : data(allocator.allocate(count)), size(count) {}
        ~packing_buffer() { allocator.deallocate(data, size); }

        packing_buffer(packing_buffer const&) = delete;
        packing_buffer& operator=(packing_buffer const&) = delete;
    };

    static const std::size_t MC = block_size::mc;
    static const std::size_t NC = block_size::nc;
    static const std::size_t KC = block_size::kc;

    const std::size_t M = m().size1();
    const std::size_t N = m().size2();
    const std::size_t K = e1().size2 ();

    // None of the loops below would execute; skip the (large) scratch allocation.
    if (M == 0 || N == 0 || K == 0) return;

    // Number of tiles along each dimension, rounded up.
    const std::size_t mb = (M+MC-1) / MC;
    const std::size_t nb = (N+NC-1) / NC;
    const std::size_t kb = (K+KC-1) / KC;

    //obtain uninitialized aligned storage
    packing_buffer bufferA(MC * KC);
    packing_buffer bufferB(NC * KC);
    value_type* A = bufferA.data;
    value_type* B = bufferB.data;

    auto storageM = m().raw_storage();
    auto C_ = storageM.values;
    const std::size_t ldc = storageM.leading_dimension;
    for (std::size_t j=0; j<nb; ++j) {
        // The last tile in a dimension may be smaller than the nominal block size.
        std::size_t nc = std::min(NC, N - j*NC);

        for (std::size_t l=0; l<kb; ++l) {
            std::size_t kc = std::min(KC, K - l*KC);
            // The packed rhs tile is reused for every row tile of the lhs below.
            auto Bs = subrange(e2, l*KC, l*KC+kc, j*NC, j*NC+nc);
            pack_B_dense(Bs, B, block_size());

            for (std::size_t i=0; i<mb; ++i) {
                std::size_t mc = std::min(MC, M - i*MC);
                auto As = subrange(e1, i*MC, i*MC+mc, l*KC, l*KC+kc);
                pack_A_dense(As, A, block_size());

                // Target tile starts at row i*MC, column j*NC of the row-major storage.
                // The literal 1 is presumably the column stride of m -- TODO confirm in mgemm.
                mgemm(
                    mc, nc, kc, alpha, A, B,
                    &C_[i*MC*ldc+j*NC], ldc , 1, block_size()
                );
            }
        }
    }
    // bufferB and bufferA release their storage here.
}
 
}}
#endif