$include_dir="/home/hyper-archives/ublas/include"; include("$include_dir/msg-header.inc") ?>
Subject: [ublas] [RFC PATCH] ublas: improved dense matrix multiplication performance
From: Imre Palik (imre_palik_at_[hidden])
Date: 2016-02-13 08:43:35
This patch includes the gemm implementation from Michael Lehn to
boost::ublas.
This modifies the workings of ublas::prod() and ublas::axppy_prod()
to use gemm() above a certain matrix size.
This patch only contains the basic architecture, and a generic c++
implementation. All the architecture, or compiler specific stuff
will go to follow-up patches.
Signed-off-by: Imre Palik <imre_palik_at_[hidden]>
Cc: Michael Lehn <michael.lehn_at_[hidden]>
---
include/boost/numeric/ublas/detail/gemm.hpp | 279 ++++++++++++++++++++++
include/boost/numeric/ublas/matrix_expression.hpp | 59 ++++-
include/boost/numeric/ublas/operation.hpp | 87 ++++++-
3 files changed, 407 insertions(+), 18 deletions(-)
create mode 100644 include/boost/numeric/ublas/detail/gemm.hpp
diff --git a/include/boost/numeric/ublas/detail/gemm.hpp b/include/boost/numeric/ublas/detail/gemm.hpp
new file mode 100644
index 0000000..cb4a343
--- /dev/null
+++ b/include/boost/numeric/ublas/detail/gemm.hpp
@@ -0,0 +1,279 @@
+//
+// Copyright (c) 2016
+// Michael Lehn, Imre Palik
+//
+// Distributed under the Boost Software License, Version 1.0. (See
+// accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef _BOOST_UBLAS_GEMM_
+#define _BOOST_UBLAS_GEMM_
+
+#include <boost/type_traits/common_type.hpp>
+#include <boost/align/aligned_alloc.hpp>
+#include <boost/align/assume_aligned.hpp>
+#include <boost/static_assert.hpp>
+
+namespace boost { namespace numeric { namespace ublas { namespace detail {
+
+ template <typename T>
+ struct prod_block_size {
+ static const unsigned mc = 256;
+ static const unsigned kc = 512; // stripe length
+ static const unsigned nc = 4092;
+ static const unsigned mr = 4; // stripe width for lhs
+ static const unsigned nr = 12; // stripe width for rhs
+ static const unsigned align = 64; // align temporary arrays to this boundary
+ static const unsigned limit = 26; // Use gemm from this size
+ BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size.");
+ BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR.");
+ BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR.");
+ };
+
+ template <typename E>
+ void
+ gescal(const typename E::value_type &alpha, matrix_expression<E> &X)
+ {
+ typedef typename E::size_type size_type;
+
+ for (size_type i=0; i<X().size1(); ++i) {
+ for (size_type j=0; j<X().size2(); ++j) {
+ X()(i,j) *= alpha;
+ }
+ }
+ }
+
+ template <typename Index, typename T>
+ void
+ geaxpy(Index m, Index n, const T &alpha,
+ const T *X, Index incRowX, Index incColX,
+ T *Y, Index incRowY, Index incColY)
+ {
+ for (Index j=0; j<n; ++j) {
+ for (Index i=0; i<m; ++i) {
+ Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX];
+ }
+ }
+ }
+
+ template <typename Index, typename TX>
+ void
+ gescal(Index m, Index n,
+ const TX &alpha,
+ TX *X, Index incRowX, Index incColX)
+ {
+ for (Index j=0; j<n; ++j) {
+ for (Index i=0; i<m; ++i) {
+ X[i*incRowX+j*incColX] *= alpha;
+ }
+ }
+ }
+
+ //-- Micro Kernel --------------------------------------------------------------
+ template <typename Index, typename T, typename TC, typename BlockSize>
+ void
+ ugemm(Index kc, TC alpha, const T *A, const T *B,
+ TC beta, TC *C, Index incRowC, Index incColC)
+ {
+ BOOST_ALIGN_ASSUME_ALIGNED(A, BlockSize::align);
+ BOOST_ALIGN_ASSUME_ALIGNED(B, BlockSize::align);
+ static const Index MR = BlockSize::mr;
+ static const Index NR = BlockSize::nr;
+ typename boost::aligned_storage<sizeof(T[MR*NR]),alignof(BlockSize::align)>::type Pa;
+ T *P = reinterpret_cast<T*>(Pa.address());
+ for (unsigned c = 0; c < MR * NR; c++)
+ P[c] = 0;
+
+ for (Index l=0; l<kc; ++l) {
+ for (Index i=0; i<MR; ++i) {
+ for (Index j=0; j<NR; ++j) {
+ P[i* NR+j] += A[i]*B[j];
+ }
+ }
+ A += MR;
+ B += NR;
+ }
+
+ if (alpha!=TC(1)) {
+ for (Index i=0; i<MR; ++i) {
+ for (Index j=0; j<NR; ++j) {
+ P[i*NR+j] *= alpha;
+ }
+ }
+ }
+
+ if (beta == TC(0)) {
+ for (Index i=0; i<MR; ++i) {
+ for (Index j=0; j<NR; ++j) {
+ C[i*incRowC+j*incColC] = P[i*NR+j];
+ }
+ }
+ } else {
+ for (Index i=0; i<MR; ++i) {
+ for (Index j=0; j<NR; ++j) {
+ C[i*incRowC+j*incColC] *= beta;
+ C[i*incRowC+j*incColC] += P[i*NR+j];
+ }
+ }
+ }
+ }
+
+ //-- Macro Kernel --------------------------------------------------------------
+ template <typename Index, typename T, typename TC, typename BlockSize>
+ void
+ mgemm(Index mc, Index nc, Index kc, TC alpha,
+ const T *A, const T *B, TC beta,
+ TC *C, Index incRowC, Index incColC)
+ {
+ static const Index MR = BlockSize::mr;
+ static const Index NR = BlockSize::nr;
+ const Index mp = (mc+MR-1) / MR;
+ const Index np = (nc+NR-1) / NR;
+ const Index mr_ = mc % MR;
+ const Index nr_ = nc % NR;
+
+ for (Index j=0; j<np; ++j) {
+ const Index nr = (j!=np-1 || nr_==0) ? NR : nr_;
+ T C_[MR*NR];
+
+ for (Index i=0; i<mp; ++i) {
+ const Index mr = (i!=mp-1 || mr_==0) ? MR : mr_;
+
+ if (mr==MR && nr==NR) {
+ ugemm<Index, T, TC, BlockSize>(kc, alpha,
+ &A[i*kc*MR], &B[j*kc*NR],
+ beta,
+ &C[i*MR*incRowC+j*NR*incColC],
+ incRowC, incColC);
+ } else {
+ std::fill_n(C_, MR*NR, T(0));
+ ugemm<Index, T, TC, BlockSize>(kc, alpha,
+ &A[i*kc*MR], &B[j*kc*NR],
+ T(0),
+ C_, NR, Index(1));
+ gescal(mr, nr, beta,
+ &C[i*MR*incRowC+j*NR*incColC],
+ incRowC, incColC);
+ geaxpy(mr, nr, T(1), C_, NR, Index(1),
+ &C[i*MR*incRowC+j*NR*incColC],
+ incRowC, incColC);
+ }
+ }
+ }
+ }
+
+ //-- Packing blocks ------------------------------------------------------------
+ template <typename E, typename T, typename BlockSize>
+ void
+ pack_A(const matrix_expression<E> &A, T *p)
+ {
+ typedef typename E::size_type size_type;
+
+ const size_type mc = A ().size1();
+ const size_type kc = A ().size2();
+ static const size_type MR = BlockSize::mr;
+ const size_type mp = (mc+MR-1) / MR;
+
+ for (size_type j=0; j<kc; ++j) {
+ for (size_type l=0; l<mp; ++l) {
+ for (size_type i0=0; i0<MR; ++i0) {
+ size_type i = l*MR + i0;
+ size_type nu = l*MR*kc + j*MR + i0;
+ p[nu] = (i<mc) ? A()(i,j) : T(0);
+ }
+ }
+ }
+ }
+
+ template <typename E, typename T, typename BlockSize>
+ void
+ pack_B(const matrix_expression<E> &B, T *p)
+ {
+ typedef typename E::size_type size_type;
+
+ const size_type kc = B ().size1();
+ const size_type nc = B ().size2();
+ static const size_type NR = BlockSize::nr;
+ const size_type np = (nc+NR-1) / NR;
+
+ for (size_type l=0; l<np; ++l) {
+ for (size_type j0=0; j0<NR; ++j0) {
+ for (size_type i=0; i<kc; ++i) {
+ size_type j = l*NR+j0;
+ size_type nu = l*NR*kc + i*NR + j0;
+ p[nu] = (j<nc) ? B()(i,j) : T(0);
+ }
+ }
+ }
+ }
+
+ //-- Frame routine -------------------------------------------------------------
+ template <typename E1, typename E2, typename E3, typename BlockSize>
+ void
+ gemm(typename E3::value_type alpha, const matrix_expression<E1> &e1,
+ const matrix_expression<E2> &e2,
+ typename E3::value_type beta, matrix_expression<E3> &e3)
+ {
+ typedef typename E3::size_type size_type;
+ typedef typename E1::value_type value_type1;
+ typedef typename E2::value_type value_type2;
+ typedef typename E3::value_type value_type3;
+ typedef typename common_type<value_type1, value_type2>::type value_type_i;
+
+ static const size_type MC = BlockSize::mc;
+ static const size_type NC = BlockSize::nc;
+
+ const size_type m = BOOST_UBLAS_SAME (e3 ().size1 (), e1 ().size1 ());
+ const size_type n = BOOST_UBLAS_SAME (e3 ().size2 (), e2 ().size2 ());
+ const size_type k = BOOST_UBLAS_SAME (e1 ().size2 (), e2 ().size1 ());
+
+ static const size_type KC = BlockSize::kc;
+ const size_type mb = (m+MC-1) / MC;
+ const size_type nb = (n+NC-1) / NC;
+ const size_type kb = (k+KC-1) / KC;
+ const size_type mc_ = m % MC;
+ const size_type nc_ = n % NC;
+ const size_type kc_ = k % KC;
+
+ value_type3 *C_ = &e3()(0,0);
+ const size_type incRowC = &e3()(1,0) - &e3()(0,0);
+ const size_type incColC = &e3()(0,1) - &e3()(0,0);
+ value_type_i *A =
+ static_cast<value_type_i *>(boost::alignment::aligned_alloc(BlockSize::align,
+ sizeof(value_type_i) * MC*KC));
+ value_type_i *B =
+ static_cast<value_type_i *>(boost::alignment::aligned_alloc(BlockSize::align,
+ sizeof(value_type_i) * KC*NC));
+
+ if (alpha==value_type3(0) || k==0) {
+ gescal(beta, e3);
+ return;
+ }
+
+ for (size_type j=0; j<nb; ++j) {
+ size_type nc = (j!=nb-1 || nc_==0) ? NC : nc_;
+
+ for (size_type l=0; l<kb; ++l) {
+ size_type kc = (l!=kb-1 || kc_==0) ? KC : kc_;
+ value_type3 beta_ = (l==0) ? beta : value_type3(1);
+
+ const matrix_range<const E2> Bs = subrange(e2(), l*KC, l*KC+kc, j*NC, j*NC+nc);
+ pack_B<matrix_range<const E2>, value_type_i, BlockSize>(Bs, B);
+
+ for (size_type i=0; i<mb; ++i) {
+ size_type mc = (i!=mb-1 || mc_==0) ? MC : mc_;
+
+ const matrix_range<const E1> As = subrange(e1(), i*MC, i*MC+mc, l*KC, l*KC+kc);
+ pack_A<matrix_range<const E1>, value_type_i, BlockSize>(As, A);
+
+ mgemm<size_type, value_type_i, value_type3, BlockSize>(mc, nc, kc, alpha, A, B, beta_,
+ &C_[i*MC*incRowC+j*NC*incColC],
+ incRowC, incColC);
+ }
+ }
+ }
+ boost::alignment::aligned_free(A);
+ boost::alignment::aligned_free(B);
+ }
+}}}}
+#endif
diff --git a/include/boost/numeric/ublas/matrix_expression.hpp b/include/boost/numeric/ublas/matrix_expression.hpp
index a363130..22bdb44 100644
--- a/include/boost/numeric/ublas/matrix_expression.hpp
+++ b/include/boost/numeric/ublas/matrix_expression.hpp
@@ -14,6 +14,7 @@
#define _BOOST_UBLAS_MATRIX_EXPRESSION_
#include <boost/numeric/ublas/vector_expression.hpp>
+#include <boost/numeric/ublas/detail/gemm.hpp>
// Expression templates based on ideas of Todd Veldhuizen and Geoffrey Furnish
// Iterators based on ideas of Jeremy Siek
@@ -5460,20 +5461,40 @@ namespace boost { namespace numeric { namespace ublas {
expression2_closure_type e2_;
};
+ namespace detail {
+ template<class E1, class E2, class P, bool s>
+ struct binary_calculate_result_type;
+
+ template<class E1, class E2, class P>
+ struct binary_calculate_result_type<E1, E2, P, false> {
+ typedef matrix_matrix_binary<E1, E2, matrix_matrix_prod<E1, E2, P> > result_type;
+ };
+
+ // TODO: should elaborate on this for some dense types.
+ template<class E1, class E2, class P>
+ struct binary_calculate_result_type<E1, E2, P, true> {
+ typedef matrix<P> result_type;
+ };
+ }
+
template<class T1, class E1, class T2, class E2>
struct matrix_matrix_binary_traits {
- typedef unknown_storage_tag storage_category;
+ // typedef unknown_storage_tag storage_category;
+ typedef typename storage_restrict_traits<typename E1::storage_category, typename E2::storage_category>::storage_category storage_category;
typedef unknown_orientation_tag orientation_category;
typedef typename promote_traits<T1, T2>::promote_type promote_type;
typedef matrix_matrix_binary<E1, E2, matrix_matrix_prod<E1, E2, promote_type> > expression_type;
#ifndef BOOST_UBLAS_SIMPLE_ET_DEBUG
- typedef expression_type result_type;
+ // typedef expression_type result_type;
+ typedef typename detail::binary_calculate_result_type<E1, E2, promote_type, boost::is_base_of<dense_proxy_tag, storage_category>::value>::result_type result_type;
#else
typedef typename E1::matrix_temporary_type result_type;
#endif
};
- template<class E1, class E2>
+ template<class E1, class E2,
+ typename B = detail::prod_block_size<typename common_type<typename E1::value_type,
+ typename E2::value_type>::type> >
BOOST_UBLAS_INLINE
typename matrix_matrix_binary_traits<typename E1::value_type, E1,
typename E2::value_type, E2>::result_type
@@ -5481,13 +5502,41 @@ namespace boost { namespace numeric { namespace ublas {
const matrix_expression<E2> &e2,
unknown_storage_tag,
unknown_orientation_tag) {
+
typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1,
typename E2::value_type, E2>::expression_type expression_type;
return expression_type (e1 (), e2 ());
}
+ template<class E1, class E2,
+ typename B = detail::prod_block_size<typename common_type<typename E1::value_type,
+ typename E2::value_type>::type> >
+ BOOST_UBLAS_INLINE
+ typename matrix_matrix_binary_traits<typename E1::value_type, E1,
+ typename E2::value_type, E2>::result_type
+ prod (const matrix_expression<E1> &e1,
+ const matrix_expression<E2> &e2,
+ dense_proxy_tag,
+ unknown_orientation_tag) {
+ typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1,
+ typename E2::value_type, E2>::expression_type expression_type;
+ typedef typename E1::matrix_temporary_type result_type;
+ typedef typename result_type::value_type result_value;
+
+ if (e1 ().size1() < B::limit || e2 ().size2() < B::limit) {
+ return expression_type (e1 (), e2 ());
+ } else {
+ result_type rv(e1 ().size1(), e2 ().size2());
+ detail::gemm<E1, E2, result_type, B>(result_value(1), e1, e2,
+ result_value(0), rv);
+ return rv;
+ }
+ }
+
// Dispatcher
- template<class E1, class E2>
+ template<class E1, class E2,
+ typename B = detail::prod_block_size<typename common_type<typename E1::value_type,
+ typename E2::value_type>::type> >
BOOST_UBLAS_INLINE
typename matrix_matrix_binary_traits<typename E1::value_type, E1,
typename E2::value_type, E2>::result_type
@@ -5498,7 +5547,7 @@ namespace boost { namespace numeric { namespace ublas {
typename E2::value_type, E2>::storage_category storage_category;
typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1,
typename E2::value_type, E2>::orientation_category orientation_category;
- return prod (e1, e2, storage_category (), orientation_category ());
+ return prod<E1, E2, B> (e1, e2, storage_category (), orientation_category ());
}
template<class E1, class E2>
diff --git a/include/boost/numeric/ublas/operation.hpp b/include/boost/numeric/ublas/operation.hpp
index 64657cc..80bfab6 100644
--- a/include/boost/numeric/ublas/operation.hpp
+++ b/include/boost/numeric/ublas/operation.hpp
@@ -14,7 +14,7 @@
#define _BOOST_UBLAS_OPERATION_
#include <boost/numeric/ublas/matrix_proxy.hpp>
-
+#include <boost/numeric/ublas/detail/gemm.hpp>
/** \file operation.hpp
* \brief This file contains some specialized products.
*/
@@ -637,13 +637,45 @@ namespace boost { namespace numeric { namespace ublas {
return m;
}
- // Dispatcher
- template<class M, class E1, class E2, class TRI>
+ template<class M, class E1, class E2,
+ typename B = detail::prod_block_size<typename common_type<typename E1::value_type,
+ typename E2::value_type>::type> >
+ BOOST_UBLAS_INLINE
+ M&
+ axpy_prod (const matrix_expression<E1> &e1,
+ const matrix_expression<E2> &e2,
+ M &m, full,
+ dense_proxy_tag, bool init = true)
+ {
+ typedef typename M::value_type value_type;
+
+ if (m.size1() < B::limit || m.size2() < B::limit) {
+ typedef typename M::storage_category storage_category;
+ typedef typename M::orientation_category orientation_category;
+
+ if (init)
+ m.assign (zero_matrix<value_type> (e1 ().size1 (), e2 ().size2 ()));
+ return axpy_prod (e1, e2, m, full (), storage_category (),
+ orientation_category ());
+ } else {
+
+
+ detail::gemm<E1, E2, M, B>(value_type(1), e1, e2,
+ value_type(init? 0 : 1), m);
+ return m;
+ }
+ }
+
+ // Dispatchers
+ template<class M, class E1, class E2, class TRI, class S,
+ typename B = detail::prod_block_size<typename common_type<typename E1::value_type,
+ typename E2::value_type>::type> >
BOOST_UBLAS_INLINE
M &
axpy_prod (const matrix_expression<E1> &e1,
const matrix_expression<E2> &e2,
- M &m, TRI, bool init = true) {
+ M &m, TRI,
+ S, bool init = true) {
typedef typename M::value_type value_type;
typedef typename M::storage_category storage_category;
typedef typename M::orientation_category orientation_category;
@@ -651,9 +683,31 @@ namespace boost { namespace numeric { namespace ublas {
if (init)
m.assign (zero_matrix<value_type> (e1 ().size1 (), e2 ().size2 ()));
- return axpy_prod (e1, e2, m, triangular_restriction (), storage_category (), orientation_category ());
+
+ return axpy_prod (e1, e2, m, triangular_restriction (), storage_category (),
+ orientation_category ());
}
- template<class M, class E1, class E2, class TRI>
+
+
+ template<class M, class E1, class E2, class TRI,
+ typename B = detail::prod_block_size<typename common_type<typename E1::value_type,
+ typename E2::value_type>::type> >
+ BOOST_UBLAS_INLINE
+ M &
+ axpy_prod (const matrix_expression<E1> &e1,
+ const matrix_expression<E2> &e2,
+ M &m, TRI, bool init = true) {
+ typedef typename M::value_type value_type;
+ typedef typename M::storage_category storage_category;
+ typedef typename M::orientation_category orientation_category;
+ typedef TRI triangular_restriction;
+
+ return axpy_prod<M, E1, E2, TRI, B> (e1, e2, m, triangular_restriction (),
+ storage_category (), init);
+ }
+ template<class M, class E1, class E2, class TRI,
+ typename B = detail::prod_block_size<typename common_type<typename E1::value_type,
+ typename E2::value_type>::type> >
BOOST_UBLAS_INLINE
M
axpy_prod (const matrix_expression<E1> &e1,
@@ -663,7 +717,8 @@ namespace boost { namespace numeric { namespace ublas {
typedef TRI triangular_restriction;
matrix_type m (e1 ().size1 (), e2 ().size2 ());
- return axpy_prod (e1, e2, m, triangular_restriction (), true);
+ return axpy_prod<M, E1, E2, TRI, B> (e1, e2, m, triangular_restriction (),
+ true);
}
/** \brief computes <tt>M += A X</tt> or <tt>M = A X</tt> in an
@@ -690,7 +745,9 @@ namespace boost { namespace numeric { namespace ublas {
\param E1 type of a matrix expression \c A
\param E2 type of a matrix expression \c X
*/
- template<class M, class E1, class E2>
+ template<class M, class E1, class E2,
+ typename B = detail::prod_block_size<typename common_type<typename E1::value_type,
+ typename E2::value_type>::type> >
BOOST_UBLAS_INLINE
M &
axpy_prod (const matrix_expression<E1> &e1,
@@ -700,11 +757,15 @@ namespace boost { namespace numeric { namespace ublas {
typedef typename M::storage_category storage_category;
typedef typename M::orientation_category orientation_category;
- if (init)
- m.assign (zero_matrix<value_type> (e1 ().size1 (), e2 ().size2 ()));
- return axpy_prod (e1, e2, m, full (), storage_category (), orientation_category ());
+ // if (init)
+ // m.assign (zero_matrix<value_type> (e1 ().size1 (), e2 ().size2 ()));
+ return axpy_prod<M, E1, E2, B> (e1, e2, m, full (), storage_category (),
+ init);
}
- template<class M, class E1, class E2>
+
+ template<class M, class E1, class E2,
+ typename B = detail::prod_block_size<typename common_type<typename E1::value_type,
+ typename E2::value_type>::type> >
BOOST_UBLAS_INLINE
M
axpy_prod (const matrix_expression<E1> &e1,
@@ -712,7 +773,7 @@ namespace boost { namespace numeric { namespace ublas {
typedef M matrix_type;
matrix_type m (e1 ().size1 (), e2 ().size2 ());
- return axpy_prod (e1, e2, m, full (), true);
+ return axpy_prod<M, E1, E2, B> (e1, e2, m, full (), true);
}
--
1.9.1