$include_dir="/home/hyper-archives/ublas/include"; include("$include_dir/msg-header.inc") ?>
Subject: [ublas] [PATCH 2/3] boost::ublas: gcc support for optimal matrix multiplication
From: Imre Palik (imre_palik_at_[hidden])
Date: 2016-02-29 02:46:37
This patch contains an optimised, vectorised kernel, using gcc's
SIMD vectors. This reaches matrix multiplication speeds
comparable to hand-crafted assembly kernels on x86 Haswell
microarchitecture.
The patch also contains optimised kerenl parameters for float,
double, and long double.
Signed-off-by: Imre Palik <imrep_at_[hidden]>
Cc: Michael Lehn <michael.lehn_at_[hidden]>
---
include/boost/numeric/ublas/detail/block_sizes.hpp | 78 +++++++
include/boost/numeric/ublas/detail/gemm.hpp | 223 +++++++++++----------
include/boost/numeric/ublas/detail/vector.hpp | 27 +++
include/boost/numeric/ublas/matrix_expression.hpp | 1 +
include/boost/numeric/ublas/operation.hpp | 1 +
5 files changed, 223 insertions(+), 107 deletions(-)
create mode 100644 include/boost/numeric/ublas/detail/block_sizes.hpp
create mode 100644 include/boost/numeric/ublas/detail/vector.hpp
diff --git a/include/boost/numeric/ublas/detail/block_sizes.hpp b/include/boost/numeric/ublas/detail/block_sizes.hpp
new file mode 100644
index 0000000..a146f10
--- /dev/null
+++ b/include/boost/numeric/ublas/detail/block_sizes.hpp
@@ -0,0 +1,78 @@
+//
+// Copyright (c) 2016
+// Michael Lehn, Imre Palik
+//
+// Distributed under the Boost Software License, Version 1.0. (See
+// accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef _BOOST_UBLAS_BLOCK_SIZES_
+#define _BOOST_UBLAS_BLOCK_SIZES_
+
+#include <boost/numeric/ublas/detail/vector.hpp>
+
+namespace boost { namespace numeric { namespace ublas { namespace detail {
+
+ template <typename T>
+ struct prod_block_size {
+ static const unsigned vector_length =
+ _BOOST_UBLAS_VECTOR_SIZE > sizeof(T)? _BOOST_UBLAS_VECTOR_SIZE/sizeof(T) : 1; // Number of elements in a vector register
+ static const unsigned mc = 256;
+ static const unsigned kc = 512; // stripe length
+ static const unsigned nc = (4096/(3 * vector_length)) * (3 * vector_length);
+ static const unsigned mr = 4; // stripe width for lhs
+ static const unsigned nr = 3 * vector_length; // stripe width for rhs
+ static const unsigned align = 64; // align temporary arrays to this boundary
+ static const unsigned limit = 27; // Use gemm from this size
+ BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size.");
+ BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR.");
+ BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR.");
+ };
+
+ template <>
+ struct prod_block_size<float> {
+ static const unsigned mc = 256;
+ static const unsigned kc = 512; // stripe length
+ static const unsigned nc = 4096;
+ static const unsigned mr = 4; // stripe width for lhs
+ static const unsigned nr = 16; // stripe width for rhs
+ static const unsigned align = 64; // align temporary arrays to this boundary
+ static const unsigned limit = 27; // Use gemm from this size
+ static const unsigned vector_length = _BOOST_UBLAS_VECTOR_SIZE/sizeof(float); // Number of elements in a vector register
+ BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size.");
+ BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR.");
+ BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR.");
+ };
+
+ template <>
+ struct prod_block_size<long double> {
+ static const unsigned mc = 256;
+ static const unsigned kc = 512; // stripe length
+ static const unsigned nc = 4096;
+ static const unsigned mr = 1; // stripe width for lhs
+ static const unsigned nr = 4; // stripe width for rhs
+ static const unsigned align = 64; // align temporary arrays to this boundary
+ static const unsigned limit = 71; // Use gemm from this size
+ static const unsigned vector_length = 1; // Number of elements in a vector register
+ BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size.");
+ BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR.");
+ BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR.");
+ };
+
+ template<typename T>
+ struct is_blocksize {
+ struct fallback { static const int nr = 0; };
+ struct derived : T, fallback {};
+ template<int C1>
+ struct nonr {
+ static const bool value = false;
+ typedef false_type type;
+ };
+
+ template<typename C> static char (&f(typename nonr<C::nr>::type*))[1];
+ template<typename C> static char (&f(...))[2];
+
+ static bool const value = sizeof(f<derived>(0)) == 2;
+ };
+}}}}
+#endif
diff --git a/include/boost/numeric/ublas/detail/gemm.hpp b/include/boost/numeric/ublas/detail/gemm.hpp
index f66e80f..7fd0ab9 100644
--- a/include/boost/numeric/ublas/detail/gemm.hpp
+++ b/include/boost/numeric/ublas/detail/gemm.hpp
@@ -10,80 +10,26 @@
#define _BOOST_UBLAS_GEMM_
#include <boost/type_traits/common_type.hpp>
+#include <boost/type_traits/aligned_storage.hpp>
+#include <boost/type_traits/is_arithmetic.hpp>
#include <boost/align/aligned_alloc.hpp>
#include <boost/align/assume_aligned.hpp>
#include <boost/static_assert.hpp>
#include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/detail/vector.hpp>
+#include <boost/predef/compiler.h>
namespace boost { namespace numeric { namespace ublas { namespace detail {
- template <typename T>
- struct prod_block_size {
- static const unsigned mc = 256;
- static const unsigned kc = 512; // stripe length
- static const unsigned nc = 4092;
- static const unsigned mr = 4; // stripe width for lhs
- static const unsigned nr = 12; // stripe width for rhs
- static const unsigned align = 64; // align temporary arrays to this boundary
- static const unsigned limit = 26; // Use gemm from this size
- BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size.");
- BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR.");
- BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR.");
- };
-
- template <>
- struct prod_block_size<float> {
- static const unsigned mc = 256;
- static const unsigned kc = 512; // stripe length
- static const unsigned nc = 4080;
- static const unsigned mr = 4; // stripe width for lhs
- static const unsigned nr = 24; // stripe width for rhs
- static const unsigned align = 64; // align temporary arrays to this boundary
- static const unsigned limit = 26; // Use gemm from this size
- BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size.");
- BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR.");
- BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR.");
- };
-
- template <>
- struct prod_block_size<long double> {
- static const unsigned mc = 256;
- static const unsigned kc = 512; // stripe length
- static const unsigned nc = 4096;
- static const unsigned mr = 2; // stripe width for lhs
- static const unsigned nr = 2; // stripe width for rhs
- static const unsigned align = 64; // align temporary arrays to this boundary
- static const unsigned limit = 26; // Use gemm from this size
- BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size.");
- BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR.");
- BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR.");
- };
-
- template<typename T>
- struct is_blocksize {
- struct fallback { static const int nr = 0; };
- struct derived : T, fallback {};
- template<int C1>
- struct nonr {
- static const bool value = false;
- typedef false_type type;
- };
-
- template<typename C> static char (&f(typename nonr<C::nr>::type*))[1];
- template<typename C> static char (&f(...))[2];
-
- static bool const value = sizeof(f<derived>(0)) == 2;
- };
-
template <typename E>
void
gescal(const typename E::value_type &alpha, matrix_expression<E> &X)
{
typedef typename E::size_type size_type;
- for (size_type i=0; i<X().size1(); ++i) {
+ for (size_type i=0; i<X().size1(); ++i) {
for (size_type j=0; j<X().size2(); ++j) {
- X()(i,j) *= alpha;
+ X()(i,j) *= alpha;
}
}
}
@@ -114,9 +60,73 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
}
}
- //-- Micro Kernel --------------------------------------------------------------
- template <typename Index, typename T, typename TC, typename BlockSize>
+ //-- Micro Kernel ----------------------------------------------------------
+#if defined(BOOST_COMP_GNUC_DETECTION) \
+ && BOOST_COMP_GNUC_DETECTION >= BOOST_VERSION_NUMBER(4,8,0)
+ template <typename Index, typename T, typename TC,
+ typename BlockSize>
+ typename enable_if_c<is_arithmetic<T>::value
+ && (1 < BlockSize::vector_length),
+ void>::type
+ ugemm(Index kc, TC alpha, const T *A, const T *B,
+ TC beta, TC *C, Index incRowC, Index incColC)
+ {
+ BOOST_ALIGN_ASSUME_ALIGNED (A, BlockSize::align);
+ BOOST_ALIGN_ASSUME_ALIGNED (B, BlockSize::align);
+ static const unsigned vector_length = BlockSize::vector_length;
+ static const Index MR = BlockSize::mr;
+ static const Index NR = BlockSize::nr/vector_length;
+
+ typedef T vx __attribute__((vector_size (_BOOST_UBLAS_VECTOR_SIZE)));
+
+ vx P[MR*NR] = {};
+
+ const vx *b = (const vx *)B;
+ for (Index l=0; l<kc; ++l) {
+ for (Index i=0; i<MR; ++i) {
+ for (Index j=0; j<NR; ++j) {
+ P[i*NR+j] += A[i]*b[j];
+ }
+ }
+ A += MR;
+ b += NR;
+ }
+
+ if (alpha!=TC(1)) {
+ for (Index i=0; i<MR; ++i) {
+ for (Index j=0; j<NR; ++j) {
+ P[i*NR+j] *= alpha;
+ }
+ }
+ }
+
+ const T *p = (const T *) P;
+ if (beta == TC(0)) {
+ for (Index i=0; i<MR; ++i) {
+ for (Index j=0; j<NR * vector_length; ++j) {
+ C[i*incRowC+j*incColC] = p[i*NR * vector_length +j];
+ }
+ }
+ } else {
+ for (Index i=0; i<MR; ++i) {
+ for (Index j=0; j<NR * vector_length; ++j) {
+ C[i*incRowC+j*incColC] *= beta;
+ C[i*incRowC+j*incColC] += p[i*NR * vector_length+j];
+ }
+ }
+ }
+ }
+
+ template <typename Index, typename T, typename TC,
+ typename BlockSize>
+ typename enable_if_c<!is_arithmetic<T>::value
+ || (BlockSize::vector_length <= 1),
+ void>::type
+#else
+ template <typename Index, typename T, typename TC,
+ typename BlockSize>
void
+#endif
ugemm(Index kc, TC alpha, const T *A, const T *B,
TC beta, TC *C, Index incRowC, Index incColC)
{
@@ -124,47 +134,47 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
BOOST_ALIGN_ASSUME_ALIGNED(B, BlockSize::align);
static const Index MR = BlockSize::mr;
static const Index NR = BlockSize::nr;
- typename boost::aligned_storage<sizeof(T[MR*NR]),BlockSize::align>::type Pa;
- T *P = reinterpret_cast<T*>(Pa.address());
- for (unsigned c = 0; c < MR * NR; c++)
- P[c] = 0;
+ typename aligned_storage<sizeof(T[MR*NR]),BlockSize::align>::type Pa;
+ T *P = reinterpret_cast<T*>(Pa.address());
+ for (unsigned c = 0; c < MR * NR; c++)
+ P[c] = 0;
for (Index l=0; l<kc; ++l) {
- for (Index i=0; i<MR; ++i) {
+ for (Index i=0; i<MR; ++i) {
for (Index j=0; j<NR; ++j) {
P[i* NR+j] += A[i]*B[j];
}
}
- A += MR;
- B += NR;
+ A += MR;
+ B += NR;
+ }
+
+ if (alpha!=TC(1)) {
+ for (Index i=0; i<MR; ++i) {
+ for (Index j=0; j<NR; ++j) {
+ P[i*NR+j] *= alpha;
+ }
+ }
}
- if (alpha!=TC(1)) {
- for (Index i=0; i<MR; ++i) {
- for (Index j=0; j<NR; ++j) {
- P[i*NR+j] *= alpha;
- }
- }
- }
-
- if (beta == TC(0)) {
- for (Index i=0; i<MR; ++i) {
- for (Index j=0; j<NR; ++j) {
+ if (beta == TC(0)) {
+ for (Index i=0; i<MR; ++i) {
+ for (Index j=0; j<NR; ++j) {
C[i*incRowC+j*incColC] = P[i*NR+j];
- }
- }
- } else {
- for (Index i=0; i<MR; ++i) {
- for (Index j=0; j<NR; ++j) {
+ }
+ }
+ } else {
+ for (Index i=0; i<MR; ++i) {
+ for (Index j=0; j<NR; ++j) {
C[i*incRowC+j*incColC] *= beta;
C[i*incRowC+j*incColC] += P[i*NR+j];
- }
- }
- }
+ }
+ }
+ }
}
- //-- Macro Kernel --------------------------------------------------------------
- template <typename Index, typename T, typename TC, typename BlockSize>
+ //-- Macro Kernel ----------------------------------------------------------
+ template <typename Index, typename T, typename TC, typename BlockSize>
void
mgemm(Index mc, Index nc, Index kc, TC alpha,
const T *A, const T *B, TC beta,
@@ -181,9 +191,6 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
// #pragma omp parallel for
// #endif
for (Index j=0; j<np; ++j) {
- // __builtin_prefetch(B + j * kc * NR, 0);
- // __builtin_prefetch(B + j * kc * NR + 8, 0);
- // __builtin_prefetch(B + j * kc * NR + 16, 0);
const Index nr = (j!=np-1 || nr_==0) ? NR : nr_;
T C_[MR*NR];
@@ -191,7 +198,7 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
const Index mr = (i!=mp-1 || mr_==0) ? MR : mr_;
if (mr==MR && nr==NR) {
- ugemm<Index, T, TC, BlockSize>(kc, alpha,
+ ugemm<Index, T, TC, BlockSize>(kc, alpha,
&A[i*kc*MR], &B[j*kc*NR],
beta,
&C[i*MR*incRowC+j*NR*incColC],
@@ -214,11 +221,12 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
}
//-- Packing blocks ------------------------------------------------------------
- template <typename E, typename T, typename BlockSize>
+ template <typename E, typename T, typename BlockSize>
void
pack_A(const matrix_expression<E> &A, T *p)
{
typedef typename E::size_type size_type;
+ BOOST_ALIGN_ASSUME_ALIGNED(p, BlockSize::align);
const size_type mc = A ().size1();
const size_type kc = A ().size2();
@@ -236,11 +244,12 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
}
}
- template <typename E, typename T, typename BlockSize>
+ template <typename E, typename T, typename BlockSize>
void
pack_B(const matrix_expression<E> &B, T *p)
{
typedef typename E::size_type size_type;
+ BOOST_ALIGN_ASSUME_ALIGNED(p, BlockSize::align);
const size_type kc = B ().size1();
const size_type nc = B ().size2();
@@ -262,9 +271,9 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
template <typename E1, typename E2, typename E3, typename BlockSize>
void
gemm(typename E3::value_type alpha, const matrix_expression<E1> &e1,
- const matrix_expression<E2> &e2,
+ const matrix_expression<E2> &e2,
typename E3::value_type beta, matrix_expression<E3> &e3,
- BlockSize)
+ BlockSize)
{
typedef typename E3::size_type size_type;
typedef typename E1::value_type value_type1;
@@ -276,7 +285,7 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
static const size_type NC = BlockSize::nc;
const size_type m = BOOST_UBLAS_SAME (e3 ().size1 (), e1 ().size1 ());
- const size_type n = BOOST_UBLAS_SAME (e3 ().size2 (), e2 ().size2 ());
+ const size_type n = BOOST_UBLAS_SAME (e3 ().size2 (), e2 ().size2 ());
const size_type k = BOOST_UBLAS_SAME (e1 ().size2 (), e2 ().size1 ());
static const size_type KC = BlockSize::kc;
@@ -291,11 +300,11 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
const size_type incRowC = &e3()(1,0) - &e3()(0,0);
const size_type incColC = &e3()(0,1) - &e3()(0,0);
value_type_i *A =
- static_cast<value_type_i *>(boost::alignment::aligned_alloc(BlockSize::align,
- sizeof(value_type_i) * MC*KC));
+ static_cast<value_type_i *>(::boost::alignment::aligned_alloc(BlockSize::align,
+ sizeof(value_type_i) * MC*KC));
value_type_i *B =
- static_cast<value_type_i *>(boost::alignment::aligned_alloc(BlockSize::align,
- sizeof(value_type_i) * KC*NC));
+ static_cast<value_type_i *>(::boost::alignment::aligned_alloc(BlockSize::align,
+ sizeof(value_type_i) * KC*NC));
if (alpha==value_type3(0) || k==0) {
gescal(beta, e3);
@@ -319,13 +328,13 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
pack_A<matrix_range<const E1>, value_type_i, BlockSize>(As, A);
mgemm<size_type, value_type_i, value_type3, BlockSize>(mc, nc, kc, alpha, A, B, beta_,
- &C_[i*MC*incRowC+j*NC*incColC],
- incRowC, incColC);
+ &C_[i*MC*incRowC+j*NC*incColC],
+ incRowC, incColC);
}
}
}
- boost::alignment::aligned_free(A);
- boost::alignment::aligned_free(B);
+ ::boost::alignment::aligned_free(A);
+ ::boost::alignment::aligned_free(B);
}
}}}}
#endif
diff --git a/include/boost/numeric/ublas/detail/vector.hpp b/include/boost/numeric/ublas/detail/vector.hpp
new file mode 100644
index 0000000..1681eda
--- /dev/null
+++ b/include/boost/numeric/ublas/detail/vector.hpp
@@ -0,0 +1,27 @@
+//
+// Copyright (c) 2016
+// Imre Palik
+//
+// Distributed under the Boost Software License, Version 1.0. (See
+// accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef _BOOST_UBLAS_DETAIL_VECTOR_
+#define _BOOST_UBLAS_DETAIL_VECTOR_
+#include <boost/predef/hardware/simd.h>
+
+#if defined(BOOST_HW_SIMD) && (BOOST_HW_SIMD == BOOST_HW_SIMD_X86 || BOOST_HW_SIMD == BOOST_HW_SIMD_X86_AMD)
+#if BOOST_HW_SIMD >= BOOST_HW_SIMD_X86_AVX_VERSION
+#define _BOOST_UBLAS_VECTOR_SIZE 32
+#elif BOOST_HW_SIMD >= BOOST_HW_SIMD_X86_SSE_VERSION
+#define _BOOST_UBLAS_VECTOR_SIZE 16
+#else
+#define _BOOST_UBLAS_VECTOR_SIZE 8
+#endif
+#endif
+
+#ifndef _BOOST_UBLAS_VECTOR_SIZE
+#define _BOOST_UBLAS_VECTOR_SIZE 1
+#endif
+
+#endif
diff --git a/include/boost/numeric/ublas/matrix_expression.hpp b/include/boost/numeric/ublas/matrix_expression.hpp
index 78daf2b..c4e0ac8 100644
--- a/include/boost/numeric/ublas/matrix_expression.hpp
+++ b/include/boost/numeric/ublas/matrix_expression.hpp
@@ -15,6 +15,7 @@
#include <boost/numeric/ublas/vector_expression.hpp>
#include <boost/numeric/ublas/detail/gemm.hpp>
+#include <boost/numeric/ublas/detail/block_sizes.hpp>
// Expression templates based on ideas of Todd Veldhuizen and Geoffrey Furnish
// Iterators based on ideas of Jeremy Siek
diff --git a/include/boost/numeric/ublas/operation.hpp b/include/boost/numeric/ublas/operation.hpp
index 393a9bd..8fd3b63 100644
--- a/include/boost/numeric/ublas/operation.hpp
+++ b/include/boost/numeric/ublas/operation.hpp
@@ -15,6 +15,7 @@
#include <boost/numeric/ublas/matrix_proxy.hpp>
#include <boost/numeric/ublas/detail/gemm.hpp>
+#include <boost/numeric/ublas/detail/block_sizes.hpp>
/** \file operation.hpp
* \brief This file contains some specialized products.
*/
--
1.9.1