$include_dir="/home/hyper-archives/ublas/include"; include("$include_dir/msg-header.inc") ?>
Subject: Re: [ublas] Matrix multiplication performance
From: palik imre (imre_palik_at_[hidden])
Date: 2016-02-03 14:53:45
Hi Michael,
first of all, the link on your page to test_ublas.tgz is still pointing to session4.tgz
I also realised, that we start to become memory bound.
I am getting approximately 10% improvement  with the following setup on my old AMD box:
template <typename Index, typename T>
typename std::enable_if<std::is_floating_point<T>::value,
void>::type
ugemm(Index kc, T alpha,
               const T *A, const T *B,
     T beta,
     T *C, Index incRowC, Index incColC)
{
   A = (const T*) __builtin_assume_aligned (A, 128);
   B = (const T*) __builtin_assume_aligned (B, 128);
   
   static const unsigned div = 32/sizeof(T);
   static const Index MR = BlockSize<T>::MR;
   static const Index NR = BlockSize<T>::NR/div;
   typedef T vx __attribute__((vector_size (32)));
   vx P[MR*NR] __attribute__ ((aligned (128))) = {};
   const vx *B_ = (const vx *)B;
   for (Index l=0; l<kc; ++l) {
     if (!(kc&3))
           __builtin_prefetch(A + 16, 0);
       for (Index i=0; i<MR; ++i) {
           for (Index j=0; j<(NR); ++j) {
               P[i * NR + j] += A[i]*B_[j];
           }
       }
       A += MR;
       B_ += NR;
   }
   T *P_ = (T *)P;
   for (Index j=0; j<NR*div; ++j) {
       for (Index i=0; i<MR; ++i) {
           C[i*incRowC+j*incColC] *= beta;
           C[i*incRowC+j*incColC] += alpha*P_[i * NR * div + j];
       }
   }
}
// Macro-kernel: updates an mc x nc block of C with the product of the
// packed buffers A (mc x kc) and B (kc x nc):  C := beta*C + alpha*A*B.
// A and B are laid out as consecutive MR x kc / kc x NR panels, so the
// panel for tile (i,j) starts at A[i*kc*MR] / B[j*kc*NR].
template <typename Index, typename T, typename Beta, typename TC>
void
mgemm(Index mc, Index nc, Index kc,
      T alpha,
      const T *A, const T *B,
      Beta beta,
      TC *C, Index incRowC, Index incColC)
{
    const Index MR = BlockSize<T>::MR;
    const Index NR = BlockSize<T>::NR;
    // Number of row/column panels, rounded up to cover ragged edges.
    const Index mp  = (mc+MR-1) / MR;
    const Index np  = (nc+NR-1) / NR;
    // Sizes of the last (partial) panels; 0 when MR/NR divide evenly.
    const Index mr_ = mc % MR;
    const Index nr_ = nc % NR;
    #if defined(_OPENMP)
    #pragma omp parallel for
    #endif
    for (Index j=0; j<np; ++j) {
        const Index nr = (j!=np-1 || nr_==0) ? NR : nr_;
        // Scratch tile, declared inside the parallel loop so each
        // OpenMP thread gets its own private copy.
        T C_[BlockSize<T>::MR*BlockSize<T>::NR];
        __builtin_prefetch(B + j * kc * NR, 0);
        __builtin_prefetch(B + j * kc * NR + 16, 0);
        for (Index i=0; i<mp; ++i) {
            const Index mr = (i!=mp-1 || mr_==0) ? MR : mr_;
            __builtin_prefetch(A + i * kc * MR, 0);
            if (mr==MR && nr==NR) {
                // Full tile: micro-kernel updates C in place.
                ugemm(kc, alpha,
                      &A[i*kc*MR], &B[j*kc*NR],
                      beta,
                      &C[i*MR*incRowC+j*NR*incColC],
                      incRowC, incColC);
            } else {
                // Partial (edge) tile: compute alpha*A*B into the
                // full-size scratch buffer (column-major: incRow=1,
                // incCol=MR), then scale C by beta and add only the
                // valid mr x nr portion.
                std::fill_n(C_, MR*NR, T(0));
                ugemm(kc, alpha,
                      &A[i*kc*MR], &B[j*kc*NR],
                      T(0),
                      C_, Index(1), MR);
                gescal(mr, nr, beta,
                       &C[i*MR*incRowC+j*NR*incColC],
                       incRowC, incColC);
                geaxpy(mr, nr, T(1), C_, Index(1), MR,
                       &C[i*MR*incRowC+j*NR*incColC],
                       incRowC, incColC);
            }
        }
    }
}
MR=4, NR=8, KC=1024
Compile:
g++ -std=gnu++11 -Wall -W -mavx -Ofast -DNDEBUG -DM_MAX=1500 -g -o matprod5 matprod5.cc 
Baseline:
#   m     n     k  uBLAS:   t1       MFLOPS   Blocked:   t2      MFLOPS        Diff nrm1
  100   100   100   0.00172933      1156.52      0.00031314      6386.92               0
  200   200   200    0.0051078      3132.46      0.00181692      8806.12               0
  300   300   300    0.0166265      3247.83      0.00569728       9478.2               0
  400   400   400    0.0409126      3128.62       0.0134713      9501.71               0
  500   500   500    0.0908395      2752.11       0.0242485      10309.9               0
  600   600   600     0.187145      2308.37       0.0376499      11474.1               0
  700   700   700     0.306176      2240.54       0.0588034        11666               0
  800   800   800     0.463066      2211.35       0.0878199      11660.2               0
  900   900   900     0.867082       1681.5        0.125935      11577.4               0
 1000  1000  1000      1.20465      1660.24        0.171337      11672.9               0
 1100  1100  1100      1.63398      1629.15        0.230289      11559.4     3.76457e-19
 1200  1200  1200      2.09053      1653.17         0.29553      11694.2     3.78417e-19
 1300  1300  1300        2.672      1644.46        0.376785      11661.8     3.19717e-19
 1400  1400  1400      3.32236      1651.84        0.461209      11899.2     2.57872e-19
 1500  1500  1500      4.09372      1648.87        0.567471      11894.9     2.05188e-19
with prefetch:
#   m     n     k  uBLAS:   t1       MFLOPS   Blocked:   t2      MFLOPS        Diff nrm1
  100   100   100   0.00197408      1013.13     0.000814521      2455.43               0
  200   200   200   0.00571277      2800.74      0.00206576      7745.32               0
  300   300   300    0.0170868      3160.33      0.00547799      9857.63               0
  400   400   400    0.0401293      3189.69       0.0117615      10882.9               0
  500   500   500    0.0888678      2813.17       0.0220389      11343.6               0
  600   600   600     0.188413      2292.84       0.0361003      11966.7               0
  700   700   700      0.31356      2187.78       0.0561651        12214               0
  800   800   800     0.457264      2239.41       0.0836534        12241               0
  900   900   900     0.869343      1677.13        0.117946      12361.6               0
 1000  1000  1000      1.25295      1596.23        0.160194      12484.9               0
 1100  1100  1100      1.62086      1642.34        0.220637      12065.1     3.75611e-19
 1200  1200  1200      2.07918       1662.2        0.279095      12382.9      3.7818e-19
 1300  1300  1300      2.66945      1646.03        0.352817      12454.1     3.19286e-19
 1400  1400  1400      3.30731      1659.35        0.429694      12771.9     2.58137e-19
 1500  1500  1500      4.09499      1648.36        0.527458      12797.2        2.05e-19
--------------------------------------------
On Tue, 2/2/16, Michael Lehn <michael.lehn_at_[hidden]> wrote:
 Subject: Re: [ublas] Matrix multiplication performance
 To: "ublas mailing list" <ublas_at_[hidden]>
 Cc: "palik imre" <imre_palik_at_[hidden]>
 Date: Tuesday, 2 February, 2016, 23:26
 
 I also
 started to write down some notes on the rational behind the
 micro-kernel:
 http://www.mathematik.uni-ulm.de/~lehn/test_ublas/session6/page01.html
 
 On 02 Feb 2016, at
 21:43, Michael Lehn <michael.lehn_at_[hidden]>
 wrote:
 (So the pages are re-done
 …)
 
 
 The bug
 is because NR is BlockSize<T>::NR divided by the
 vector length.  So this should work
 //---------------------------------------------------------------------------------template
 <typename Index, typename T>typename
 std::enable_if<std::is_floating_point<T>::value,void>::typeugemm(Index
 kc, T alpha,               const T *A,
 const T *B,     T beta,     T
 *C, Index incRowC, Index incColC){ 
 Â A = (const T*) __builtin_assume_aligned (A,
 128);   B = (const T*) __builtin_assume_aligned
 (B, 128);   static const unsigned div =
 32/sizeof(T);   static const Index MR =
 BlockSize<T>::MR;   static const Index NR
 = BlockSize<T>::NR/div;
 Â   typedef T vx
 __attribute__((vector_size (32)));
 Â   vx P[MR*NR] __attribute__
 ((aligned (128))) = {};   const vx *B_ = (vx
 *)B;   for (Index l=0; l<kc; ++l)
 {       for (Index i=0; i<MR; ++i)
 {           for (Index j=0; j<(NR); ++j)
 {               P[i * NR + j] +=
 A[i]*B_[j];           }     
 Â }       A += MR;       B_ +=
 NR;   }
 Â   T *P_ = (T *)P; 
 Â for (Index j=0; j<NR*div; ++j) {     
 Â for (Index i=0; i<MR; ++i) {         
 Â C[i*incRowC+j*incColC] *= beta;         
 Â C[i*incRowC+j*incColC] += alpha*P_[i * NR * div +
 j];       } 
 Â }}//---------------------------------------------------------------------------------
 However, as Karl pointed out it also
 should treat the case beta==0 as a special
 case.
 But I don't
 get any speedup.  For the performance relevant
 is mainly the loop
 Â   for (Index l=0; l<kc;
 ++l) {       for (Index i=0; i<MR; ++i)
 {           for (Index j=0; j<(NR); ++j)
 {               P[i * NR + j] +=
 A[i]*B_[j];           }     
 Â }       A += MR;       B_ +=
 NR;   }
 From my experience, only if this is
 optimized to the extreme a little bit extra
 performance can be achieved by treating some special cases
 for the update
 Â   T *P_ = (T
 *)P;   for (Index j=0; j<NR*div; ++j)
 {       for (Index i=0; i<MR; ++i)
 {           C[i*incRowC+j*incColC] *=
 beta;           C[i*incRowC+j*incColC] +=
 alpha*P_[i * NR * div + j];     
 Â }   }
 The main reason my code looks weird
 is that I also want to use it for teaching.  The
 way it is coded you easily can map it forth
 and back with the assembly code.
 
 new
 kernel:
 
 template
 <typename Index, typename T>
 typename
 std::enable_if<std::is_floating_point<T>::value,
 void>::type
 ugemm(Index kc,
 T alpha,
 Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â const
 T *A, const T *B,
 Â Â Â Â Â T beta,
 Â Â Â Â Â T *C, Index incRowC, Index
 incColC)
 {
 Â Â Â A = (const
 T*) __builtin_assume_aligned (A, 128);
 Â Â Â B = (const T*) __builtin_assume_aligned
 (B, 128);
 Â Â Â static const unsigned div =
 32/sizeof(T);
 Â Â Â static const Index MR =
 BlockSize<T>::MR;
 Â Â Â static const
 Index NR = BlockSize<T>::NR/div;
 
 Â Â Â typedef T vx __attribute__((vector_size
 (32)));
 
 Â Â Â vx P[MR*NR]
 __attribute__ ((aligned (128))) = {};
 Â Â Â const vx *B_ = (vx *)B;
 Â Â Â for (Index l=0; l<kc; ++l) {
 Â Â Â Â Â Â Â for (Index i=0; i<MR; ++i)
 {
 Â Â Â Â Â Â Â Â Â Â Â for (Index j=0;
 j<(NR); ++j) {
 Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â P[i * NR + j] +=
 A[i]*B_[j];
 Â Â Â Â Â Â Â Â Â Â Â }
 Â Â Â Â Â Â Â }
 Â Â Â Â Â Â Â A
 += MR;
 Â Â Â Â Â Â Â B_ += NR;
 Â Â Â }
 
 Â Â Â T *P_ = (T *)P;
 Â Â Â for
 (Index j=0; j<NR; ++j) {
 Â Â Â Â Â Â Â for (Index i=0; i<MR; ++i)
 {
 Â Â Â Â Â Â Â Â Â Â Â C[i*incRowC+j*incColC] *=
 beta;
 Â Â Â Â Â Â Â Â Â Â Â C[i*incRowC+j*incColC] +=
 alpha*P_[i * NR + j];
 Â Â Â Â Â Â Â }
 Â Â Â }
 }
 
 _______________________________________________
 ublas mailing list
 ublas_at_[hidden]
 http://listarchives.boost.org/mailman/listinfo.cgi/ublas
 Sent to: michael.lehn_at_[hidden]