$include_dir="/home/hyper-archives/ublas/include"; include("$include_dir/msg-header.inc") ?>
Subject: Re: [ublas] Matrix multiplication performance
From: palik imre (imre_palik_at_[hidden])
Date: 2016-02-03 14:53:45
Hi Michael,
first of all, the link on your page to test_ublas.tgz is still pointing to session4.tgz
I also realised, that we start to become memory bound.
I am getting approximately 10% improvement  with the following setup on my old AMD box:
template <typename Index, typename T>
typename std::enable_if<std::is_floating_point<T>::value,
void>::type
ugemm(Index kc, T alpha,
               const T *A, const T *B,
     T beta,
     T *C, Index incRowC, Index incColC)
{
   A = (const T*) __builtin_assume_aligned (A, 128);
   B = (const T*) __builtin_assume_aligned (B, 128);
   
   static const unsigned div = 32/sizeof(T);
   static const Index MR = BlockSize<T>::MR;
   static const Index NR = BlockSize<T>::NR/div;
   typedef T vx __attribute__((vector_size (32)));
   vx P[MR*NR] __attribute__ ((aligned (128))) = {};
   const vx *B_ = (const vx *)B;
   for (Index l=0; l<kc; ++l) {
     if (!(kc&3))
           __builtin_prefetch(A + 16, 0);
       for (Index i=0; i<MR; ++i) {
           for (Index j=0; j<(NR); ++j) {
               P[i * NR + j] += A[i]*B_[j];
           }
       }
       A += MR;
       B_ += NR;
   }
   T *P_ = (T *)P;
   for (Index j=0; j<NR*div; ++j) {
       for (Index i=0; i<MR; ++i) {
           C[i*incRowC+j*incColC] *= beta;
           C[i*incRowC+j*incColC] += alpha*P_[i * NR * div + j];
       }
   }
}
// Macro-kernel: updates an mc x nc block of C with the product of the
// packed buffers A (mc x kc) and B (kc x nc):  C := beta*C + alpha*A*B.
// A and B are laid out as consecutive MR x kc / kc x NR panels, so the
// panel for tile (i,j) starts at A[i*kc*MR] / B[j*kc*NR].
template <typename Index, typename T, typename Beta, typename TC>
void
mgemm(Index mc, Index nc, Index kc,
      T alpha,
      const T *A, const T *B,
      Beta beta,
      TC *C, Index incRowC, Index incColC)
{
    const Index MR = BlockSize<T>::MR;
    const Index NR = BlockSize<T>::NR;
    // Number of row/column panels, rounded up to cover ragged edges.
    const Index mp  = (mc+MR-1) / MR;
    const Index np  = (nc+NR-1) / NR;
    // Sizes of the last (partial) panels; 0 when MR/NR divide evenly.
    const Index mr_ = mc % MR;
    const Index nr_ = nc % NR;
    #if defined(_OPENMP)
    #pragma omp parallel for
    #endif
    for (Index j=0; j<np; ++j) {
        const Index nr = (j!=np-1 || nr_==0) ? NR : nr_;
        // Scratch tile, declared inside the parallel loop so each
        // OpenMP thread gets its own private copy.
        T C_[BlockSize<T>::MR*BlockSize<T>::NR];
        __builtin_prefetch(B + j * kc * NR, 0);
        __builtin_prefetch(B + j * kc * NR + 16, 0);
        for (Index i=0; i<mp; ++i) {
            const Index mr = (i!=mp-1 || mr_==0) ? MR : mr_;
            __builtin_prefetch(A + i * kc * MR, 0);
            if (mr==MR && nr==NR) {
                // Full tile: micro-kernel updates C in place.
                ugemm(kc, alpha,
                      &A[i*kc*MR], &B[j*kc*NR],
                      beta,
                      &C[i*MR*incRowC+j*NR*incColC],
                      incRowC, incColC);
            } else {
                // Partial (edge) tile: compute alpha*A*B into the
                // full-size scratch buffer (column-major: incRow=1,
                // incCol=MR), then scale C by beta and add only the
                // valid mr x nr portion.
                std::fill_n(C_, MR*NR, T(0));
                ugemm(kc, alpha,
                      &A[i*kc*MR], &B[j*kc*NR],
                      T(0),
                      C_, Index(1), MR);
                gescal(mr, nr, beta,
                       &C[i*MR*incRowC+j*NR*incColC],
                       incRowC, incColC);
                geaxpy(mr, nr, T(1), C_, Index(1), MR,
                       &C[i*MR*incRowC+j*NR*incColC],
                       incRowC, incColC);
            }
        }
    }
}
MR=4, NR=8, KC=1024
Compile:
g++ -std=gnu++11 -Wall -W -mavx -Ofast -DNDEBUG -DM_MAX=1500 -g -o matprod5 matprod5.cc 
Baseline:
#   m     n     k  uBLAS:   t1       MFLOPS   Blocked:   t2      MFLOPS        Diff nrm1
  100   100   100   0.00172933      1156.52      0.00031314      6386.92               0
  200   200   200    0.0051078      3132.46      0.00181692      8806.12               0
  300   300   300    0.0166265      3247.83      0.00569728       9478.2               0
  400   400   400    0.0409126      3128.62       0.0134713      9501.71               0
  500   500   500    0.0908395      2752.11       0.0242485      10309.9               0
  600   600   600     0.187145      2308.37       0.0376499      11474.1               0
  700   700   700     0.306176      2240.54       0.0588034        11666               0
  800   800   800     0.463066      2211.35       0.0878199      11660.2               0
  900   900   900     0.867082       1681.5        0.125935      11577.4               0
 1000  1000  1000      1.20465      1660.24        0.171337      11672.9               0
 1100  1100  1100      1.63398      1629.15        0.230289      11559.4     3.76457e-19
 1200  1200  1200      2.09053      1653.17         0.29553      11694.2     3.78417e-19
 1300  1300  1300        2.672      1644.46        0.376785      11661.8     3.19717e-19
 1400  1400  1400      3.32236      1651.84        0.461209      11899.2     2.57872e-19
 1500  1500  1500      4.09372      1648.87        0.567471      11894.9     2.05188e-19
with prefetch:
#   m     n     k  uBLAS:   t1       MFLOPS   Blocked:   t2      MFLOPS        Diff nrm1
  100   100   100   0.00197408      1013.13     0.000814521      2455.43               0
  200   200   200   0.00571277      2800.74      0.00206576      7745.32               0
  300   300   300    0.0170868      3160.33      0.00547799      9857.63               0
  400   400   400    0.0401293      3189.69       0.0117615      10882.9               0
  500   500   500    0.0888678      2813.17       0.0220389      11343.6               0
  600   600   600     0.188413      2292.84       0.0361003      11966.7               0
  700   700   700      0.31356      2187.78       0.0561651        12214               0
  800   800   800     0.457264      2239.41       0.0836534        12241               0
  900   900   900     0.869343      1677.13        0.117946      12361.6               0
 1000  1000  1000      1.25295      1596.23        0.160194      12484.9               0
 1100  1100  1100      1.62086      1642.34        0.220637      12065.1     3.75611e-19
 1200  1200  1200      2.07918       1662.2        0.279095      12382.9      3.7818e-19
 1300  1300  1300      2.66945      1646.03        0.352817      12454.1     3.19286e-19
 1400  1400  1400      3.30731      1659.35        0.429694      12771.9     2.58137e-19
 1500  1500  1500      4.09499      1648.36        0.527458      12797.2        2.05e-19
--------------------------------------------
On Tue, 2/2/16, Michael Lehn <michael.lehn_at_[hidden]> wrote:
 Subject: Re: [ublas] Matrix multiplication performance
 To: "ublas mailing list" <ublas_at_[hidden]>
 Cc: "palik imre" <imre_palik_at_[hidden]>
 Date: Tuesday, 2 February, 2016, 23:26
 
 I also
 started to write down some notes on the rational behind the
 micro-kernel:
 http://www.mathematik.uni-ulm.de/~lehn/test_ublas/session6/page01.html
 
 On 02 Feb 2016, at
 21:43, Michael Lehn <michael.lehn_at_[hidden]>
 wrote:
 (So the pages are re-done
 …)
 
 
 The bug
 is because NR is BlockSize<T>::NR divided by the
 vector length.  So this should work
 //---------------------------------------------------------------------------------template
 <typename Index, typename T>typename
 std::enable_if<std::is_floating_point<T>::value,void>::typeugemm(Index
 kc, T alpha,               const T *A,
 const T *B,     T beta,     T
 *C, Index incRowC, Index incColC){ 
 Â A = (const T*) __builtin_assume_aligned (A,
 128);   B = (const T*) __builtin_assume_aligned
 (B, 128);   static const unsigned div =
 32/sizeof(T);   static const Index MR =
 BlockSize<T>::MR;   static const Index NR
 = BlockSize<T>::NR/div;
 Â   typedef T vx
 __attribute__((vector_size (32)));
 Â   vx P[MR*NR] __attribute__
 ((aligned (128))) = {};   const vx *B_ = (vx
 *)B;   for (Index l=0; l<kc; ++l)
 {       for (Index i=0; i<MR; ++i)
 {           for (Index j=0; j<(NR); ++j)
 {               P[i * NR + j] +=
 A[i]*B_[j];           }     
 Â }       A += MR;       B_ +=
 NR;   }
 Â   T *P_ = (T *)P; 
 Â for (Index j=0; j<NR*div; ++j) {     
 Â for (Index i=0; i<MR; ++i) {         
 Â C[i*incRowC+j*incColC] *= beta;         
 Â C[i*incRowC+j*incColC] += alpha*P_[i * NR * div +
 j];       } 
 Â }}//---------------------------------------------------------------------------------
 However, as Karl pointed out it also
 should treat the case beta==0 as a special
 case.
 But I don't
 get any speedup.  For the performance relevant
 is mainly the loop
 Â   for (Index l=0; l<kc;
 ++l) {       for (Index i=0; i<MR; ++i)
 {           for (Index j=0; j<(NR); ++j)
 {               P[i * NR + j] +=
 A[i]*B_[j];           }     
 Â }       A += MR;       B_ +=
 NR;   }
 From my experience, only if this is
 optimized to the extreme a little bit extra
 performance can be achieved by treating some special cases
 for the update
 Â   T *P_ = (T
 *)P;   for (Index j=0; j<NR*div; ++j)
 {       for (Index i=0; i<MR; ++i)
 {           C[i*incRowC+j*incColC] *=
 beta;           C[i*incRowC+j*incColC] +=
 alpha*P_[i * NR * div + j];     
 Â }   }
 The main reason my code looks weird
 is that I also want to use it for teaching.  The
 way it is coded you easily can map it forth
 and back with the assembly code.
 
 new
 kernel:
 
 template
 <typename Index, typename T>
 typename
 std::enable_if<std::is_floating_point<T>::value,
 void>::type
 ugemm(Index kc,
 T alpha,
 Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â const
 T *A, const T *B,
 Â Â Â Â Â T beta,
 Â Â Â Â Â T *C, Index incRowC, Index
 incColC)
 {
 Â Â Â A = (const
 T*) __builtin_assume_aligned (A, 128);
 Â Â Â B = (const T*) __builtin_assume_aligned
 (B, 128);
 Â Â Â static const unsigned div =
 32/sizeof(T);
 Â Â Â static const Index MR =
 BlockSize<T>::MR;
 Â Â Â static const
 Index NR = BlockSize<T>::NR/div;
 
 Â Â Â typedef T vx __attribute__((vector_size
 (32)));
 
 Â Â Â vx P[MR*NR]
 __attribute__ ((aligned (128))) = {};
 Â Â Â const vx *B_ = (vx *)B;
 Â Â Â for (Index l=0; l<kc; ++l) {
 Â Â Â Â Â Â Â for (Index i=0; i<MR; ++i)
 {
 Â Â Â Â Â Â Â Â Â Â Â for (Index j=0;
 j<(NR); ++j) {
 Â Â Â Â Â Â Â Â Â Â Â Â Â Â Â P[i * NR + j] +=
 A[i]*B_[j];
 Â Â Â Â Â Â Â Â Â Â Â }
 Â Â Â Â Â Â Â }
 Â Â Â Â Â Â Â A
 += MR;
 Â Â Â Â Â Â Â B_ += NR;
 Â Â Â }
 
 Â Â Â T *P_ = (T *)P;
 Â Â Â for
 (Index j=0; j<NR; ++j) {
 Â Â Â Â Â Â Â for (Index i=0; i<MR; ++i)
 {
 Â Â Â Â Â Â Â Â Â Â Â C[i*incRowC+j*incColC] *=
 beta;
 Â Â Â Â Â Â Â Â Â Â Â C[i*incRowC+j*incColC] +=
 alpha*P_[i * NR + j];
 Â Â Â Â Â Â Â }
 Â Â Â }
 }
 
 _______________________________________________
 ublas mailing list
 ublas_at_[hidden]
 http://listarchives.boost.org/mailman/listinfo.cgi/ublas
 Sent to: michael.lehn_at_[hidden]