$include_dir="/home/hyper-archives/ublas/include"; include("$include_dir/msg-header.inc") ?>
Subject: Re: [ublas] Matrix multiplication performance
From: Karl Rupp (rupp_at_[hidden])
Date: 2016-02-02 10:32:39
Hi guys,
I'm sorry to interrupt your performance tuning fun, but please note that 
the kernel does not work correctly. To verify, consider the case that 
beta is equal to zero and that C initially consists of NaNs (which may 
happen in practice if you start if a freshly malloc'ed buffer)...
Best regards,
Karli
On 01/30/2016 05:48 PM, Michael Lehn wrote:
> Here the same in slightly more general form
>
> template <typename Index>
> typename std::enable_if<std::is_convertible<Index, std::int64_t>::value
>                       && BlockSize<double>::MR==4
>                       && BlockSize<double>::NR==12
>                       && BlockSize<double>::align==32,
>           void>::type
> ugemm(Index kc, double alpha,
>        const double *A, const double *B,
>        double beta,
>        double *C, Index incRowC, Index incColC)
> {
>      static const Index MR = BlockSize<double>::MR;
>      static const Index NR = BlockSize<double>::NR/4;
>      typedef double vx __attribute__((vector_size (4*sizeof(double))));
>
>      A = (const double*) __builtin_assume_aligned (A, 32);
>      B = (const double*) __builtin_assume_aligned (B, 32);
>
>      vx P[MR*NR] = {};
>
>      for (Index l=0; l<kc; ++l) {
>          const vx *b = (const vx *)B;
>
>          for (Index i=0; i<MR; ++i) {
>              for (Index j=0; j<NR; ++j) {
>                  P[i*NR+j] += A[i]*b[j];
>              }
>          }
>          A += MR;
>          B += NR*4;
>      }
>
>      if (alpha!=double(1)) {
>          for (Index i=0; i<MR; ++i) {
>              for (Index j=0; j<NR; ++j) {
>                  P[i*NR+j] *= alpha;
>              }
>          }
>      }
>
>      for (Index i=0; i<MR; ++i) {
>          for (Index j=0; j<NR; ++j) {
>              const double *p = (const double *) &P[i*NR+j];
>              for (Index j1=0; j1<4; ++j1) {
>                  C[i*incRowC+(j*4+j1)*incColC] *= beta;
>                  C[i*incRowC+(j*4+j1)*incColC] += p[j1];
>              }
>          }
>      }
> }
>
> On 30 Jan 2016, at 13:49, Michael Lehn <michael.lehn_at_[hidden]> wrote:
>
>> Ok, and this version for MR=4, NR=12 even beats the asm kernel on Haswell:
>>
>> //-- Micro Kernel --------------------------------------------------------------
>> template <typename Index>
>> typename std::enable_if<std::is_convertible<Index, std::int64_t>::value
>>                      && BlockSize<double>::MR==4
>>                      && BlockSize<double>::NR==12
>>                      && BlockSize<double>::align==32,
>>          void>::type
>> ugemm(Index kc, double alpha,
>>       const double *A, const double *B,
>>       double beta,
>>       double *C, Index incRowC, Index incColC)
>> {
>>     static const Index MR = BlockSize<double>::MR;
>>     static const Index NR = BlockSize<double>::NR;
>>     typedef double vx __attribute__((vector_size (4*sizeof(double))));
>>
>>     A = (const double*) __builtin_assume_aligned (A, 32);
>>     B = (const double*) __builtin_assume_aligned (B, 32);
>>
>>     vx P0_03 = {}; vx P0_47 = {}; vx P0_811 = {};
>>     vx P1_03 = {}; vx P1_47 = {}; vx P1_811 = {};
>>     vx P2_03 = {}; vx P2_47 = {}; vx P2_811 = {};
>>     vx P3_03 = {}; vx P3_47 = {}; vx P3_811 = {};
>>
>>     for (Index l=0; l<kc; ++l) {
>>         const vx *b = (const vx *)B;
>>
>>         P0_03 += A[0]*b[0]; P0_47 += A[0]*b[1]; P0_811 += A[0]*b[2];
>>         P1_03 += A[1]*b[0]; P1_47 += A[1]*b[1]; P1_811 += A[1]*b[2];
>>         P2_03 += A[2]*b[0]; P2_47 += A[2]*b[1]; P2_811 += A[2]*b[2];
>>         P3_03 += A[3]*b[0]; P3_47 += A[3]*b[1]; P3_811 += A[3]*b[2];
>>         A += MR;
>>         B += NR;
>>     }
>>
>>     P0_03 *= alpha;  P0_47 *= alpha; P0_811 *= alpha;
>>     P1_03 *= alpha;  P1_47 *= alpha; P1_811 *= alpha;
>>     P2_03 *= alpha;  P2_47 *= alpha; P2_811 *= alpha;
>>     P3_03 *= alpha;  P3_47 *= alpha; P3_811 *= alpha;
>>
>>     if (beta!=double(1)) {
>>         for (Index i=0; i<MR; ++i) {
>>             for (Index j=0; j<NR; ++j) {
>>                 C[i*incRowC+j*incColC] *= beta;
>>             }
>>         }
>>     }
>>
>>     const double *p = (const double *) &P0_03;
>>     C[0*incRowC+0*incColC] += p[0];
>>     C[0*incRowC+1*incColC] += p[1];
>>     C[0*incRowC+2*incColC] += p[2];
>>     C[0*incRowC+3*incColC] += p[3];
>>
>>     p = (const double *) &P0_47;
>>     C[0*incRowC+4*incColC] += p[0];
>>     C[0*incRowC+5*incColC] += p[1];
>>     C[0*incRowC+6*incColC] += p[2];
>>     C[0*incRowC+7*incColC] += p[3];
>>
>>     p = (const double *) &P0_811;
>>     C[0*incRowC+8*incColC] += p[0];
>>     C[0*incRowC+9*incColC] += p[1];
>>     C[0*incRowC+10*incColC] += p[2];
>>     C[0*incRowC+11*incColC] += p[3];
>>
>>     p = (const double *) &P1_03;
>>     C[1*incRowC+0*incColC] += p[0];
>>     C[1*incRowC+1*incColC] += p[1];
>>     C[1*incRowC+2*incColC] += p[2];
>>     C[1*incRowC+3*incColC] += p[3];
>>
>>     p = (const double *) &P1_47;
>>     C[1*incRowC+4*incColC] += p[0];
>>     C[1*incRowC+5*incColC] += p[1];
>>     C[1*incRowC+6*incColC] += p[2];
>>     C[1*incRowC+7*incColC] += p[3];
>>
>>     p = (const double *) &P1_811;
>>     C[1*incRowC+8*incColC] += p[0];
>>     C[1*incRowC+9*incColC] += p[1];
>>     C[1*incRowC+10*incColC] += p[2];
>>     C[1*incRowC+11*incColC] += p[3];
>>
>>     p = (const double *) &P2_03;
>>     C[2*incRowC+0*incColC] += p[0];
>>     C[2*incRowC+1*incColC] += p[1];
>>     C[2*incRowC+2*incColC] += p[2];
>>     C[2*incRowC+3*incColC] += p[3];
>>
>>     p = (const double *) &P2_47;
>>     C[2*incRowC+4*incColC] += p[0];
>>     C[2*incRowC+5*incColC] += p[1];
>>     C[2*incRowC+6*incColC] += p[2];
>>     C[2*incRowC+7*incColC] += p[3];
>>
>>     p = (const double *) &P2_811;
>>     C[2*incRowC+8*incColC] += p[0];
>>     C[2*incRowC+9*incColC] += p[1];
>>     C[2*incRowC+10*incColC] += p[2];
>>     C[2*incRowC+11*incColC] += p[3];
>>
>>     p = (const double *) &P3_03;
>>     C[3*incRowC+0*incColC] += p[0];
>>     C[3*incRowC+1*incColC] += p[1];
>>     C[3*incRowC+2*incColC] += p[2];
>>     C[3*incRowC+3*incColC] += p[3];
>>
>>     p = (const double *) &P3_47;
>>     C[3*incRowC+4*incColC] += p[0];
>>     C[3*incRowC+5*incColC] += p[1];
>>     C[3*incRowC+6*incColC] += p[2];
>>     C[3*incRowC+7*incColC] += p[3];
>>
>>     p = (const double *) &P3_811;
>>     C[3*incRowC+8*incColC] += p[0];
>>     C[3*incRowC+9*incColC] += p[1];
>>     C[3*incRowC+10*incColC] += p[2];
>>     C[3*incRowC+11*incColC] += p[3];
>>
>> }
>
> _______________________________________________
> ublas mailing list
> ublas_at_[hidden]
> http://listarchives.boost.org/mailman/listinfo.cgi/ublas
> Sent to: rupp_at_[hidden]
>