Subject: Re: [ublas] Matrix multiplication performance
From: palik imre (imre_palik_at_[hidden])
Date: 2016-02-02 10:16:34
Hi Michael,
If you remove the weird stuff from the copy-out loop, it is even faster.

Your stuff:
$ g++ -std=gnu++11 -Ofast -mfma -DNDEBUG -DM_MAX=1500 -I gemm -g -o matprod matprod.cc
$ ./matprod 
#   m     n     k  uBLAS:   t1       MFLOPS   Blocked:   t2      MFLOPS        Diff nrm1
100   100   100   0.00044688      4475.47      0.00021836      9159.19               0
200   200   200   0.00482106      3318.77      0.00111174      14391.8               0
300   300   300    0.0165459      3263.65      0.00280313      19264.2     1.92763e-16
400   400   400      0.03396      3769.13      0.00515972      24807.5     8.35381e-17
500   500   500    0.0563871      4433.64      0.00810017      30863.5     3.47826e-17
600   600   600    0.0875898      4932.08       0.0132554      32590.6     1.61585e-17
700   700   700     0.138829      4941.34       0.0203247      33752.1     8.36893e-18
800   800   800     0.210452      4865.73       0.0312828      32733.7     4.69498e-18
900   900   900     0.299937      4861.01       0.0424129      34376.3     2.78153e-18
1000  1000  1000     0.410234      4875.26       0.0569512      35117.8     1.75386e-18
1100  1100  1100     0.543506      4897.83       0.0803772      33118.8     1.14058e-18
1200  1200  1200      0.70383      4910.28       0.0997944      34631.2     7.74047e-19
1300  1300  1300     0.899555      4884.64        0.133048      33025.7      5.4207e-19
1400  1400  1400      1.19975       4574.3        0.158674      34586.6     3.87594e-19
1500  1500  1500       1.5232      4431.46         0.19405      34784.8     2.84798e-19
new kernel:
template <typename Index, typename T>
typename std::enable_if<std::is_floating_point<T>::value,
void>::type
ugemm(Index kc, T alpha,
                const T *A, const T *B,
      T beta,
      T *C, Index incRowC, Index incColC)
{
    A = (const T*) __builtin_assume_aligned (A, 128);
    B = (const T*) __builtin_assume_aligned (B, 128);
    static const unsigned div = 32/sizeof(T);      // elements per 256-bit vector
    static const Index MR = BlockSize<T>::MR;
    static const Index NR = BlockSize<T>::NR/div;  // vectors per row of the micro-tile
    typedef T vx __attribute__((vector_size (32)));
    vx P[MR*NR] __attribute__ ((aligned (128))) = {};  // accumulator micro-tile
    const vx *B_ = (const vx *)B;
    for (Index l=0; l<kc; ++l) {
        for (Index i=0; i<MR; ++i) {
            for (Index j=0; j<(NR); ++j) {
                P[i * NR + j] += A[i]*B_[j];
            }
        }
        A += MR;
        B_ += NR;
    }
    // write the accumulators back: C <- beta*C + alpha*P, over all NR*div columns
    const T *P_ = (const T *)P;
    for (Index i=0; i<MR; ++i) {
        for (Index j=0; j<Index(NR*div); ++j) {
            C[i*incRowC+j*incColC] *= beta;
            C[i*incRowC+j*incColC] += alpha*P_[i*NR*div+j];
        }
    }
}
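
(For anyone reading along without the gemm headers: the kernels in this thread pull MR, NR and the panel alignment from a BlockSize<> traits class that ships with Michael's example code. The stand-in below is only a sketch matching the constraints used in this thread, MR=4, NR=12, align=32; the real header defines more than this.)

// Minimal stand-in for the BlockSize<> traits referenced by the kernels in
// this thread -- only the members used here, with the values from the
// MR=4, NR=12 runs below.  Not the definition from the gemm headers.
template <typename T>
struct BlockSize;

template <>
struct BlockSize<double> {
    static const int MR    = 4;   // rows of the C micro-tile
    static const int NR    = 12;  // columns of the C micro-tile
    static const int align = 32;  // byte alignment of the packed A/B panels
};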
$ g++ -std=gnu++11 -Ofast -mfma -DNDEBUG -DM_MAX=1500 -I gemm -g -o matprod5 matprod5.cc
$ ./matprod5 
#   m     n     k  uBLAS:   t1       MFLOPS   Blocked:   t2      MFLOPS        Diff nrm1
100   100   100  0.000474071      4218.78      0.00018245      10961.9         718.441
200   200   200   0.00506416      3159.46     0.000934905        17114         22.4352
300   300   300    0.0162785      3317.26      0.00231444      23331.8         2.98619
400   400   400    0.0334068      3831.56      0.00428301      29885.5        0.718131
500   500   500    0.0557897      4481.11      0.00691865      36134.2        0.233594
600   600   600    0.0868364      4974.87       0.0108195      39927.9       0.0937454
700   700   700     0.137425      4991.81       0.0160635      42705.6       0.0433091
800   800   800     0.210233      4870.79       0.0232583      44027.3       0.0223228
900   900   900     0.300065      4858.95        0.033086        44067       0.0123832
1000  1000  1000     0.410601      4870.91       0.0453549      44096.7      0.00730946
1100  1100  1100     0.547655      4860.72       0.0615887      43222.2      0.00453598
1200  1200  1200     0.707616      4884.01       0.0745208      46376.3      0.00293058
1300  1300  1300      0.89943      4885.32          0.1005      43721.5      0.00196592
1400  1400  1400       1.1228      4887.78        0.118743      46217.4      0.00135943
1500  1500  1500       1.5354      4396.25        0.148832      45353.2     0.000962909
And just for fun, the same thing in single precision, where each 256-bit vector holds 8 floats instead of 4 doubles:
$ g++ -std=gnu++11 -Ofast -mfma -DNDEBUG -DM_MAX=1500 -DM_TYPE=float -I gemm -g -o matprod5 matprod5.cc
$ ./matprod5 
#   m     n     k  uBLAS:   t1       MFLOPS   Blocked:   t2      MFLOPS        Diff nrm1
100   100   100  0.000253513      7889.14     0.000113096      17684.1         1415.42
200   200   200   0.00161286      9920.29     0.000486492      32888.5         43.4852
300   300   300   0.00879973      6136.55      0.00113009      47783.8         5.67916
400   400   400    0.0195111      6560.37      0.00202414      63236.7         1.36924
500   500   500    0.0341183      7327.44      0.00326899      76476.1        0.448506
600   600   600    0.0495025      8726.84       0.0049251        87714        0.184488
700   700   700    0.0709753      9665.33      0.00707468      96965.6       0.0854384
800   800   800     0.103384      9904.79      0.00998934       102509       0.0440716
900   900   900     0.146694      9939.08       0.0138675       105138       0.0244022
1000  1000  1000     0.202134      9894.42       0.0182289       109716       0.0144046
1100  1100  1100     0.267507      9951.13       0.0237599       112037      0.00890405
1200  1200  1200      0.34953      9887.57       0.0297219       116278      0.00572881
1300  1300  1300     0.436964      10055.7       0.0376092       116833      0.00386137
1400  1400  1400     0.556049      9869.64       0.0460145       119267      0.00267486
1500  1500  1500     0.668857      10091.8       0.0557639       121046      0.00189497
Everything was run with MR=4, NR=12.
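
(For reference, the MFLOPS columns are the usual 2*m*n*k flop count divided by the measured wall time in seconds, e.g. for the last double-precision row:

    2 * 1500^3 = 6.75e9 flops,   6.75e9 / 0.148832 s ~= 45353 MFLOPS

which matches the Blocked column above.)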
On Saturday, 30 January 2016, 17:49, Michael Lehn <michael.lehn_at_[hidden]> wrote:
Here is the same in a slightly more general form:
template <typename Index>
typename std::enable_if<std::is_convertible<Index, std::int64_t>::value
                     && BlockSize<double>::MR==4
                     && BlockSize<double>::NR==12
                     && BlockSize<double>::align==32,
         void>::type
ugemm(Index kc, double alpha,
      const double *A, const double *B,
      double beta,
      double *C, Index incRowC, Index incColC)
{
    static const Index MR = BlockSize<double>::MR;
    static const Index NR = BlockSize<double>::NR/4;
    typedef double vx __attribute__((vector_size (4*sizeof(double))));
    A = (const double*) __builtin_assume_aligned (A, 32);
    B = (const double*) __builtin_assume_aligned (B, 32);
    vx P[MR*NR] = {};
    for (Index l=0; l<kc; ++l) {
        const vx *b = (const vx *)B;
        for (Index i=0; i<MR; ++i) {
            for (Index j=0; j<NR; ++j) {
                P[i*NR+j] += A[i]*b[j];
            }
        }
        A += MR;
        B += NR*4;
    }
    if (alpha!=double(1)) {
        for (Index i=0; i<MR; ++i) {
            for (Index j=0; j<NR; ++j) {
                P[i*NR+j] *= alpha;
            }
        }
    }
    for (Index i=0; i<MR; ++i) {
        for (Index j=0; j<NR; ++j) {
            const double *p = (const double *) &P[i*NR+j];
            for (Index j1=0; j1<4; ++j1) {
                C[i*incRowC+(j*4+j1)*incColC] *= beta;
                C[i*incRowC+(j*4+j1)*incColC] += p[j1];
            }
        }
    }
}
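
(In case it helps when experimenting with this kernel outside the matprod driver: the packing it expects can be read off the indexing. A holds kc consecutive MR-long column slices, B holds kc consecutive NR-long row slices, and C is a general MR x NR block addressed through incRowC/incColC. A tiny hand-rolled check, purely illustrative and assuming the BlockSize<double> stand-in above plus this ugemm are in scope, could look like:)

#include <cstdio>

int main()
{
    const int kc = 2;                       // depth of the rank-k update
    const int incRowC = 12, incColC = 1;    // C stored row-major, 4 x 12
    alignas(32) double A[2*4];              // kc packed columns of A, MR entries each
    alignas(32) double B[2*12];             // kc packed rows of B, NR entries each
    double C[4*12] = {};

    for (int l=0; l<kc; ++l) {
        for (int i=0; i<4;  ++i) A[l*4+i]  = i+1;   // column l of the A panel
        for (int j=0; j<12; ++j) B[l*12+j] = j+1;   // row l of the B panel
    }

    // C <- 0*C + 1*(A_panel * B_panel); entry (i,j) should end up kc*(i+1)*(j+1)
    ugemm(kc, 1.0, A, B, 0.0, C, incRowC, incColC);

    std::printf("C(1,2) = %g (expected %g)\n",
                C[1*incRowC + 2*incColC], double(kc*2*3));
    return 0;
}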
On 30 Jan 2016, at 13:49, Michael Lehn <michael.lehn_at_[hidden]> wrote:
> Ok, and this version for MR=4, NR=12 even beats the asm kernel on Haswell:
> 
> //-- Micro Kernel --------------------------------------------------------------
> template <typename Index>
> typename std::enable_if<std::is_convertible<Index, std::int64_t>::value
>                     && BlockSize<double>::MR==4
>                     && BlockSize<double>::NR==12
>                     && BlockSize<double>::align==32,
>         void>::type
> ugemm(Index kc, double alpha,
>      const double *A, const double *B,
>      double beta,
>      double *C, Index incRowC, Index incColC)
> {
>    static const Index MR = BlockSize<double>::MR;
>    static const Index NR = BlockSize<double>::NR;
>    typedef double vx __attribute__((vector_size (4*sizeof(double))));
> 
>    A = (const double*) __builtin_assume_aligned (A, 32);
>    B = (const double*) __builtin_assume_aligned (B, 32);
> 
>    vx P0_03 = {}; vx P0_47 = {}; vx P0_811 = {};
>    vx P1_03 = {}; vx P1_47 = {}; vx P1_811 = {};
>    vx P2_03 = {}; vx P2_47 = {}; vx P2_811 = {};
>    vx P3_03 = {}; vx P3_47 = {}; vx P3_811 = {};
> 
>    for (Index l=0; l<kc; ++l) {
>        const vx *b = (const vx *)B;
> 
>        P0_03 += A[0]*b[0]; P0_47 += A[0]*b[1]; P0_811 += A[0]*b[2];
>        P1_03 += A[1]*b[0]; P1_47 += A[1]*b[1]; P1_811 += A[1]*b[2];
>        P2_03 += A[2]*b[0]; P2_47 += A[2]*b[1]; P2_811 += A[2]*b[2];
>        P3_03 += A[3]*b[0]; P3_47 += A[3]*b[1]; P3_811 += A[3]*b[2];
>        A += MR;
>        B += NR;
>    }
> 
>    P0_03 *= alpha;  P0_47 *= alpha; P0_811 *= alpha;
>    P1_03 *= alpha;  P1_47 *= alpha; P1_811 *= alpha;
>    P2_03 *= alpha;  P2_47 *= alpha; P2_811 *= alpha;
>    P3_03 *= alpha;  P3_47 *= alpha; P3_811 *= alpha;
> 
>    if (beta!=double(1)) {
>        for (Index i=0; i<MR; ++i) {
>            for (Index j=0; j<NR; ++j) {
>                C[i*incRowC+j*incColC] *= beta;
>            }
>        }
>    }
> 
>    const double *p = (const double *) &P0_03;
>    C[0*incRowC+0*incColC] += p[0];
>    C[0*incRowC+1*incColC] += p[1];
>    C[0*incRowC+2*incColC] += p[2];
>    C[0*incRowC+3*incColC] += p[3];
> 
>    p = (const double *) &P0_47;
>    C[0*incRowC+4*incColC] += p[0];
>    C[0*incRowC+5*incColC] += p[1];
>    C[0*incRowC+6*incColC] += p[2];
>    C[0*incRowC+7*incColC] += p[3];
> 
>    p = (const double *) &P0_811;
>    C[0*incRowC+8*incColC] += p[0];
>    C[0*incRowC+9*incColC] += p[1];
>    C[0*incRowC+10*incColC] += p[2];
>    C[0*incRowC+11*incColC] += p[3];
> 
>    p = (const double *) &P1_03;
>    C[1*incRowC+0*incColC] += p[0];
>    C[1*incRowC+1*incColC] += p[1];
>    C[1*incRowC+2*incColC] += p[2];
>    C[1*incRowC+3*incColC] += p[3];
> 
>    p = (const double *) &P1_47;
>    C[1*incRowC+4*incColC] += p[0];
>    C[1*incRowC+5*incColC] += p[1];
>    C[1*incRowC+6*incColC] += p[2];
>    C[1*incRowC+7*incColC] += p[3];
> 
>    p = (const double *) &P1_811;
>    C[1*incRowC+8*incColC] += p[0];
>    C[1*incRowC+9*incColC] += p[1];
>    C[1*incRowC+10*incColC] += p[2];
>    C[1*incRowC+11*incColC] += p[3];
> 
>    p = (const double *) &P2_03;
>    C[2*incRowC+0*incColC] += p[0];
>    C[2*incRowC+1*incColC] += p[1];
>    C[2*incRowC+2*incColC] += p[2];
>    C[2*incRowC+3*incColC] += p[3];
> 
>    p = (const double *) &P2_47;
>    C[2*incRowC+4*incColC] += p[0];
>    C[2*incRowC+5*incColC] += p[1];
>    C[2*incRowC+6*incColC] += p[2];
>    C[2*incRowC+7*incColC] += p[3];
> 
>    p = (const double *) &P2_811;
>    C[2*incRowC+8*incColC] += p[0];
>    C[2*incRowC+9*incColC] += p[1];
>    C[2*incRowC+10*incColC] += p[2];
>    C[2*incRowC+11*incColC] += p[3];
> 
>    p = (const double *) &P3_03;
>    C[3*incRowC+0*incColC] += p[0];
>    C[3*incRowC+1*incColC] += p[1];
>    C[3*incRowC+2*incColC] += p[2];
>    C[3*incRowC+3*incColC] += p[3];
> 
>    p = (const double *) &P3_47;
>    C[3*incRowC+4*incColC] += p[0];
>    C[3*incRowC+5*incColC] += p[1];
>    C[3*incRowC+6*incColC] += p[2];
>    C[3*incRowC+7*incColC] += p[3];
> 
>    p = (const double *) &P3_811;
>    C[3*incRowC+8*incColC] += p[0];
>    C[3*incRowC+9*incColC] += p[1];
>    C[3*incRowC+10*incColC] += p[2];
>    C[3*incRowC+11*incColC] += p[3];
> 
> }