Subject: Re: [ublas] Matrix multiplication performance
From: palik imre (imre_palik_at_[hidden])
Date: 2016-02-02 10:16:34
Hi Michael,
If you remove the weird stuff from the copy-out loop, it is even faster.

Your stuff:
$ g++ -std=gnu++11 -Ofast -mfma -DNDEBUG -DM_MAX=1500 -I gemm -g -o matprod matprod.cc
$ ./matprod 
#   m     n     k  uBLAS:   t1       MFLOPS   Blocked:   t2      MFLOPS        Diff nrm1
100   100   100   0.00044688      4475.47      0.00021836      9159.19               0
200   200   200   0.00482106      3318.77      0.00111174      14391.8               0
300   300   300    0.0165459      3263.65      0.00280313      19264.2     1.92763e-16
400   400   400      0.03396      3769.13      0.00515972      24807.5     8.35381e-17
500   500   500    0.0563871      4433.64      0.00810017      30863.5     3.47826e-17
600   600   600    0.0875898      4932.08       0.0132554      32590.6     1.61585e-17
700   700   700     0.138829      4941.34       0.0203247      33752.1     8.36893e-18
800   800   800     0.210452      4865.73       0.0312828      32733.7     4.69498e-18
900   900   900     0.299937      4861.01       0.0424129      34376.3     2.78153e-18
1000  1000  1000     0.410234      4875.26       0.0569512      35117.8     1.75386e-18
1100  1100  1100     0.543506      4897.83       0.0803772      33118.8     1.14058e-18
1200  1200  1200      0.70383      4910.28       0.0997944      34631.2     7.74047e-19
1300  1300  1300     0.899555      4884.64        0.133048      33025.7      5.4207e-19
1400  1400  1400      1.19975       4574.3        0.158674      34586.6     3.87594e-19
1500  1500  1500       1.5232      4431.46         0.19405      34784.8     2.84798e-19
new kernel:
template <typename Index, typename T>
typename std::enable_if<std::is_floating_point<T>::value,
void>::type
ugemm(Index kc, T alpha,
                const T *A, const T *B,
      T beta,
      T *C, Index incRowC, Index incColC)
{
    A = (const T*) __builtin_assume_aligned (A, 128);
    B = (const T*) __builtin_assume_aligned (B, 128);
    static const unsigned div = 32/sizeof(T);      // elements per 256-bit vector
    static const Index MR = BlockSize<T>::MR;
    static const Index NR = BlockSize<T>::NR/div;  // vectors per row of the micro-tile
    typedef T vx __attribute__((vector_size (32)));
    vx P[MR*NR] __attribute__ ((aligned (128))) = {};  // accumulator micro-tile
    const vx *B_ = (const vx *)B;
    for (Index l=0; l<kc; ++l) {
        for (Index i=0; i<MR; ++i) {
            for (Index j=0; j<(NR); ++j) {
                P[i * NR + j] += A[i]*B_[j];
            }
        }
        A += MR;
        B_ += NR;
    }
    // write the accumulators back: C <- beta*C + alpha*P, over all NR*div columns
    const T *P_ = (const T *)P;
    for (Index i=0; i<MR; ++i) {
        for (Index j=0; j<Index(NR*div); ++j) {
            C[i*incRowC+j*incColC] *= beta;
            C[i*incRowC+j*incColC] += alpha*P_[i*NR*div+j];
        }
    }
}
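
(For anyone reading along without the gemm headers: the kernels in this thread pull MR, NR and the panel alignment from a BlockSize<> traits class that ships with Michael's example code. The stand-in below is only a sketch matching the constraints used in this thread, MR=4, NR=12, align=32; the real header defines more than this.)

// Minimal stand-in for the BlockSize<> traits referenced by the kernels in
// this thread -- only the members used here, with the values from the
// MR=4, NR=12 runs below.  Not the definition from the gemm headers.
template <typename T>
struct BlockSize;

template <>
struct BlockSize<double> {
    static const int MR    = 4;   // rows of the C micro-tile
    static const int NR    = 12;  // columns of the C micro-tile
    static const int align = 32;  // byte alignment of the packed A/B panels
};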
$ g++ -std=gnu++11 -Ofast -mfma -DNDEBUG -DM_MAX=1500 -I gemm -g -o matprod5 matprod5.cc
$ ./matprod5 
#   m     n     k  uBLAS:   t1       MFLOPS   Blocked:   t2      MFLOPS        Diff nrm1
100   100   100  0.000474071      4218.78      0.00018245      10961.9         718.441
200   200   200   0.00506416      3159.46     0.000934905        17114         22.4352
300   300   300    0.0162785      3317.26      0.00231444      23331.8         2.98619
400   400   400    0.0334068      3831.56      0.00428301      29885.5        0.718131
500   500   500    0.0557897      4481.11      0.00691865      36134.2        0.233594
600   600   600    0.0868364      4974.87       0.0108195      39927.9       0.0937454
700   700   700     0.137425      4991.81       0.0160635      42705.6       0.0433091
800   800   800     0.210233      4870.79       0.0232583      44027.3       0.0223228
900   900   900     0.300065      4858.95        0.033086        44067       0.0123832
1000  1000  1000     0.410601      4870.91       0.0453549      44096.7      0.00730946
1100  1100  1100     0.547655      4860.72       0.0615887      43222.2      0.00453598
1200  1200  1200     0.707616      4884.01       0.0745208      46376.3      0.00293058
1300  1300  1300      0.89943      4885.32          0.1005      43721.5      0.00196592
1400  1400  1400       1.1228      4887.78        0.118743      46217.4      0.00135943
1500  1500  1500       1.5354      4396.25        0.148832      45353.2     0.000962909
And just for fun, the same thing in single precision, where each 256-bit vector holds 8 floats instead of 4 doubles:
$ g++ -std=gnu++11 -Ofast -mfma -DNDEBUG -DM_MAX=1500 -DM_TYPE=float -I gemm -g -o matprod5 matprod5.cc
$ ./matprod5 
#   m     n     k  uBLAS:   t1       MFLOPS   Blocked:   t2      MFLOPS        Diff nrm1
100   100   100  0.000253513      7889.14     0.000113096      17684.1         1415.42
200   200   200   0.00161286      9920.29     0.000486492      32888.5         43.4852
300   300   300   0.00879973      6136.55      0.00113009      47783.8         5.67916
400   400   400    0.0195111      6560.37      0.00202414      63236.7         1.36924
500   500   500    0.0341183      7327.44      0.00326899      76476.1        0.448506
600   600   600    0.0495025      8726.84       0.0049251        87714        0.184488
700   700   700    0.0709753      9665.33      0.00707468      96965.6       0.0854384
800   800   800     0.103384      9904.79      0.00998934       102509       0.0440716
900   900   900     0.146694      9939.08       0.0138675       105138       0.0244022
1000  1000  1000     0.202134      9894.42       0.0182289       109716       0.0144046
1100  1100  1100     0.267507      9951.13       0.0237599       112037      0.00890405
1200  1200  1200      0.34953      9887.57       0.0297219       116278      0.00572881
1300  1300  1300     0.436964      10055.7       0.0376092       116833      0.00386137
1400  1400  1400     0.556049      9869.64       0.0460145       119267      0.00267486
1500  1500  1500     0.668857      10091.8       0.0557639       121046      0.00189497
Everything was run with MR=4, NR=12.
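
(For reference, the MFLOPS columns are the usual 2*m*n*k flop count divided by the measured wall time in seconds, e.g. for the last double-precision row:

    2 * 1500^3 = 6.75e9 flops,   6.75e9 / 0.148832 s ~= 45353 MFLOPS

which matches the Blocked column above.)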
On Saturday, 30 January 2016, 17:49, Michael Lehn <michael.lehn_at_[hidden]> wrote:
Here is the same in a slightly more general form:
template <typename Index>
typename std::enable_if<std::is_convertible<Index, std::int64_t>::value
                     && BlockSize<double>::MR==4
                     && BlockSize<double>::NR==12
                     && BlockSize<double>::align==32,
         void>::type
ugemm(Index kc, double alpha,
      const double *A, const double *B,
      double beta,
      double *C, Index incRowC, Index incColC)
{
    static const Index MR = BlockSize<double>::MR;
    static const Index NR = BlockSize<double>::NR/4;
    typedef double vx __attribute__((vector_size (4*sizeof(double))));
    A = (const double*) __builtin_assume_aligned (A, 32);
    B = (const double*) __builtin_assume_aligned (B, 32);
    vx P[MR*NR] = {};
    for (Index l=0; l<kc; ++l) {
        const vx *b = (const vx *)B;
        for (Index i=0; i<MR; ++i) {
            for (Index j=0; j<NR; ++j) {
                P[i*NR+j] += A[i]*b[j];
            }
        }
        A += MR;
        B += NR*4;
    }
    if (alpha!=double(1)) {
        for (Index i=0; i<MR; ++i) {
            for (Index j=0; j<NR; ++j) {
                P[i*NR+j] *= alpha;
            }
        }
    }
    for (Index i=0; i<MR; ++i) {
        for (Index j=0; j<NR; ++j) {
            const double *p = (const double *) &P[i*NR+j];
            for (Index j1=0; j1<4; ++j1) {
                C[i*incRowC+(j*4+j1)*incColC] *= beta;
                C[i*incRowC+(j*4+j1)*incColC] += p[j1];
            }
        }
    }
}
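
(In case it helps when experimenting with this kernel outside the matprod driver: the packing it expects can be read off the indexing. A holds kc consecutive MR-long column slices, B holds kc consecutive NR-long row slices, and C is a general MR x NR block addressed through incRowC/incColC. A tiny hand-rolled check, purely illustrative and assuming the BlockSize<double> stand-in above plus this ugemm are in scope, could look like:)

#include <cstdio>

int main()
{
    const int kc = 2;                       // depth of the rank-k update
    const int incRowC = 12, incColC = 1;    // C stored row-major, 4 x 12
    alignas(32) double A[2*4];              // kc packed columns of A, MR entries each
    alignas(32) double B[2*12];             // kc packed rows of B, NR entries each
    double C[4*12] = {};

    for (int l=0; l<kc; ++l) {
        for (int i=0; i<4;  ++i) A[l*4+i]  = i+1;   // column l of the A panel
        for (int j=0; j<12; ++j) B[l*12+j] = j+1;   // row l of the B panel
    }

    // C <- 0*C + 1*(A_panel * B_panel); entry (i,j) should end up kc*(i+1)*(j+1)
    ugemm(kc, 1.0, A, B, 0.0, C, incRowC, incColC);

    std::printf("C(1,2) = %g (expected %g)\n",
                C[1*incRowC + 2*incColC], double(kc*2*3));
    return 0;
}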
On 30 Jan 2016, at 13:49, Michael Lehn <michael.lehn_at_[hidden]> wrote:
> Ok, and this version for MR=4, NR=12 even beats the asm kernel on Haswell:
> 
> //-- Micro Kernel --------------------------------------------------------------
> template <typename Index>
> typename std::enable_if<std::is_convertible<Index, std::int64_t>::value
>                     && BlockSize<double>::MR==4
>                     && BlockSize<double>::NR==12
>                     && BlockSize<double>::align==32,
>         void>::type
> ugemm(Index kc, double alpha,
>      const double *A, const double *B,
>      double beta,
>      double *C, Index incRowC, Index incColC)
> {
>    static const Index MR = BlockSize<double>::MR;
>    static const Index NR = BlockSize<double>::NR;
>    typedef double vx __attribute__((vector_size (4*sizeof(double))));
> 
>    A = (const double*) __builtin_assume_aligned (A, 32);
>    B = (const double*) __builtin_assume_aligned (B, 32);
> 
>    vx P0_03 = {}; vx P0_47 = {}; vx P0_811 = {};
>    vx P1_03 = {}; vx P1_47 = {}; vx P1_811 = {};
>    vx P2_03 = {}; vx P2_47 = {}; vx P2_811 = {};
>    vx P3_03 = {}; vx P3_47 = {}; vx P3_811 = {};
> 
>    for (Index l=0; l<kc; ++l) {
>        const vx *b = (const vx *)B;
> 
>        P0_03 += A[0]*b[0]; P0_47 += A[0]*b[1]; P0_811 += A[0]*b[2];
>        P1_03 += A[1]*b[0]; P1_47 += A[1]*b[1]; P1_811 += A[1]*b[2];
>        P2_03 += A[2]*b[0]; P2_47 += A[2]*b[1]; P2_811 += A[2]*b[2];
>        P3_03 += A[3]*b[0]; P3_47 += A[3]*b[1]; P3_811 += A[3]*b[2];
>        A += MR;
>        B += NR;
>    }
> 
>    P0_03 *= alpha;  P0_47 *= alpha; P0_811 *= alpha;
>    P1_03 *= alpha;  P1_47 *= alpha; P1_811 *= alpha;
>    P2_03 *= alpha;  P2_47 *= alpha; P2_811 *= alpha;
>    P3_03 *= alpha;  P3_47 *= alpha; P3_811 *= alpha;
> 
>    if (beta!=double(1)) {
>        for (Index i=0; i<MR; ++i) {
>            for (Index j=0; j<NR; ++j) {
>                C[i*incRowC+j*incColC] *= beta;
>            }
>        }
>    }
> 
>    const double *p = (const double *) &P0_03;
>    C[0*incRowC+0*incColC] += p[0];
>    C[0*incRowC+1*incColC] += p[1];
>    C[0*incRowC+2*incColC] += p[2];
>    C[0*incRowC+3*incColC] += p[3];
> 
>    p = (const double *) &P0_47;
>    C[0*incRowC+4*incColC] += p[0];
>    C[0*incRowC+5*incColC] += p[1];
>    C[0*incRowC+6*incColC] += p[2];
>    C[0*incRowC+7*incColC] += p[3];
> 
>    p = (const double *) &P0_811;
>    C[0*incRowC+8*incColC] += p[0];
>    C[0*incRowC+9*incColC] += p[1];
>    C[0*incRowC+10*incColC] += p[2];
>    C[0*incRowC+11*incColC] += p[3];
> 
>    p = (const double *) &P1_03;
>    C[1*incRowC+0*incColC] += p[0];
>    C[1*incRowC+1*incColC] += p[1];
>    C[1*incRowC+2*incColC] += p[2];
>    C[1*incRowC+3*incColC] += p[3];
> 
>    p = (const double *) &P1_47;
>    C[1*incRowC+4*incColC] += p[0];
>    C[1*incRowC+5*incColC] += p[1];
>    C[1*incRowC+6*incColC] += p[2];
>    C[1*incRowC+7*incColC] += p[3];
> 
>    p = (const double *) &P1_811;
>    C[1*incRowC+8*incColC] += p[0];
>    C[1*incRowC+9*incColC] += p[1];
>    C[1*incRowC+10*incColC] += p[2];
>    C[1*incRowC+11*incColC] += p[3];
> 
>    p = (const double *) &P2_03;
>    C[2*incRowC+0*incColC] += p[0];
>    C[2*incRowC+1*incColC] += p[1];
>    C[2*incRowC+2*incColC] += p[2];
>    C[2*incRowC+3*incColC] += p[3];
> 
>    p = (const double *) &P2_47;
>    C[2*incRowC+4*incColC] += p[0];
>    C[2*incRowC+5*incColC] += p[1];
>    C[2*incRowC+6*incColC] += p[2];
>    C[2*incRowC+7*incColC] += p[3];
> 
>    p = (const double *) &P2_811;
>    C[2*incRowC+8*incColC] += p[0];
>    C[2*incRowC+9*incColC] += p[1];
>    C[2*incRowC+10*incColC] += p[2];
>    C[2*incRowC+11*incColC] += p[3];
> 
>    p = (const double *) &P3_03;
>    C[3*incRowC+0*incColC] += p[0];
>    C[3*incRowC+1*incColC] += p[1];
>    C[3*incRowC+2*incColC] += p[2];
>    C[3*incRowC+3*incColC] += p[3];
> 
>    p = (const double *) &P3_47;
>    C[3*incRowC+4*incColC] += p[0];
>    C[3*incRowC+5*incColC] += p[1];
>    C[3*incRowC+6*incColC] += p[2];
>    C[3*incRowC+7*incColC] += p[3];
> 
>    p = (const double *) &P3_811;
>    C[3*incRowC+8*incColC] += p[0];
>    C[3*incRowC+9*incColC] += p[1];
>    C[3*incRowC+10*incColC] += p[2];
>    C[3*incRowC+11*incColC] += p[3];
> 
> }