$include_dir="/home/hyper-archives/ublas/include"; include("$include_dir/msg-header.inc") ?>
Subject: Re: [ublas] Matrix multiplication performance
From: palik imre (imre_palik_at_[hidden])
Date: 2016-02-01 10:41:28
On Haswell, with gcc4.8
My kernel:
$ ./matprod5 
#   m     n     k  uBLAS:   t1       MFLOPS   Blocked:   t2      MFLOPS        Diff nrm1
100   100   100  0.000448612       4458.2     0.000247042      8095.79               0
200   200   200   0.00465023      3440.69      0.00125286      12770.8               0
300   300   300    0.0149776      3605.38      0.00356466      15148.7               0
400   400   400    0.0310101      4127.69      0.00667965      19162.7               0
500   500   500     0.051142      4888.35        0.010724      23312.2               0
600   600   600    0.0768814      5619.04       0.0171774      25149.3     8.50317e-18
700   700   700     0.120575      5689.42       0.0257486      26642.3     5.79178e-18
800   800   800     0.180436      5675.14       0.0368777      27767.5      3.6748e-18
900   900   900     0.255451      5707.54       0.0542328      26884.1     2.34099e-18
1000  1000  1000     0.351663      5687.26       0.0728293      27461.5     1.53613e-18
1100  1100  1100      0.46676      5703.15        0.102033      26089.7     1.03026e-18
1200  1200  1200     0.605393      5708.69        0.124625      27731.3     7.12511e-19
1300  1300  1300      0.77733      5652.68        0.155321      28289.8     5.07235e-19
1400  1400  1400     0.987963      5554.86        0.197656      27765.4     3.69152e-19
1500  1500  1500      1.30308      5180.03        0.245556      27488.7     2.74421e-19
your assembly kernel:
$ ./matprod 
#   m     n     k  uBLAS:   t1       MFLOPS   Blocked:   t2      MFLOPS        Diff nrm1
100   100   100  0.000472525      4232.58     0.000214793      9311.29               0
200   200   200   0.00468344      3416.29      0.00114834      13933.1               0
300   300   300    0.0159694      3381.46      0.00284525        18979               0
400   400   400    0.0329509      3884.57      0.00521382      24550.1               0
500   500   500    0.0553715      4514.96      0.00857488      29154.9               0
600   600   600     0.086432      4998.15       0.0136662      31610.7     8.52599e-18
700   700   700     0.135785       5052.1       0.0206901      33155.9     5.81226e-18
800   800   800     0.209187      4895.14       0.0302865      33810.5     3.67676e-18
900   900   900     0.298945      4877.15       0.0421455      34594.4     2.34282e-18
1000  1000  1000     0.408444      4896.63       0.0571358      35004.3     1.53606e-18
1100  1100  1100     0.548342      4854.64       0.0797517      33378.6      1.0315e-18
1200  1200  1200     0.709745      4869.36       0.0965715        35787     7.11162e-19
1300  1300  1300      0.90209      4870.91        0.126747      34667.5     5.07616e-19
1400  1400  1400      1.14301      4801.35        0.152455      35997.4     3.69018e-19
1500  1500  1500       1.4692      4594.34        0.189549      35610.9      2.7435e-19
your vectorised kernel:
$ ./matprod 
#   m     n     k  uBLAS:   t1       MFLOPS   Blocked:   t2      MFLOPS        Diff nrm1
100   100   100  0.000467843      4274.94     0.000234705      8521.34               0
200   200   200   0.00494453       3235.9      0.00115772      13820.3               0
300   300   300    0.0171149      3155.14        0.002855      18914.2      1.9068e-16
400   400   400    0.0347875      3679.48      0.00526618        24306     8.31453e-17
500   500   500    0.0571516      4374.33      0.00791231      31596.3     3.47186e-17
600   600   600     0.087803      4920.11       0.0130765      33036.4     1.62512e-17
700   700   700     0.138488       4953.5       0.0200052        34291     8.37735e-18
800   800   800     0.211362      4844.77       0.0303868      33698.9     4.68247e-18
900   900   900     0.303289      4807.29       0.0418178      34865.6     2.77558e-18
1000  1000  1000     0.414966      4819.67       0.0563484      35493.5     1.74855e-18
1100  1100  1100     0.548834      4850.29       0.0793732      33537.7     1.14261e-18
1200  1200  1200      0.70661      4890.96       0.0972836        35525     7.73102e-19
1300  1300  1300     0.906953      4844.79        0.128163      34284.4     5.41372e-19
1400  1400  1400      1.14789      4780.95        0.153824      35677.1     3.88091e-19
1500  1500  1500      1.45357      4643.73        0.190417      35448.6     2.84443e-19
I guess I should install gcc 5.* and rerun.
Could somebody run these tests on non-x86 architectures?
If nobody has anything around, I can try to revive my old arm32 laptop, but I'm not sure how relevant that is these days.
Cheers,
Imre
On Saturday, 30 January 2016, 13:49, Michael Lehn <michael.lehn_at_[hidden]> wrote:
Ok, and this version for MR=4, NR=12 even beats the asm kernel on Haswell:
//-- Micro Kernel --------------------------------------------------------------
template <typename Index>
typename std::enable_if<std::is_convertible<Index, std::int64_t>::value
                     && BlockSize<double>::MR==4
                     && BlockSize<double>::NR==12
                     && BlockSize<double>::align==32,
         void>::type
ugemm(Index kc, double alpha,
      const double *A, const double *B,
      double beta,
      double *C, Index incRowC, Index incColC)
{
    static const Index MR = BlockSize<double>::MR;
    static const Index NR = BlockSize<double>::NR;
    typedef double vx __attribute__((vector_size (4*sizeof(double))));
    A = (const double*) __builtin_assume_aligned (A, 32);
    B = (const double*) __builtin_assume_aligned (B, 32);
    vx P0_03 = {}; vx P0_47 = {}; vx P0_811 = {};
    vx P1_03 = {}; vx P1_47 = {}; vx P1_811 = {};
    vx P2_03 = {}; vx P2_47 = {}; vx P2_811 = {};
    vx P3_03 = {}; vx P3_47 = {}; vx P3_811 = {};
    for (Index l=0; l<kc; ++l) {
        const vx *b = (const vx *)B;
        P0_03 += A[0]*b[0]; P0_47 += A[0]*b[1]; P0_811 += A[0]*b[2];
        P1_03 += A[1]*b[0]; P1_47 += A[1]*b[1]; P1_811 += A[1]*b[2];
        P2_03 += A[2]*b[0]; P2_47 += A[2]*b[1]; P2_811 += A[2]*b[2];
        P3_03 += A[3]*b[0]; P3_47 += A[3]*b[1]; P3_811 += A[3]*b[2];
        A += MR;
        B += NR;
    }
    P0_03 *= alpha;  P0_47 *= alpha; P0_811 *= alpha;
    P1_03 *= alpha;  P1_47 *= alpha; P1_811 *= alpha;
    P2_03 *= alpha;  P2_47 *= alpha; P2_811 *= alpha;
    P3_03 *= alpha;  P3_47 *= alpha; P3_811 *= alpha;
    if (beta!=double(1)) {
        for (Index i=0; i<MR; ++i) {
            for (Index j=0; j<NR; ++j) {
                C[i*incRowC+j*incColC] *= beta;
            }
        }
    }
    const double *p = (const double *) &P0_03;
    C[0*incRowC+0*incColC] += p[0];
    C[0*incRowC+1*incColC] += p[1];
    C[0*incRowC+2*incColC] += p[2];
    C[0*incRowC+3*incColC] += p[3];
    p = (const double *) &P0_47;
    C[0*incRowC+4*incColC] += p[0];
    C[0*incRowC+5*incColC] += p[1];
    C[0*incRowC+6*incColC] += p[2];
    C[0*incRowC+7*incColC] += p[3];
    p = (const double *) &P0_811;
    C[0*incRowC+8*incColC] += p[0];
    C[0*incRowC+9*incColC] += p[1];
    C[0*incRowC+10*incColC] += p[2];
    C[0*incRowC+11*incColC] += p[3];
    p = (const double *) &P1_03;
    C[1*incRowC+0*incColC] += p[0];
    C[1*incRowC+1*incColC] += p[1];
    C[1*incRowC+2*incColC] += p[2];
    C[1*incRowC+3*incColC] += p[3];
    p = (const double *) &P1_47;
    C[1*incRowC+4*incColC] += p[0];
    C[1*incRowC+5*incColC] += p[1];
    C[1*incRowC+6*incColC] += p[2];
    C[1*incRowC+7*incColC] += p[3];
    p = (const double *) &P1_811;
    C[1*incRowC+8*incColC] += p[0];
    C[1*incRowC+9*incColC] += p[1];
    C[1*incRowC+10*incColC] += p[2];
    C[1*incRowC+11*incColC] += p[3];
    p = (const double *) &P2_03;
    C[2*incRowC+0*incColC] += p[0];
    C[2*incRowC+1*incColC] += p[1];
    C[2*incRowC+2*incColC] += p[2];
    C[2*incRowC+3*incColC] += p[3];
    p = (const double *) &P2_47;
    C[2*incRowC+4*incColC] += p[0];
    C[2*incRowC+5*incColC] += p[1];
    C[2*incRowC+6*incColC] += p[2];
    C[2*incRowC+7*incColC] += p[3];
    p = (const double *) &P2_811;
    C[2*incRowC+8*incColC] += p[0];
    C[2*incRowC+9*incColC] += p[1];
    C[2*incRowC+10*incColC] += p[2];
    C[2*incRowC+11*incColC] += p[3];
    p = (const double *) &P3_03;
    C[3*incRowC+0*incColC] += p[0];
    C[3*incRowC+1*incColC] += p[1];
    C[3*incRowC+2*incColC] += p[2];
    C[3*incRowC+3*incColC] += p[3];
    p = (const double *) &P3_47;
    C[3*incRowC+4*incColC] += p[0];
    C[3*incRowC+5*incColC] += p[1];
    C[3*incRowC+6*incColC] += p[2];
    C[3*incRowC+7*incColC] += p[3];
    p = (const double *) &P3_811;
    C[3*incRowC+8*incColC] += p[0];
    C[3*incRowC+9*incColC] += p[1];
    C[3*incRowC+10*incColC] += p[2];
    C[3*incRowC+11*incColC] += p[3];
}
[lehn_at_node042 session5]$ g++ -mfma -Wall -Ofast -I ../boost_1_60_0/ -std=c++11 -DHAVE_GCCVEC -DNDEBUG -DBS_D_NR=12 -DBS_D_NC=4092 matprod.cc
[lehn_at_node042 session5]$ ./a.out
#   m     n     k  uBLAS:   t1       MFLOPS   Blocked:   t2      MFLOPS        Res
  100   100   100    0.0010737      1862.72      0.00035514      5631.58               0
  200   200   200   0.00450262      3553.49       0.0015222      10511.1               0
  300   300   300    0.0149401      3614.43      0.00197623      27324.7     1.91339e-16
  400   400   400    0.0306213       4180.1      0.00441727      28977.2     8.30501e-17
  500   500   500    0.0521503      4793.83      0.00755924      33072.1     3.47577e-17
  600   600   600    0.0829816      5205.97       0.0124722      34637.1     1.60972e-17
  700   700   700     0.129317      5304.79       0.0192483      35639.4     8.37943e-18
  800   800   800     0.192606      5316.55       0.0296521      34533.8     4.66971e-18
  900   900   900     0.274752      5306.61       0.0404913      36007.7     2.77631e-18
1000  1000  1000     0.379118      5275.41        0.054979      36377.5     1.74802e-18
1100  1100  1100     0.500672      5316.85       0.0742024      35874.8       1.141e-18
1200  1200  1200     0.646887      5342.51       0.0927714      37252.9      7.7394e-19
1300  1300  1300     0.823663       5334.7        0.119398      36801.2     5.41738e-19
1400  1400  1400      1.02766       5340.3        0.147005      37332.1     3.88125e-19
1500  1500  1500      1.26277      5345.39        0.176592      38223.6     2.85178e-19
1600  1600  1600      1.59651      5131.18         0.22395      36579.7     2.12873e-19
1700  1700  1700      2.66806      3682.82        0.259462      37870.7     1.62096e-19
1800  1800  1800      3.34888      3482.96        0.311059      37497.7     1.25272e-19
1900  1900  1900      4.09763      3347.79        0.371178        36958     9.81479e-20
On 30 Jan 2016, at 13:33, Michael Lehn <michael.lehn_at_[hidden]> wrote:
> On my Haswell your code performs very well.  For N=M=K it reaches single threaded 32GFLOPS
> compared to 37GFLOPS with my FMA kernel.  So I would say this is close!
> 
> I did the following two things:
> 
> 1) Compiled with "-mfma"
> 2) Used MR=4 NR=8 and the other defaults
> 
> Here the results:
> 
> (A) Your code
> 
> [lehn_at_node042 session5]$ g++ -mfma -Wall -Ofast -I ../boost_1_60_0/ -std=c++11 -DHAVE_GCCVEC -DNDEBUG matprod.cc
> [lehn_at_node042 session5]$ ./a.out
> #   m     n     k  uBLAS:   t1       MFLOPS   Blocked:   t2      MFLOPS        Res
>  100   100   100  0.000993595      2012.89     0.000402098      4973.91               0
>  200   200   200   0.00508121      3148.86       0.0016022       9986.3               0
>  300   300   300    0.0148755      3630.13      0.00250342      21570.5     1.92273e-16
>  400   400   400    0.0304433      4204.53      0.00562129      22770.6      8.3039e-17
>  500   500   500    0.0515749      4847.32      0.00889942      28091.7     3.47892e-17
>  600   600   600    0.0821662      5257.63       0.0144308        29936      1.6149e-17
>  700   700   700     0.128187      5351.55       0.0224022        30622     8.38824e-18
>  800   800   800     0.190724      5369.01       0.0340685      30057.1     4.69155e-18
>  900   900   900     0.272908      5342.46       0.0474595      30720.9     2.78366e-18
> 1000  1000  1000     0.375997      5319.19       0.0635467      31472.9     1.75117e-18
> 1100  1100  1100     0.504189      5279.76       0.0856357      31085.2     1.14159e-18
> 1200  1200  1200     0.646057      5349.37        0.110654      31232.5     7.74152e-19
> 1300  1300  1300     0.828805      5301.61        0.137722      31904.8     5.43016e-19
> 1400  1400  1400       1.0273      5342.18        0.171288      32039.6     3.87778e-19
> 1500  1500  1500      1.26946      5317.24        0.208467      32379.3     2.85337e-19
> 1600  1600  1600      1.58969      5153.21        0.264495      30972.3     2.13061e-19
> 1700  1700  1700      2.67863       3668.3         0.30419      32302.2     1.62006e-19
> ^C
> 
> (B) My FMA Kernel
> 
> [lehn_at_node042 session5]$ g++ -mfma -Wall -Ofast -I ../boost_1_60_0/ -std=c++11 -DHAVE_FMA -DNDEBUG matprod.cc
> [lehn_at_node042 session5]$ ./a.out
> #   m     n     k  uBLAS:   t1       MFLOPS   Blocked:   t2      MFLOPS        Res
>  100   100   100    0.0010306      1940.61     0.000378977      5277.37         7.66934
>  200   200   200    0.0049895      3206.74      0.00101729      15728.1        0.236755
>  300   300   300    0.0150318      3592.39      0.00294536      18333.9               0
>  400   400   400    0.0306356      4178.14      0.00495991      25806.9      0.00179347
>  500   500   500     0.052443      4767.08      0.00784496      31867.6     0.000927111
>  600   600   600    0.0824758       5237.9       0.0127618      33850.9     8.49298e-18
>  700   700   700     0.128639      5332.73       0.0192815      35578.1       0.0107258
>  800   800   800     0.192102      5330.51       0.0294848      34729.7      0.00548943
>  900   900   900     0.272509      5350.28       0.0407217        35804     2.34506e-18
> 1000  1000  1000     0.372414      5370.37       0.0557306      35886.9      0.00180032
> 1100  1100  1100     0.495172      5375.91       0.0759043      35070.5      0.00111911
> 1200  1200  1200     0.643628      5369.56       0.0936622      36898.5     7.13231e-19
> 1300  1300  1300     0.831523      5284.28        0.119804      36676.5     0.000485489
> 1400  1400  1400      1.07165      5121.07        0.148407      36979.5     0.000334856
> 1500  1500  1500      1.26523      5334.99        0.179347      37636.6     2.74426e-19
> 1600  1600  1600      1.57777      5192.13        0.224779      36444.7      0.00017181
> 1700  1700  1700      2.67335      3675.54        0.259066      37928.6     0.000126808
> 1800  1800  1800      3.35908      3472.38        0.305459      38185.1     1.22716e-19
> ^C
> 
> (C) Your code with the parameters you suggested.
> 
> [lehn_at_node042 session5]$ g++ -mfma -Wall -Ofast -I ../boost_1_60_0/ -std=c++11 -DHAVE_GCCVEC -DBS_D_MR=2 -DBS_D_NR=16 -DBS_D_KC=512 -DNDEBUG matprod.cc
> [lehn_at_node042 session5]$ ./a.out
> #   m     n     k  uBLAS:   t1       MFLOPS   Blocked:   t2      MFLOPS        Res
>  100   100   100  0.000991205      2017.75     0.000371863      5378.32               0
>  200   200   200   0.00492544      3248.44     0.000893081      17915.5               0
>  300   300   300    0.0149147       3620.6      0.00288725      18702.9               0
>  400   400   400    0.0304561      4202.77      0.00533248      24003.9               0
>  500   500   500    0.0519717      4810.31       0.0094423      26476.6               0
>  600   600   600    0.0815292      5298.72       0.0160959      26839.2     8.50004e-18
>  700   700   700     0.127243      5391.25       0.0241644      28388.9     5.82176e-18
>  800   800   800     0.190188      5384.16       0.0379423      26988.3     3.67296e-18
>  900   900   900     0.270269      5394.62       0.0526334      27701.1     2.34234e-18
> 1000  1000  1000     0.369451      5413.44       0.0710608      28144.9     1.53537e-18
> 1100  1100  1100     0.491269      5418.62       0.0953915        27906     1.03025e-18
> 1200  1200  1200     0.638369       5413.8        0.119802      28847.5     7.13779e-19
> 1300  1300  1300     0.822215       5344.1         0.15489      28368.5     5.07376e-19
> 1400  1400  1400      1.03643      5295.12        0.191441      28666.8     3.69364e-19
> 1500  1500  1500      1.26753      5325.32        0.231101        29208     2.74895e-19
> 1600  1600  1600      1.60965      5089.31        0.289249      28321.6      2.0631e-19
> 1700  1700  1700      2.67171      3677.79        0.333099      29498.7     1.57713e-19
> 1800  1800  1800      3.34254      3489.56        0.397267      29360.6      1.2262e-19
> ^C
> [lehn_at_node042 session5]$
> 
> 
> 
> Cheers,
> 
> Michael
> 
> 
> On 29 Jan 2016, at 23:31, palik imre <imre_palik_at_[hidden]> wrote:
> 
>> Here is a more sane kernel using gcc simd vectors:
>> 
>> template <typename Index, typename T>
>> typename std::enable_if<std::is_floating_point<T>::value,
>>        void>::type
>> ugemm(Index kc, T alpha,
>>          const T *A, const T *B,
>>          T beta,
>>          T *C, Index incRowC, Index incColC)
>> {
>>   static const Index MR = BlockSize<T>::MR;
>>   static const Index NR = BlockSize<T>::NR;
>>   static const unsigned div = 32/sizeof(T);
>>   typedef T vx __attribute__((vector_size (32)));
>> 
>>   vx P[MR*NR/div] __attribute__ ((aligned (128))) = {};
>>   const vx *B_ = (vx *)B;
>>   for (Index l=0; l<kc; ++l) {
>>     for (Index j=0; j<(NR/div); ++j) {
>>       for (Index i=0; i<MR; ++i) {
>>         P[i * NR/div + j] += A[i + l * MR]*B_[l*(NR/div)+j];
>>       }
>>     }
>>   }
>> 
>>   T *P_ = (T *)P;
>>   for (Index i=0; i<MR; ++i) {
>>     for (Index j=0; j<NR; ++j) {
>>       C[i*incRowC+j*incColC] *= beta;
>>       C[i*incRowC+j*incColC] += alpha*P_[i * NR + j];
>>     }
>>   }
>> }
>> 
>> 
>> this plugs nicely to Michael's code, and slightly faster than my previous try (but still nowhere near to Michael's hand crafted kernel).
>> 
>> The good blocksize for double is MR=2, NR=16, KC=512
>> multiply MR and NR by 2 for float.
>> 
>> 
>> Cheers,
>> 
>> Imre
>> 
> 
> _______________________________________________
> ublas mailing list
> ublas_at_[hidden]
> http://listarchives.boost.org/mailman/listinfo.cgi/ublas
> Sent to: michael.lehn_at_[hidden]
>