Ublas mailing page: Re: [ublas] Matrix multiplication performance

Date view	Thread view	Subject view	Author view

Subject: Re: [ublas] Matrix multiplication performance
From: Karl Rupp (rupp_at_[hidden])
Date: 2016-02-02 10:32:39

Next message: Michael Lehn: "Re: [ublas] Matrix multiplication performance"
Previous message: palik imre: "Re: [ublas] Matrix multiplication performance"
In reply to: Michael Lehn: "Re: [ublas] Matrix multiplication performance"
Next in thread: Michael Lehn: "Re: [ublas] Matrix multiplication performance"
Reply: Michael Lehn: "Re: [ublas] Matrix multiplication performance"

Hi guys,

I'm sorry to interrupt your performance tuning fun, but please note that
the kernel does not work correctly. To verify, consider the case that
beta is equal to zero and that C initially consists of NaNs (which may
happen in practice if you start if a freshly malloc'ed buffer)...

Best regards,
Karli

On 01/30/2016 05:48 PM, Michael Lehn wrote:
> Here the same in slightly more general form
>
> template <typename Index>
> typename std::enable_if<std::is_convertible<Index, std::int64_t>::value
> && BlockSize<double>::MR==4
> && BlockSize<double>::NR==12
> && BlockSize<double>::align==32,
> void>::type
> ugemm(Index kc, double alpha,
> const double *A, const double *B,
> double beta,
> double *C, Index incRowC, Index incColC)
> {
> static const Index MR = BlockSize<double>::MR;
> static const Index NR = BlockSize<double>::NR/4;
> typedef double vx __attribute__((vector_size (4*sizeof(double))));
>
> A = (const double*) __builtin_assume_aligned (A, 32);
> B = (const double*) __builtin_assume_aligned (B, 32);
>
> vx P[MR*NR] = {};
>
> for (Index l=0; l<kc; ++l) {
> const vx *b = (const vx *)B;
>
> for (Index i=0; i<MR; ++i) {
> for (Index j=0; j<NR; ++j) {
> P[i*NR+j] += A[i]*b[j];
> }
> }
> A += MR;
> B += NR*4;
> }
>
> if (alpha!=double(1)) {
> for (Index i=0; i<MR; ++i) {
> for (Index j=0; j<NR; ++j) {
> P[i*NR+j] *= alpha;
> }
> }
> }
>
> for (Index i=0; i<MR; ++i) {
> for (Index j=0; j<NR; ++j) {
> const double *p = (const double *) &P[i*NR+j];
> for (Index j1=0; j1<4; ++j1) {
> C[i*incRowC+(j*4+j1)*incColC] *= beta;
> C[i*incRowC+(j*4+j1)*incColC] += p[j1];
> }
> }
> }
> }
>
> On 30 Jan 2016, at 13:49, Michael Lehn <michael.lehn_at_[hidden]> wrote:
>
>> Ok, and this version for MR=4, NR=12 even beats the asm kernel on Haswell:
>>
>> //-- Micro Kernel --------------------------------------------------------------
>> template <typename Index>
>> typename std::enable_if<std::is_convertible<Index, std::int64_t>::value
>> && BlockSize<double>::MR==4
>> && BlockSize<double>::NR==12
>> && BlockSize<double>::align==32,
>> void>::type
>> ugemm(Index kc, double alpha,
>> const double *A, const double *B,
>> double beta,
>> double *C, Index incRowC, Index incColC)
>> {
>> static const Index MR = BlockSize<double>::MR;
>> static const Index NR = BlockSize<double>::NR;
>> typedef double vx __attribute__((vector_size (4*sizeof(double))));
>>
>> A = (const double*) __builtin_assume_aligned (A, 32);
>> B = (const double*) __builtin_assume_aligned (B, 32);
>>
>> vx P0_03 = {}; vx P0_47 = {}; vx P0_811 = {};
>> vx P1_03 = {}; vx P1_47 = {}; vx P1_811 = {};
>> vx P2_03 = {}; vx P2_47 = {}; vx P2_811 = {};
>> vx P3_03 = {}; vx P3_47 = {}; vx P3_811 = {};
>>
>> for (Index l=0; l<kc; ++l) {
>> const vx *b = (const vx *)B;
>>
>> P0_03 += A[0]*b[0]; P0_47 += A[0]*b[1]; P0_811 += A[0]*b[2];
>> P1_03 += A[1]*b[0]; P1_47 += A[1]*b[1]; P1_811 += A[1]*b[2];
>> P2_03 += A[2]*b[0]; P2_47 += A[2]*b[1]; P2_811 += A[2]*b[2];
>> P3_03 += A[3]*b[0]; P3_47 += A[3]*b[1]; P3_811 += A[3]*b[2];
>> A += MR;
>> B += NR;
>> }
>>
>> P0_03 *= alpha; P0_47 *= alpha; P0_811 *= alpha;
>> P1_03 *= alpha; P1_47 *= alpha; P1_811 *= alpha;
>> P2_03 *= alpha; P2_47 *= alpha; P2_811 *= alpha;
>> P3_03 *= alpha; P3_47 *= alpha; P3_811 *= alpha;
>>
>> if (beta!=double(1)) {
>> for (Index i=0; i<MR; ++i) {
>> for (Index j=0; j<NR; ++j) {
>> C[i*incRowC+j*incColC] *= beta;
>> }
>> }
>> }
>>
>> const double *p = (const double *) &P0_03;
>> C[0*incRowC+0*incColC] += p[0];
>> C[0*incRowC+1*incColC] += p[1];
>> C[0*incRowC+2*incColC] += p[2];
>> C[0*incRowC+3*incColC] += p[3];
>>
>> p = (const double *) &P0_47;
>> C[0*incRowC+4*incColC] += p[0];
>> C[0*incRowC+5*incColC] += p[1];
>> C[0*incRowC+6*incColC] += p[2];
>> C[0*incRowC+7*incColC] += p[3];
>>
>> p = (const double *) &P0_811;
>> C[0*incRowC+8*incColC] += p[0];
>> C[0*incRowC+9*incColC] += p[1];
>> C[0*incRowC+10*incColC] += p[2];
>> C[0*incRowC+11*incColC] += p[3];
>>
>> p = (const double *) &P1_03;
>> C[1*incRowC+0*incColC] += p[0];
>> C[1*incRowC+1*incColC] += p[1];
>> C[1*incRowC+2*incColC] += p[2];
>> C[1*incRowC+3*incColC] += p[3];
>>
>> p = (const double *) &P1_47;
>> C[1*incRowC+4*incColC] += p[0];
>> C[1*incRowC+5*incColC] += p[1];
>> C[1*incRowC+6*incColC] += p[2];
>> C[1*incRowC+7*incColC] += p[3];
>>
>> p = (const double *) &P1_811;
>> C[1*incRowC+8*incColC] += p[0];
>> C[1*incRowC+9*incColC] += p[1];
>> C[1*incRowC+10*incColC] += p[2];
>> C[1*incRowC+11*incColC] += p[3];
>>
>> p = (const double *) &P2_03;
>> C[2*incRowC+0*incColC] += p[0];
>> C[2*incRowC+1*incColC] += p[1];
>> C[2*incRowC+2*incColC] += p[2];
>> C[2*incRowC+3*incColC] += p[3];
>>
>> p = (const double *) &P2_47;
>> C[2*incRowC+4*incColC] += p[0];
>> C[2*incRowC+5*incColC] += p[1];
>> C[2*incRowC+6*incColC] += p[2];
>> C[2*incRowC+7*incColC] += p[3];
>>
>> p = (const double *) &P2_811;
>> C[2*incRowC+8*incColC] += p[0];
>> C[2*incRowC+9*incColC] += p[1];
>> C[2*incRowC+10*incColC] += p[2];
>> C[2*incRowC+11*incColC] += p[3];
>>
>> p = (const double *) &P3_03;
>> C[3*incRowC+0*incColC] += p[0];
>> C[3*incRowC+1*incColC] += p[1];
>> C[3*incRowC+2*incColC] += p[2];
>> C[3*incRowC+3*incColC] += p[3];
>>
>> p = (const double *) &P3_47;
>> C[3*incRowC+4*incColC] += p[0];
>> C[3*incRowC+5*incColC] += p[1];
>> C[3*incRowC+6*incColC] += p[2];
>> C[3*incRowC+7*incColC] += p[3];
>>
>> p = (const double *) &P3_811;
>> C[3*incRowC+8*incColC] += p[0];
>> C[3*incRowC+9*incColC] += p[1];
>> C[3*incRowC+10*incColC] += p[2];
>> C[3*incRowC+11*incColC] += p[3];
>>
>> }
>
> _______________________________________________
> ublas mailing list
> ublas_at_[hidden]
> http://listarchives.boost.org/mailman/listinfo.cgi/ublas
> Sent to: rupp_at_[hidden]
>

Next message: Michael Lehn: "Re: [ublas] Matrix multiplication performance"
Previous message: palik imre: "Re: [ublas] Matrix multiplication performance"
In reply to: Michael Lehn: "Re: [ublas] Matrix multiplication performance"
Next in thread: Michael Lehn: "Re: [ublas] Matrix multiplication performance"
Reply: Michael Lehn: "Re: [ublas] Matrix multiplication performance"

Date view	Thread view	Subject view	Author view