$include_dir="/home/hyper-archives/ublas/include"; include("$include_dir/msg-header.inc") ?>
Subject: Re: [ublas] Matrix multiplication performance
From: Michael Lehn (michael.lehn_at_[hidden])
Date: 2016-02-10 09:50:00
Hi Palik,
sorry for the late response.  It's the last week of the semester, so it is pretty busy.
It is pretty
cool that prefetching improves the results so significantly.  I will add this to the examples on my
page.
Today I added an example on how to implement a fast LU factorization with ublas.  The
implementation still needs polishing but the algorithms are pretty good:
        http://www.mathematik.uni-ulm.de/~lehn/test_ublas/session7/page01.html
Cheers,
Michael
On 03 Feb 2016, at 20:53, palik imre <imre_palik_at_[hidden]> wrote:
> Hi Michael,
> 
> first of all, the link on your page to test_ublas.tgz is still pointing to session4.tgz
> 
> I also realised that we are starting to become memory bound.
> I am getting approximately 10% improvement with the following setup on my old AMD box:
> 
> 
> // Micro-kernel: updates an MR x NR tile of C as
> //     C := beta*C + alpha * A*B
> // where A is a packed MR x kc panel (stored column by column, stride MR)
> // and B is a packed kc x NR panel, using GCC vector extensions for SIMD.
> // Enabled only for floating-point T via enable_if.
> template <typename Index, typename T>
> typename std::enable_if<std::is_floating_point<T>::value,
> void>::type
> ugemm(Index kc, T alpha,
>               const T *A, const T *B,
>     T beta,
>     T *C, Index incRowC, Index incColC)
> {
>   // The packing routines produce 128-byte-aligned panels; tell the
>   // compiler so it can emit aligned vector loads.
>   A = (const T*) __builtin_assume_aligned (A, 128);
>   B = (const T*) __builtin_assume_aligned (B, 128);
> 
>   static const unsigned div = 32/sizeof(T);     // elements per 32-byte vector
>   static const Index MR = BlockSize<T>::MR;     // micro-tile rows
>   static const Index NR = BlockSize<T>::NR/div; // micro-tile cols, in vectors
> 
>   // 32-byte-wide SIMD vector of T (GCC vector_size extension).
>   typedef T vx __attribute__((vector_size (32)));
> 
>   // Vector accumulator for the MR x NR micro-tile, zero-initialized.
>   vx P[MR*NR] __attribute__ ((aligned (128))) = {};
>   const vx *B_ = (const vx *)B;
>   for (Index l=0; l<kc; ++l) {
>     // NOTE(review): this condition is loop-invariant (kc never changes
>     // inside the loop), so the prefetch fires on every iteration or never;
>     // presumably `!(l&3)` (every 4th iteration) was intended — confirm.
>     if (!(kc&3))
>           __builtin_prefetch(A + 16, 0);
>       // Rank-1 update: P += A(:,l) * B(l,:), vectorized along the columns.
>       for (Index i=0; i<MR; ++i) {
>           for (Index j=0; j<(NR); ++j) {
>               P[i * NR + j] += A[i]*B_[j];
>           }
>       }
>       A += MR;  // next column of the packed A panel
>       B_ += NR; // next row of the packed B panel
>   }
> 
>   // Write back: scale C by beta, then accumulate alpha*P, honoring the
>   // general row/column strides of C.
>   T *P_ = (T *)P;
>   for (Index j=0; j<NR*div; ++j) {
>       for (Index i=0; i<MR; ++i) {
>           C[i*incRowC+j*incColC] *= beta;
>           C[i*incRowC+j*incColC] += alpha*P_[i * NR * div + j];
>       }
>   }
> }
> 
> // Macro-kernel: multiplies a packed mc x kc block of A by a packed
> // kc x nc block of B, updating the corresponding mc x nc block of C
> // (general strides incRowC/incColC):  C := beta*C + alpha*A*B.
> // Full MR x NR tiles are handed straight to ugemm; rim tiles (mr<MR or
> // nr<NR) are computed into a local buffer C_ and then folded into C.
> template <typename Index, typename T, typename Beta, typename TC>
> void
> mgemm(Index mc, Index nc, Index kc,
>      T alpha,
>      const T *A, const T *B,
>      Beta beta,
>      TC *C, Index incRowC, Index incColC)
> {
>    const Index MR = BlockSize<T>::MR;
>    const Index NR = BlockSize<T>::NR;
>    const Index mp  = (mc+MR-1) / MR;  // number of row tiles (ceil division)
>    const Index np  = (nc+NR-1) / NR;  // number of column tiles (ceil division)
>    const Index mr_ = mc % MR;         // rows in last tile; 0 if it is full
>    const Index nr_ = nc % NR;         // cols in last tile; 0 if it is full
> 
>    // Column panels of C are disjoint, so the j-loop parallelizes safely.
>    #if defined(_OPENMP)
>    #pragma omp parallel for
>    #endif
>    for (Index j=0; j<np; ++j) {
>        const Index nr = (j!=np-1 || nr_==0) ? NR : nr_;
>        // Scratch tile for the rim case; declared inside the parallel loop
>        // so each OpenMP thread gets its own private copy.
>        T C_[BlockSize<T>::MR*BlockSize<T>::NR];
>        // Warm the cache with the start of this packed B panel.
>        __builtin_prefetch(B + j * kc * NR, 0);
>        __builtin_prefetch(B + j * kc * NR + 16, 0);
>        for (Index i=0; i<mp; ++i) {
>            const Index mr = (i!=mp-1 || mr_==0) ? MR : mr_;
>            __builtin_prefetch(A + i * kc * MR, 0);
>            if (mr==MR && nr==NR) {
>                // Full tile: ugemm writes the result directly into C.
>                ugemm(kc, alpha,
>                      &A[i*kc*MR], &B[j*kc*NR],
>                      beta,
>                      &C[i*MR*incRowC+j*NR*incColC],
>                      incRowC, incColC);
>            } else {
>                // Rim tile: compute alpha*A*B into a zeroed full-size
>                // buffer, then apply C := beta*C (gescal) and C += C_
>                // (geaxpy) restricted to the valid mr x nr sub-block.
>                std::fill_n(C_, MR*NR, T(0));
>                ugemm(kc, alpha,
>                      &A[i*kc*MR], &B[j*kc*NR],
>                      T(0),
>                      C_, Index(1), MR);
>                gescal(mr, nr, beta,
>                       &C[i*MR*incRowC+j*NR*incColC],
>                       incRowC, incColC);
>                geaxpy(mr, nr, T(1), C_, Index(1), MR,
>                       &C[i*MR*incRowC+j*NR*incColC],
>                       incRowC, incColC);
>            }
>        }
>    }
> }
>