Can't get over 50% max. theoretical performance on matrix multiply

Z boson

Packing

You appear to be packing the block of the A matrix too often. You do

rpack(locA, A + ii*n + kk, kc, mc, mr, n);

But this depends only on ii and kk, not on jj, yet it sits inside the inner loop over jj, so you repack the same block for every jj iteration. I don't think that's necessary. In my code I do the packing before the matrix multiplication. It's probably more efficient to pack inside the matrix multiplication while the values are still in the cache, but that's trickier to do. Packing is an O(n^2) operation and matrix multiplication is an O(n^3) operation, so packing outside the matrix multiplication costs little for large matrices (I know that from testing as well: commenting out the packing changes the efficiency by only a few percent). However, by repacking with rpack on every jj iteration you repeat an O(n^2) job O(n) times, effectively turning the packing into an O(n^3) operation.
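A minimal sketch of the fix, assuming your loop nest is ordered ii/kk/jj around the rpack call (nc and the loop bounds are placeholders for your blocking parameters):

for(int ii=0; ii<n; ii+=mc) {
    for(int kk=0; kk<n; kk+=kc) {
        rpack(locA, A + ii*n + kk, kc, mc, mr, n); // pack the A block once per (ii,kk)
        for(int jj=0; jj<n; jj+=nc) {
            // pack the B panel and call the micro-kernel; locA is simply reused here
        }
    }
}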

Wall Time

You want the wall time. On Unix the clock() function does not return the wall time (though it does on Windows with MSVC). Instead it returns the CPU time accumulated across all threads, so with OpenMP it grows with the number of threads. This is one of the most common errors I have seen on SO for OpenMP.

Use omp_get_wtime() to get the wall time.
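A minimal sketch of the pattern (it's the same one used in the timing loop of the full code below):

double dtime = omp_get_wtime();   // wall-clock start, in seconds
gemm(a,b,c,n,64);                 // the work being timed
dtime = omp_get_wtime() - dtime;  // elapsed wall time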

Note that I don't know how the clock() function behaves with MinGW or MinGW-w64 (they are separate projects). MinGW links against MSVCRT, so I would guess that clock() with MinGW returns the wall time as it does with MSVC. However, MinGW-w64 does not link against MSVCRT (as far as I understand it links against something like glibc), so it's possible that clock() in MinGW-w64 behaves the same as clock() does on Unix.

Hyper Threading

Hyper-threading works well for code that stalls the CPU often, which is actually most code, because it's very difficult to write code that never stalls the CPU. That's why Intel invented Hyper-Threading: it's easier to task-switch and give the CPU something else to do than to optimize the code. For highly optimized code, however, hyper-threading can actually give worse results; in my own matrix multiplication code that's certainly the case. Set the number of threads to the number of physical cores you have (two in your case).
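For example, on your machine (two physical cores):

omp_set_num_threads(2); // or set OMP_NUM_THREADS=2 in the environment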

My Code

Below is my code. I did not include the inner64 function here; you can find it at Difference in performance between MSVC and GCC for highly optimized matrix multiplication code (where it goes by the obnoxious and misleading name AddDot4x4_vec_block_8wide).

I wrote this code before reading the Goto paper and before reading Agner Fog's optimization manuals. You appear to reorder/pack the matrices in the main loop; that probably makes more sense. I don't think I reorder them the same way you do, and I only reorder one of the input matrices (B), not both as you do.

The performance of this code on my system (Xeon E5-1620 @ 3.6 GHz) with Linux and GCC is about 75% of the peak for this matrix size (4096x4096). Intel's MKL gets about 94% of the peak on my system for this matrix size, so there is clearly room for improvement.

#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <immintrin.h>

extern "C" void inner64(const float *a, const float *b, float *c);
void (*fp)(const float *a, const float *b, float *c) = inner64;

// Copy the n x n row-major matrix a into b so that each bs x bs block is
// contiguous in memory (block (i,j) starts at b[bs*bs*(nb*i+j)]).
void reorder(float * __restrict a, float * __restrict b, int n, int bs) {
    int nb = n/bs;
    #pragma omp parallel for
    for(int i=0; i<nb; i++) {
        for(int j=0; j<nb; j++) {
            for(int i2=0; i2<bs; i2++) {
                for(int j2=0; j2<bs; j2++) {
                    b[bs*bs*(nb*i+j) + bs*i2+j2]= a[bs*(i*n+j) + i2*n + j2];    
                }
            }
        }
    }
}

// Multiply one bs x bs block: run the micro-kernel once per row of the block.
inline void gemm_block(float * __restrict a, float * __restrict b, float * __restrict c, int n, int n2) {
    for(int i=0; i<n2; i++) {
        fp(&a[i*n], b, &c[i*n]);
    }
}

// C += A*B with bs x bs blocking; B is first repacked into contiguous blocks.
void gemm(float * __restrict a, float * __restrict b, float * __restrict c, int n, int bs) {
    int nb = n/bs;
    float *b2 = (float*)_mm_malloc(sizeof(float)*n*n,64);
    reorder(b,b2,n,bs);
    #pragma omp parallel for
    for(int i=0; i<nb; i++) {
        for(int j=0; j<nb; j++) {
            for(int k=0; k<nb; k++) {
                gemm_block(&a[bs*(i*n+k)],&b2[bs*bs*(k*nb+j)],&c[bs*(i*n+j)], n, bs);
            }
        }
    }
    _mm_free(b2);
}

int main() {
    // Theoretical peak in GFLOPS: 8 floats per AVX register * 2 flops per
    // cycle (one mul + one add) * 4 cores * 3.69 GHz.
    float peak = 1.0f*8*4*2*3.69f;
    const int n = 4096;
    float flop = 2.0f*n*n*n*1E-9f; // GFLOP per full matrix multiplication
    omp_set_num_threads(4);        // four physical cores on this system

    float *a = (float*)_mm_malloc(sizeof(float)*n*n,64);
    float *b = (float*)_mm_malloc(sizeof(float)*n*n,64);
    float *c = (float*)_mm_malloc(sizeof(float)*n*n,64);
    for(int i=0; i<n*n; i++) {
        a[i] = 1.0f*rand()/RAND_MAX;
        b[i] = 1.0f*rand()/RAND_MAX;
    }

    for(int i=0; i<n*n; i++) c[i] = 0;
    gemm(a,b,c,n,64); // warm up OpenMP so thread creation is not timed
    while(1) {
        for(int i=0; i<n*n; i++) c[i] = 0;
        double dtime = omp_get_wtime();
        gemm(a,b,c,n,64);   
        dtime = omp_get_wtime() - dtime;
        printf("time %.2f s, efficiency %.2f%%\n", dtime, 100*flop/dtime/peak);
    }
}
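To build this you also need the inner64 kernel from the linked answer. Assuming GCC on Linux and that the kernel lives in its own file (the file names here are placeholders), something like the following should work; -fopenmp is required for the omp_* calls and -mavx for the AVX intrinsics in the kernel:

g++ -O3 -mavx -fopenmp gemm.cpp kernel.cpp -o gemm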