Below is the C++ implementation comparing the time taken by Eigen and by a hand-written for-loop to perform matrix-matrix products. The for-loop has been optimised to minimise cache misses.
There is no need to mystify how a high-performance implementation of the matrix-matrix product can be achieved. In fact, we need more people who know about it in order to face future challenges in high-performance computing. To get into this topic, reading "BLIS: A Framework for Rapidly Instantiating BLAS Functionality" is a good starting point.
So, in order to demystify the topic and to answer the question (how to write a matrix-matrix product that can compete with Eigen), I extended the code posted by ggael to a total of 400 lines. I just tested it on an AVX machine (Intel(R) Core(TM) i5-3470 CPU @ 3.20GHz). Here are some results:
g++-5.3 -O3 -DNDEBUG -std=c++11 -mavx -m64 -I ../eigen.3.2.8/ gemm.cc -lrt
lehn@heim:~/work/test_eigen$ ./a.out 500
Time taken by Eigen is: 0.0190425
Time taken by for-loop is: 0.0121688
lehn@heim:~/work/test_eigen$ ./a.out 1000
Time taken by Eigen is: 0.147991
Time taken by for-loop is: 0.0959097
lehn@heim:~/work/test_eigen$ ./a.out 1500
Time taken by Eigen is: 0.492858
Time taken by for-loop is: 0.322442
lehn@heim:~/work/test_eigen$ ./a.out 5000
Time taken by Eigen is: 18.3666
Time taken by for-loop is: 12.1023
If you have FMA you can compile with
g++-5.3 -O3 -DNDEBUG -std=c++11 -mfma -m64 -I ../eigen.3.2.8/ -DHAVE_FMA gemm.cc -lrt
If you also want multithreading with OpenMP, compile with -fopenmp as well.
Here is the complete code, based on the ideas of the BLIS paper. It is self-contained, except that it needs the complete Eigen source files, as ggael already noted:
#include
#include
#include
#if defined(_OPENMP)
#include
#endif
//-- malloc with alignment --------------------------------------------------------
/// Allocate `size` usable bytes at an address that is a multiple of
/// `alignment`.
///
/// The request is padded by `alignment` bytes. The pointer returned by
/// std::malloc is stored in the void* slot immediately below the returned
/// address so that free_() can recover and release it.
///
/// @param alignment requested alignment in bytes. It must be a power of two,
///                  because the rounding below uses a bit mask. It is raised
///                  to at least alignof(void*) so the bookkeeping slot fits
///                  and is itself aligned.
/// @param size      number of usable bytes requested.
/// @return the aligned pointer, or nullptr if the padded size would overflow
///         or std::malloc fails. Release it with free_(), not std::free().
void *
malloc_(std::size_t alignment, std::size_t size)
{
    alignment = std::max(alignment, alignof(void *));

    // Adding the padding must not wrap around std::size_t.
    if (size > static_cast<std::size_t>(-1) - alignment) {
        return nullptr;
    }
    size += alignment;

    void *ptr = std::malloc(size);
    if (ptr == nullptr) {
        // Without this check the bookkeeping store below would write through
        // an address derived from a null pointer.
        return nullptr;
    }

    // Round up to the next multiple of `alignment` strictly above `ptr`; this
    // leaves at least sizeof(void*) bytes in front for the bookkeeping slot.
    const std::uintptr_t raw = reinterpret_cast<std::uintptr_t>(ptr);
    void *ptr2 = reinterpret_cast<void *>((raw + alignment) & ~(alignment - 1));

    void **vp = static_cast<void **>(ptr2) - 1;
    *vp = ptr;
    return ptr2;
}
/// Release a block previously returned by malloc_().
///
/// The pointer originally obtained from std::malloc is read from the void*
/// slot immediately below `ptr` and handed to std::free.
///
/// @param ptr pointer returned by malloc_(), or nullptr. A null pointer is
///            ignored, mirroring std::free; this also makes it safe to pass
///            the result of a failed allocation.
void
free_(void *ptr)
{
    if (ptr == nullptr) {
        return;
    }
    std::free(*(static_cast<void **>(ptr) - 1));
}
//-- Config --------------------------------------------------------------------
// Every setting below is wrapped in #ifndef, so a value that is already
// defined (for example with -DNAME=value on the compiler command line) takes
// precedence over the default given here.
//
// SIMD-Register width in bits
// SSE: 128
// AVX/FMA: 256
// AVX-512: 512
#ifndef SIMD_REGISTER_WIDTH
#define SIMD_REGISTER_WIDTH 256
#endif
// Block sizes used when HAVE_FMA is defined. The BS_D_* macros feed the
// MR/NR/MC/KC/NC constants of the BlockSize specialization further down
// (presumably "D" stands for double -- confirm).
// Compared with the generic defaults below, this branch uses NR = 12 and
// KC = 512, and NC = 4092 instead of 4096: 4092 is a multiple of 12, which the
// "NC must be a multiple of NR" static_assert in BlockSize requires.
#ifdef HAVE_FMA
# ifndef BS_D_MR
# define BS_D_MR 4
# endif
# ifndef BS_D_NR
# define BS_D_NR 12
# endif
# ifndef BS_D_MC
# define BS_D_MC 256
# endif
# ifndef BS_D_KC
# define BS_D_KC 512
# endif
# ifndef BS_D_NC
# define BS_D_NC 4092
# endif
#endif
// Generic defaults. When HAVE_FMA is defined all five macros are already set
// above, so none of these definitions takes effect.
#ifndef BS_D_MR
#define BS_D_MR 4
#endif
#ifndef BS_D_NR
#define BS_D_NR 8
#endif
#ifndef BS_D_MC
#define BS_D_MC 256
#endif
#ifndef BS_D_KC
#define BS_D_KC 256
#endif
#ifndef BS_D_NC
#define BS_D_NC 4096
#endif
/// Compile-time blocking parameters for element type `T` (primary template).
///
/// The template parameter list was missing, leaving `T` undeclared; it is
/// restored here so the definition is well-formed.
///
/// Members:
///  - MC, KC, NC: block sizes (64, 64, 256).
///  - MR, NR:     micro-block sizes (8, 8); MC and NC must be multiples of
///                them, which the static_asserts enforce.
///  - rwidth:     SIMD register width in bits; 0 here.
///  - align:      alignment in bytes; alignof(T) here.
///  - vlen:       elements per SIMD register; 0 here, which is the value code
///                in this file tests (`vlen != 0`) to select a SIMD variant.
template <typename T>
struct BlockSize
{
    static constexpr int MC = 64;
    static constexpr int KC = 64;
    static constexpr int NC = 256;
    static constexpr int MR = 8;
    static constexpr int NR = 8;

    static constexpr int rwidth = 0;
    static constexpr int align  = alignof(T);
    static constexpr int vlen   = 0;

    static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size.");
    static_assert(MC % MR == 0, "MC must be a multiple of MR.");
    static_assert(NC % NR == 0, "NC must be a multiple of NR.");
};
template <>
struct BlockSize
{
static constexpr int MC = BS_D_MC;
static constexpr int KC = BS_D_KC;
static constexpr int NC = BS_D_NC;
static constexpr int MR = BS_D_MR;
static constexpr int NR = BS_D_NR;
static constexpr int rwidth = SIMD_REGISTER_WIDTH;
static constexpr int align = rwidth / 8;
static constexpr int vlen = rwidth / (8*sizeof(double));
static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size.");
static_assert(MC % MR == 0, "MC must be a multiple of MR.");
static_assert(NC % NR == 0, "NC must be a multiple of NR.");
static_assert(rwidth % sizeof(double) == 0, "SIMD register width not sane.");
};
//-- aux routines --------------------------------------------------------------
// NOTE(review): this definition is damaged in this copy. The template
// parameter list after `template` is missing, and everything after `j` on the
// last line (the loop condition, the loop body and the closing brace) is lost.
// Index, Alpha, TX and TY are presumably the missing template parameters.
// From the name and signature this presumably adds alpha*X to the m-by-n
// matrix Y, with incRow*/incCol* as element strides -- TODO confirm against
// the original source before relying on it.
template
void
geaxpy(Index m, Index n,
const Alpha &alpha,
const TX *X, Index incRowX, Index incColX,
TY *Y, Index incRowY, Index incColY)
{
for (Index j=0; j
// NOTE(review): this definition is damaged in this copy. Its template header
// is missing and the body is cut off after `j` on the last line.
// gemm() in this file calls it as gescal(m, n, beta, C, incRowC, incColC), so
// it operates in place on an m-by-n matrix X; presumably it scales X by alpha
// -- TODO confirm.
void
gescal(Index m, Index n,
const Alpha &alpha,
TX *X, Index incRowX, Index incColX)
{
// alpha == 0 is handled separately from the general case. The else-branch is
// not visible here; presumably it stores zeros instead of multiplying.
if (alpha!=Alpha(0)) {
for (Index j=0; j
// NOTE(review): this definition is damaged in this copy. The template header
// and the angle-bracketed arguments (after enable_if and BlockSize) are
// missing, and the body is cut off after `l` on the last line.
// The return type is an enable_if on `vlen != 0`, so this overload is only
// meant to participate when BlockSize reports a non-zero vector length.
// Presumably this is the micro-kernel updating one MR x NR block of C from
// packed panels A and B over kc steps -- TODO confirm.
typename std::enable_if::vlen != 0,
void>::type
ugemm(Index kc, T alpha, const T *A, const T *B, T beta,
T *C, Index incRowC, Index incColC)
{
// vx is a GCC/Clang vector-extension type that is rwidth/8 bytes wide.
typedef T vx __attribute__((vector_size (BlockSize::rwidth/8)));
static constexpr Index vlen = BlockSize::vlen;
static constexpr Index MR = BlockSize::MR;
// Unlike BlockSize's NR, this local NR is divided by vlen, i.e. it counts
// vectors rather than scalar elements.
static constexpr Index NR = BlockSize::NR/vlen;
// Tells the compiler that A and B are aligned to BlockSize's `align`; passing
// pointers that are not so aligned is undefined behaviour.
A = (const T*) __builtin_assume_aligned (A, BlockSize::align);
B = (const T*) __builtin_assume_aligned (B, BlockSize::align);
// MR*NR zero-initialised vectors; presumably the accumulators for the update.
vx P[MR*NR] = {};
for (Index l=0; l
// NOTE(review): this definition is damaged in this copy. Its template header
// and the angle-bracketed BlockSize arguments are missing, and the body is cut
// off after `j` on the last line.
// Presumably this is the macro-kernel that walks the packed mc x kc block A
// and kc x nc block B in MR x NR tiles -- TODO confirm.
void
mgemm(Index mc, Index nc, Index kc,
T alpha,
const T *A, const T *B,
Beta beta,
TC *C, Index incRowC, Index incColC)
{
const Index MR = BlockSize::MR;
const Index NR = BlockSize::NR;
// Number of MR-row and NR-column panels, rounded up.
const Index mp = (mc+MR-1) / MR;
const Index np = (nc+NR-1) / NR;
// Remainders; non-zero when the last panel is only partially filled.
const Index mr_ = mc % MR;
const Index nr_ = nc % NR;
// NOTE(review): C_ is declared before the parallel region below, so under
// OpenMP's default data-sharing rules it is shared by all threads. If the
// (missing) loop body writes to C_, that is a data race when built with
// -fopenmp -- verify against the original and make it per-thread if so.
T C_[MR*NR];
#pragma omp parallel for
for (Index j=0; j
// NOTE(review): this definition is damaged in this copy. Its template header
// and the BlockSize template argument are missing, and the body is cut off
// after `j` on the last line.
// Presumably copies the mc x kc block of A (strides incRowA/incColA) into the
// buffer p in panels of MR rows -- TODO confirm, including whether a partial
// last panel is zero-padded.
void
pack_A(Index mc, Index kc,
const TA *A, Index incRowA, Index incColA,
T *p)
{
Index MR = BlockSize::MR;
// Number of MR-row panels, rounded up.
Index mp = (mc+MR-1) / MR;
for (Index j=0; j
// NOTE(review): this definition is damaged in this copy. Its template header
// and the BlockSize template argument are missing, and the body is cut off
// after `l` on the last line.
// Presumably copies the kc x nc block of B (strides incRowB/incColB) into the
// buffer p in panels of NR columns -- TODO confirm, including whether a
// partial last panel is zero-padded.
void
pack_B(Index kc, Index nc,
const TB *B, Index incRowB, Index incColB,
T *p)
{
Index NR = BlockSize::NR;
// Number of NR-column panels, rounded up.
Index np = (nc+NR-1) / NR;
for (Index l=0; l
// NOTE(review): this definition is damaged in this copy. Its template header
// and all angle-bracketed template arguments are missing, and the body is cut
// off on the last line.
// Presumably computes C = beta*C + alpha*A*B for an m x k matrix A, a k x n
// matrix B and an m x n matrix C with arbitrary row/column strides -- TODO
// confirm against the original source.
void
gemm(Index m, Index n, Index k,
Alpha alpha,
const TA *A, Index incRowA, Index incColA,
const TB *B, Index incRowB, Index incColB,
Beta beta,
TC *C, Index incRowC, Index incColC)
{
// T is a std::common_type of types whose list was lost in extraction.
typedef typename std::common_type::type T;
const Index MC = BlockSize::MC;
const Index NC = BlockSize::NC;
const Index MR = BlockSize::MR;
const Index NR = BlockSize::NR;
const Index KC = BlockSize::KC;
// Number of MC/NC/KC blocks along each dimension, rounded up.
const Index mb = (m+MC-1) / MC;
const Index nb = (n+NC-1) / NC;
const Index kb = (k+KC-1) / KC;
// Remainders; non-zero when the last block in a dimension is partial.
const Index mc_ = m % MC;
const Index nc_ = n % NC;
const Index kc_ = k % KC;
// Scratch buffers of MC*KC (+MR) and KC*NC (+NR) elements, allocated with the
// alignment given by BlockSize. No null check on the results is visible.
T *A_ = (T*) malloc_(BlockSize::align, sizeof(T)*(MC*KC+MR));
T *B_ = (T*) malloc_(BlockSize::align, sizeof(T)*(KC*NC+NR));
// Nothing to multiply: only apply gescal to C with beta.
// NOTE(review): A_ and B_ were already allocated above and are not released
// on this early-return path, so both buffers leak. Move the allocations below
// this check or call free_() on both before returning.
if (alpha==Alpha(0) || k==0) {
gescal(m, n, beta, C, incRowC, incColC);
return;
}
// NOTE(review): the next line is extraction residue. The start of a loop over
// j is fused with `(1,10000000/N/N/N);`, which belongs to a later definition;
// everything in between (rest of gemm and the start of the benchmark code) is
// missing from this copy.
for (Index j=0; j(1,10000000/N/N/N);
// NOTE(review): tail of the benchmark driver. The start of the enclosing
// function, and the definitions of N, tries, rep, BENCH and myprod, are not
// visible in this copy.
// Two N x N operands filled by Eigen's Random() plus an N x N result matrix.
Eigen::MatrixXd a_E = Eigen::MatrixXd::Random(N,N);
Eigen::MatrixXd b_E = Eigen::MatrixXd::Random(N,N);
Eigen::MatrixXd c_E(N,N);
Eigen::BenchTimer t1, t2;
// t1 times Eigen's own product (noalias() assigns the product without Eigen's
// aliasing temporary); t2 times myprod on the raw data pointers of the same
// matrices.
BENCH(t1, tries, rep, c_E.noalias() = a_E*b_E );
BENCH(t2, tries, rep, myprod(c_E.data(), a_E.data(), b_E.data(), N));
// Report each timer's best() value; the "for-loop" line is the myprod result.
std::cout << "Time taken by Eigen is: " << t1.best() << "\n";
std::cout << "Time taken by for-loop is: " << t2.best() << "\n\n";
}