How to write a matrix matrix product that can compete with Eigen?

前端 未结 3 985
伪装坚强ぢ
伪装坚强ぢ 2020-12-24 03:31

Below is the C++ implementation comparing the time taken by Eigen and by a for loop to perform matrix-matrix products. The for loop has been optimised to minimise cache misses. […]

3条回答
  •  青春惊慌失措
    2020-12-24 04:34

    There is no need to mystify how a high-performance implementation of the matrix-matrix product can be achieved. In fact, we need more people who know about it in order to face future challenges in high-performance computing. To get into this topic, reading "BLIS: A Framework for Rapidly Instantiating BLAS Functionality" is a good starting point.

    So, in order to demystify the topic and to answer the question (How to write a matrix matrix product that can compete with Eigen), I extended the code posted by ggael to a total of 400 lines. I just tested it on an AVX machine (Intel(R) Core(TM) i5-3470 CPU @ 3.20GHz). Here are some results:

    g++-5.3 -O3 -DNDEBUG -std=c++11 -mavx -m64 -I ../eigen.3.2.8/ gemm.cc -lrt
    
    lehn@heim:~/work/test_eigen$ ./a.out 500
    Time taken by Eigen is: 0.0190425
    Time taken by for-loop is: 0.0121688
    
    lehn@heim:~/work/test_eigen$ ./a.out 1000
    Time taken by Eigen is: 0.147991
    Time taken by for-loop is: 0.0959097
    
    lehn@heim:~/work/test_eigen$ ./a.out 1500
    Time taken by Eigen is: 0.492858
    Time taken by for-loop is: 0.322442
    
    lehn@heim:~/work/test_eigen$ ./a.out 5000
    Time taken by Eigen is: 18.3666
    Time taken by for-loop is: 12.1023
    

    If you have FMA you can compile with

    g++-5.3 -O3 -DNDEBUG -std=c++11 -mfma -m64 -I ../eigen.3.2.8/ -DHAVE_FMA gemm.cc -lrt
    

    If you also want multithreading with OpenMP, additionally compile with -fopenmp.

    Here is the complete code, based on the ideas of the BLIS paper. It is self-contained except that it needs the complete Eigen source files, as ggael already noted:

    #include
    #include
    #include
    #if defined(_OPENMP)
    #include 
    #endif
    //-- malloc with alignment --------------------------------------------------------
    // Allocates `size` usable bytes starting at an address that is a multiple of
    // `alignment`.
    //
    // The request is padded by `alignment` bytes. The pointer obtained from
    // std::malloc is stored in the pointer-sized slot directly in front of the
    // returned address, which is where free_() reads it back from. The returned
    // pointer must therefore be released with free_(), not with std::free.
    //
    // alignment: requested alignment in bytes; raised to at least
    //            alignof(void *). The rounding mask below is only correct when
    //            this is a power of two.
    // size:      number of usable bytes requested.
    //
    // Returns the aligned pointer, or nullptr when the padded size does not fit
    // into std::size_t or std::malloc fails.
    void *
    malloc_(std::size_t alignment, std::size_t size)
    {
        alignment = std::max(alignment, alignof(void *));

        // Reject requests whose padded size would wrap around; a wrapped size
        // would produce a buffer far smaller than the caller asked for.
        if (size > static_cast<std::size_t>(-1) - alignment) {
            return nullptr;
        }
        size += alignment;

        void *raw = std::malloc(size);
        if (raw == nullptr) {
            // Without this check the bookkeeping store below would write through
            // an address derived from a null pointer.
            return nullptr;
        }

        // Step forward by `alignment`, then round down to a multiple of it. The
        // result is strictly greater than `raw`, which leaves room in front of
        // it for the bookkeeping slot.
        const uintptr_t shifted = reinterpret_cast<uintptr_t>(raw) + alignment;
        void *aligned = reinterpret_cast<void *>(shifted & ~(alignment - 1));

        // Remember the pointer that has to be handed to std::free later.
        void **slot = static_cast<void **>(aligned) - 1;
        *slot       = raw;
        return aligned;
    }
    
    // Releases a block that was obtained from malloc_().
    //
    // Reads the pointer stored in the slot directly in front of `ptr` and passes
    // it to std::free. A null `ptr` is ignored, mirroring std::free; without
    // the guard the slot lookup would read in front of a null pointer.
    void
    free_(void *ptr)
    {
        if (ptr == nullptr) {
            return;
        }
        std::free(*(static_cast<void **>(ptr) - 1));
    }
    
    //-- Config --------------------------------------------------------------------
    
    // Width of a SIMD register in bits. Typical values:
    //   SSE      128
    //   AVX/FMA  256
    //   AVX-512  512
    #if !defined(SIMD_REGISTER_WIDTH)
    #   define SIMD_REGISTER_WIDTH 256
    #endif

    // Defaults for the BS_D_* block sizes read by the BlockSize specialization
    // below. A value that is already defined when this point is reached is left
    // alone; otherwise the default depends on whether HAVE_FMA is defined.

    // MR: 4 in both configurations.
    #if !defined(BS_D_MR)
    #   define BS_D_MR 4
    #endif

    // NR: 12 with HAVE_FMA, 8 without.
    #if !defined(BS_D_NR)
    #   if defined(HAVE_FMA)
    #       define BS_D_NR 12
    #   else
    #       define BS_D_NR 8
    #   endif
    #endif

    // MC: 256 in both configurations.
    #if !defined(BS_D_MC)
    #   define BS_D_MC 256
    #endif

    // KC: 512 with HAVE_FMA, 256 without.
    #if !defined(BS_D_KC)
    #   if defined(HAVE_FMA)
    #       define BS_D_KC 512
    #   else
    #       define BS_D_KC 256
    #   endif
    #endif

    // NC: 4092 with HAVE_FMA, 4096 without. Both are multiples of the matching
    // NR default, which the static_asserts in BlockSize require (NC % NR == 0).
    #if !defined(BS_D_NC)
    #   if defined(HAVE_FMA)
    #       define BS_D_NC 4092
    #   else
    #       define BS_D_NC 4096
    #   endif
    #endif
    
    // Generic block-size configuration for an element type T.
    //
    // MC, KC, NC  sizes used by gemm() to split the m, k and n extents into
    //             blocks.
    // MR, NR      sizes used by mgemm() to split an mc x nc block into tiles.
    // rwidth      SIMD register width in bits; 0 in this generic version.
    // align       alignment requested for the packing buffers; alignof(T) here.
    // vlen        elements of T per SIMD register; 0 here. The ugemm overload
    //             visible in this file is enabled only when vlen != 0.
    //
    // The template parameter list was missing although the body refers to T;
    // it is restored here.
    template <typename T>
    struct BlockSize
    {
        static constexpr int MC = 64;
        static constexpr int KC = 64;
        static constexpr int NC = 256;
        static constexpr int MR = 8;
        static constexpr int NR = 8;

        static constexpr int rwidth = 0;
        static constexpr int align  = alignof(T);
        static constexpr int vlen   = 0;

        static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size.");
        static_assert(MC % MR == 0, "MC must be a multiple of MR.");
        static_assert(NC % NR == 0, "NC must be a multiple of NR.");
    };
    
    
    template <>
    struct BlockSize
    {
        static constexpr int MC     = BS_D_MC;
        static constexpr int KC     = BS_D_KC;
        static constexpr int NC     = BS_D_NC;
        static constexpr int MR     = BS_D_MR;
        static constexpr int NR     = BS_D_NR;
    
        static constexpr int rwidth = SIMD_REGISTER_WIDTH;
        static constexpr int align  = rwidth / 8;
        static constexpr int vlen   = rwidth / (8*sizeof(double));
    
        static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size.");
        static_assert(MC % MR == 0, "MC must be a multiple of MR.");
        static_assert(NC % NR == 0, "NC must be a multiple of NR.");
        static_assert(rwidth % sizeof(double) == 0, "SIMD register width not sane.");
    };
    
    //-- aux routines --------------------------------------------------------------
    template 
    void
    geaxpy(Index m, Index n,
           const Alpha &alpha,
           const TX *X, Index incRowX, Index incColX,
           TY       *Y, Index incRowY, Index incColY)
    {
        for (Index j=0; j
    void
    gescal(Index m, Index n,
           const Alpha &alpha,
           TX *X, Index incRowX, Index incColX)
    {
        if (alpha!=Alpha(0)) {
            for (Index j=0; j
    typename std::enable_if::vlen != 0,
             void>::type
    ugemm(Index kc, T alpha, const T *A, const T *B, T beta,
          T *C, Index incRowC, Index incColC)
    {
        typedef T vx __attribute__((vector_size (BlockSize::rwidth/8)));
    
        static constexpr Index vlen = BlockSize::vlen;
        static constexpr Index MR   = BlockSize::MR;
        static constexpr Index NR   = BlockSize::NR/vlen;
    
        A = (const T*) __builtin_assume_aligned (A, BlockSize::align);
        B = (const T*) __builtin_assume_aligned (B, BlockSize::align);
    
        vx P[MR*NR] = {};
    
        for (Index l=0; l
    void
    mgemm(Index mc, Index nc, Index kc,
          T alpha,
          const T *A, const T *B,
          Beta beta,
          TC *C, Index incRowC, Index incColC)
    {
        const Index MR = BlockSize::MR;
        const Index NR = BlockSize::NR;
        const Index mp  = (mc+MR-1) / MR;
        const Index np  = (nc+NR-1) / NR;
        const Index mr_ = mc % MR;
        const Index nr_ = nc % NR;
    
        T C_[MR*NR];
    
        #pragma omp parallel for
        for (Index j=0; j
    void
    pack_A(Index mc, Index kc,
           const TA *A, Index incRowA, Index incColA,
           T *p)
    {
        Index MR = BlockSize::MR;
        Index mp = (mc+MR-1) / MR;
    
        for (Index j=0; j
    void
    pack_B(Index kc, Index nc,
           const TB *B, Index incRowB, Index incColB,
           T *p)
    {
        Index NR = BlockSize::NR;
        Index np = (nc+NR-1) / NR;
    
        for (Index l=0; l
    void
    gemm(Index m, Index n, Index k,
         Alpha alpha,
         const TA *A, Index incRowA, Index incColA,
         const TB *B, Index incRowB, Index incColB,
         Beta beta,
         TC *C, Index incRowC, Index incColC)
    {
        typedef typename std::common_type::type  T;
    
        const Index MC = BlockSize::MC;
        const Index NC = BlockSize::NC;
        const Index MR = BlockSize::MR;
        const Index NR = BlockSize::NR;
    
        const Index KC = BlockSize::KC;
        const Index mb = (m+MC-1) / MC;
        const Index nb = (n+NC-1) / NC;
        const Index kb = (k+KC-1) / KC;
        const Index mc_ = m % MC;
        const Index nc_ = n % NC;
        const Index kc_ = k % KC;
    
        T *A_ = (T*) malloc_(BlockSize::align, sizeof(T)*(MC*KC+MR));
        T *B_ = (T*) malloc_(BlockSize::align, sizeof(T)*(KC*NC+NR));
    
        if (alpha==Alpha(0) || k==0) {
            gescal(m, n, beta, C, incRowC, incColC);
            return;
        }
    
        for (Index j=0; j(1,10000000/N/N/N);
    
      Eigen::MatrixXd a_E = Eigen::MatrixXd::Random(N,N);
      Eigen::MatrixXd b_E = Eigen::MatrixXd::Random(N,N);
      Eigen::MatrixXd c_E(N,N);
    
      Eigen::BenchTimer t1, t2;
    
      BENCH(t1, tries, rep, c_E.noalias() = a_E*b_E );
      BENCH(t2, tries, rep, myprod(c_E.data(), a_E.data(), b_E.data(), N));
    
      std::cout << "Time taken by Eigen is: " << t1.best() << "\n";
      std::cout << "Time taken by for-loop is: " << t2.best() << "\n\n";
    }
    

提交回复
热议问题