Fastest way to multiply an array of int64_t?

后端 未结 2 1895
天涯浪人
天涯浪人 2020-11-30 07:00

I want to vectorize the multiplication of two memory-aligned arrays. I didn't find any way to multiply 64*64 bit in AVX/AVX2, so I just did loop-unroll and AVX2 loads/stores...

2条回答
  •  感情败类
    2020-11-30 07:24

    If you're interested in SIMD 64bx64b to 64b (lower) operations here are the AVX and AVX2 solutions from Agner Fog's Vector Class Library. I would test these with arrays and see how it compares to what GCC does with a generic loop such as the one in Peter Cordes' answer.

    AVX (this version uses SSE intrinsics; you can still compile with -mavx to get VEX encoding).

    // vector operator * : multiply element by element
    // Returns a vector whose element i holds the low 64 bits of a[i] * b[i].
    // Only the low 64 bits are kept, so the result wraps modulo 2^64 and the
    // same code serves signed and unsigned 64-bit data.
    static inline Vec2q operator * (Vec2q const & a, Vec2q const & b) {
    #if INSTRSET >= 5   // SSE4.1 supported
        // instruction does not exist. Split into 32-bit multiplies:
        //   a*b mod 2^64 = aL*bL + ((aL*bH + aH*bL) << 32)
        __m128i bswap   = _mm_shuffle_epi32(b,0xB1);           // b0H,b0L,b1H,b1L (swap H<->L)
        __m128i prodlh  = _mm_mullo_epi32(a,bswap);            // a0Lb0H,a0Hb0L,a1Lb1H,a1Hb1L, 32 bit L*H products
        __m128i zero    = _mm_setzero_si128();                 // 0
        __m128i prodlh2 = _mm_hadd_epi32(prodlh,zero);         // a0Lb0H+a0Hb0L,a1Lb1H+a1Hb1L,0,0
        __m128i prodlh3 = _mm_shuffle_epi32(prodlh2,0x73);     // 0, a0Lb0H+a0Hb0L, 0, a1Lb1H+a1Hb1L
        __m128i prodll  = _mm_mul_epu32(a,b);                  // a0Lb0L,a1Lb1L, 64 bit unsigned products
        __m128i prod    = _mm_add_epi64(prodll,prodlh3);       // a0Lb0L+(a0Lb0H+a0Hb0L)<<32, a1Lb1L+(a1Lb1H+a1Hb1L)<<32
        return  prod;
    #else               // SSE2
        int64_t aa[2], bb[2];
        a.store(aa);                                           // split into elements
        b.store(bb);
        // Multiply as unsigned: an overflowing int64_t product is undefined
        // behaviour, whereas uint64_t arithmetic wraps modulo 2^64 and so
        // matches what the SIMD path above produces.
        return Vec2q((int64_t)((uint64_t)aa[0] * (uint64_t)bb[0]),
                     (int64_t)((uint64_t)aa[1] * (uint64_t)bb[1]));  // multiply elements separately
    #endif
    }
    

    AVX2

    // Element-wise multiply of two vectors of 64-bit integers; each result
    // element is the low 64 bits of the corresponding product.
    static inline Vec4q operator * (Vec4q const & a, Vec4q const & b) {
        // There is no 64x64->64 multiply instruction, so the product is
        // assembled from 32-bit pieces. For every 64-bit element x = xH:xL,
        //   a*b mod 2^64 = aL*bL + ((aL*bH + aH*bL) << 32)

        // aL*bL as a full 64-bit unsigned product in each element.
        const __m256i low_product = _mm256_mul_epu32(a, b);

        // Swap the 32-bit halves of every element of b so that one 32-bit
        // multiply yields both cross terms: aL*bH (low dword), aH*bL (high dword).
        const __m256i b_halves_swapped = _mm256_shuffle_epi32(b, 0xB1);
        const __m256i cross_terms      = _mm256_mullo_epi32(a, b_halves_swapped);

        // Add the two cross terms of each element pairwise; the zero operand
        // fills the remaining dwords with 0.
        const __m256i cross_sums = _mm256_hadd_epi32(cross_terms, _mm256_setzero_si256());

        // Move each sum into the high dword of its element and a zero into
        // the low dword, i.e. (aL*bH + aH*bL) << 32.
        const __m256i cross_shifted = _mm256_shuffle_epi32(cross_sums, 0x73);

        return _mm256_add_epi64(low_product, cross_shifted);
    }
    

    These functions work for signed and unsigned 64-bit integers. In your case, since q is constant within the loop, you don't need to recalculate some things every iteration, but your compiler will probably figure that out anyway.

提交回复
热议问题