What's the fastest stride-3 gather instruction sequence?

Asked by 上瘾入骨i on 2020-12-09 19:03

The question:

What is the most efficient sequence to generate a stride-3 gather of 32-bit elements from memory? If the memory is arranged as:

MEM = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5 G5 B5 R6 G6 B6 R7 G7 B7

then we want to obtain the registers:

YMM0 = R0 R1 R2 R3 R4 R5 R6 R7
YMM1 = G0 G1 G2 G3 G4 G5 G6 G7
YMM2 = B0 B1 B2 B3 B4 B5 B6 B7

2 Answers
  •  执笔经年
    2020-12-09 19:25

    The 8-bit integer case.

    As already mentioned in the comments above, two-input shuffle instructions such as vshufps don't exist at 8-bit granularity (a short illustration of the 32-bit case follows). Hence, the 8-bit solution differs a bit from the 32-bit solution. Two different solutions are described below.
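
    For contrast, here is a minimal illustration (my addition, not part of the original answer) of the kind of two-input shuffle that does exist at 32-bit granularity:

    #include <immintrin.h>

    /* vshufps selects two dwords from each source: per 128-bit lane the
       result here is { a[1], a[0], b[2], b[3] }. AVX2 offers no byte-level
       equivalent of this two-source selection.                             */
    __m256i two_input_shuffle32(__m256i a, __m256i b) {
        return _mm256_castps_si256(
                   _mm256_shuffle_ps(_mm256_castsi256_ps(a),
                                     _mm256_castsi256_ps(b),
                                     _MM_SHUFFLE(3,2,0,1)));
    }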


    A straightforward approach is to group the 8-bit integers 'color by color (R G B)' with six vpblendvb blends, followed by a vpshufb permutation:

    #include <stdio.h>
    #include <immintrin.h>
    /*  gcc -O3 -Wall -m64 -march=broadwell stride_3.c   */
    int __attribute__ ((noinline)) print_vec_char(__m256i x);
    
    int main() {
    
        char *m;
        int i;
        __m256i blnd1 = _mm256_set_epi8(0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,     0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0);
        __m256i blnd2 = _mm256_set_epi8(0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,     0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0);
        __m256i p0    = _mm256_set_epi8(13,10,7,4,1, 14,11,8,5,2, 15,12,9,6,3,0,    13,10,7,4,1, 14,11,8,5,2, 15,12,9,6,3,0);
        __m256i p1    = _mm256_set_epi8(14,11,8,5,2, 15,12,9,6,3,0, 13,10,7,4,1,    14,11,8,5,2, 15,12,9,6,3,0, 13,10,7,4,1);
        __m256i p2    = _mm256_set_epi8(15,12,9,6,3,0, 13,10,7,4,1, 14,11,8,5,2,    15,12,9,6,3,0, 13,10,7,4,1, 14,11,8,5,2);
                m     = _mm_malloc(96,32);
        for(i = 0; i < 96; i++) m[i] = i;
    
    //        printf("m_lo  ");print_vec_char(_mm256_load_si256((__m256i*)&m[0]));printf("m_mid ");print_vec_char(_mm256_load_si256((__m256i*)&m[32]));printf("m_hi  ");print_vec_char(_mm256_load_si256((__m256i*)&m[64]));printf("\n");
    //        m_lo   31  30  29  28 |  27  26  25  24 |  23  22  21  20 |  19  18  17  16 ||  15  14  13  12 |  11  10   9   8 |   7   6   5   4 |   3   2   1   0 
    //        m_mid  63  62  61  60 |  59  58  57  56 |  55  54  53  52 |  51  50  49  48 ||  47  46  45  44 |  43  42  41  40 |  39  38  37  36 |  35  34  33  32 
    //        m_hi   95  94  93  92 |  91  90  89  88 |  87  86  85  84 |  83  82  81  80 ||  79  78  77  76 |  75  74  73  72 |  71  70  69  68 |  67  66  65  64 
    
        __m256i t0 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&m[0]));
        __m256i t1 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&m[16]));
        __m256i t2 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&m[32]));
    
                t0 = _mm256_inserti128_si256(t0,_mm_loadu_si128((__m128i*)&m[48]),1);
                t1 = _mm256_inserti128_si256(t1,_mm_loadu_si128((__m128i*)&m[64]),1);
                t2 = _mm256_inserti128_si256(t2,_mm_loadu_si128((__m128i*)&m[80]),1);
    
    //        printf("t0  ");print_vec_char(t0);printf("t1  ");print_vec_char(t1);printf("t2  ");print_vec_char(t2);printf("\n");
    //        t0   63  62  61  60 |  59  58  57  56 |  55  54  53  52 |  51  50  49  48 ||  15  14  13  12 |  11  10   9   8 |   7   6   5   4 |   3   2   1   0 
    //        t1   79  78  77  76 |  75  74  73  72 |  71  70  69  68 |  67  66  65  64 ||  31  30  29  28 |  27  26  25  24 |  23  22  21  20 |  19  18  17  16 
    //        t2   95  94  93  92 |  91  90  89  88 |  87  86  85  84 |  83  82  81  80 ||  47  46  45  44 |  43  42  41  40 |  39  38  37  36 |  35  34  33  32 
    
        __m256i u0 = _mm256_blendv_epi8( _mm256_blendv_epi8(t0,t1,blnd2), t2,blnd1);
        __m256i u1 = _mm256_blendv_epi8( _mm256_blendv_epi8(t1,t2,blnd2), t0,blnd1);
        __m256i u2 = _mm256_blendv_epi8( _mm256_blendv_epi8(t2,t0,blnd2), t1,blnd1);
    
    //        printf("u0  ");print_vec_char(u0);printf("u1  ");print_vec_char(u1);printf("u2  ");print_vec_char(u2);printf("\n");
    //        u0   63  78  93  60 |  75  90  57  72 |  87  54  69  84 |  51  66  81  48 ||  15  30  45  12 |  27  42   9  24 |  39   6  21  36 |   3  18  33   0 
    //        u1   79  94  61  76 |  91  58  73  88 |  55  70  85  52 |  67  82  49  64 ||  31  46  13  28 |  43  10  25  40 |   7  22  37   4 |  19  34   1  16 
    //        u2   95  62  77  92 |  59  74  89  56 |  71  86  53  68 |  83  50  65  80 ||  47  14  29  44 |  11  26  41   8 |  23  38   5  20 |  35   2  17  32 
    
                t0 = _mm256_shuffle_epi8(u0,p0);
                t1 = _mm256_shuffle_epi8(u1,p1);
                t2 = _mm256_shuffle_epi8(u2,p2);
    
              printf("t0  ");print_vec_char(t0);printf("t1  ");print_vec_char(t1);printf("t2  ");print_vec_char(t2);printf("\n");
    //        t0   93  90  87  84 |  81  78  75  72 |  69  66  63  60 |  57  54  51  48 ||  45  42  39  36 |  33  30  27  24 |  21  18  15  12 |   9   6   3   0 
    //        t1   94  91  88  85 |  82  79  76  73 |  70  67  64  61 |  58  55  52  49 ||  46  43  40  37 |  34  31  28  25 |  22  19  16  13 |  10   7   4   1 
    //        t2   95  92  89  86 |  83  80  77  74 |  71  68  65  62 |  59  56  53  50 ||  47  44  41  38 |  35  32  29  26 |  23  20  17  14 |  11   8   5   2 
    
        return 0;
    }
    
    
    
    int __attribute__ ((noinline)) print_vec_char(__m256i x){
        char v[32];
        _mm256_storeu_si256((__m256i *)v,x);
        printf("%3hhi %3hhi %3hhi %3hhi | %3hhi %3hhi %3hhi %3hhi | %3hhi %3hhi %3hhi %3hhi | %3hhi %3hhi %3hhi %3hhi || ",
               v[31],v[30],v[29],v[28],v[27],v[26],v[25],v[24],v[23],v[22],v[21],v[20],v[19],v[18],v[17],v[16]);
        printf("%3hhi %3hhi %3hhi %3hhi | %3hhi %3hhi %3hhi %3hhi | %3hhi %3hhi %3hhi %3hhi | %3hhi %3hhi %3hhi %3hhi \n",
               v[15],v[14],v[13],v[12],v[11],v[10],v[9],v[8],v[7],v[6],v[5],v[4],v[3],v[2],v[1],v[0]);
        return 0;
    }
    

    Instruction summary:

        3 vmovdqu
        3 vinserti128-load
        6 vpblendvb   
        3 vpshufb
    


    Unfortunately, the vpblendvb instruction is often relatively slow: on Intel Skylake vpblendvb has a throughput of one per cycle, and on AMD Ryzen and Intel Haswell the throughput is only one per two cycles. Skylake-X has a fast byte blend, vpblendmb (throughput three per cycle for 256-bit vectors), although on Skylake-X one might be more interested in a solution that works with 512-bit vectors instead of 256-bit ones. A rough sketch of such a masked byte blend follows.
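
    As a sketch (my addition, untested), the three chained vpblendvb pairs above map to masked byte blends on an AVX512VL+BW target as follows. The mask constants are my translation of blnd1/blnd2 (bit n is set where byte n's position within its 128-bit lane has the matching remainder mod 3) and are worth verifying:

    #include <immintrin.h>

    /*  gcc -O3 -Wall -m64 -mavx512vl -mavx512bw ...  */
    static inline void blend_stride3_avx512(__m256i t0, __m256i t1, __m256i t2,
                                            __m256i *u0, __m256i *u1, __m256i *u2){
        const __mmask32 k2 = 0x49244924;   /* lane position % 3 == 2, like blnd2 */
        const __mmask32 k1 = 0x24922492;   /* lane position % 3 == 1, like blnd1 */
        /* _mm256_mask_blend_epi8(k,a,b) takes b where k is set: vpblendmb.      */
        *u0 = _mm256_mask_blend_epi8(k1, _mm256_mask_blend_epi8(k2, t0, t1), t2);
        *u1 = _mm256_mask_blend_epi8(k1, _mm256_mask_blend_epi8(k2, t1, t2), t0);
        *u2 = _mm256_mask_blend_epi8(k1, _mm256_mask_blend_epi8(k2, t2, t0), t1);
    }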


    An alternative is to combine vpshufb with vshufps, as suggested in @Peter Cordes' comments above. In the code below the data is loaded in 12-byte chunks. Altogether, more instructions are needed than in the first solution; nevertheless, this second solution probably performs better than the first, depending on the surrounding code and the microarchitecture.

    #include <stdio.h>
    #include <immintrin.h>
    /*  gcc -O3 -Wall -m64 -march=broadwell stride_3.c   */
    int __attribute__ ((noinline)) print_vec_char(__m256i x);
    
    /* Two-input 32-bit element shuffle: vshufps wrapped for use on integer vectors. */
    inline __m256i _mm256_shufps_epi32(__m256i a, __m256i b, int imm){
        return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), imm));
    }
    
    int main() {
    
        char *m;
        int i;
        __m256i p0    = _mm256_set_epi8(-1,-1,-1,-1, 11,8,5,2, 10,7,4,1, 9,6,3,0,     -1,-1,-1,-1, 11,8,5,2, 10,7,4,1, 9,6,3,0);
        __m256i p1    = _mm256_set_epi8(11,8,5,2, 10,7,4,1, 9,6,3,0, -1,-1,-1,-1,     11,8,5,2, 10,7,4,1, 9,6,3,0, -1,-1,-1,-1);
    __m256i p2    = _mm256_set_epi8(10,7,4,1, 9,6,3,0, -1,-1,-1,-1, 11,8,5,2,     10,7,4,1, 9,6,3,0, -1,-1,-1,-1, 11,8,5,2);
        __m256i p3    = _mm256_set_epi8(9,6,3,0, -1,-1,-1,-1, 11,8,5,2, 10,7,4,1,     9,6,3,0, -1,-1,-1,-1, 11,8,5,2, 10,7,4,1);
    
            m     = _mm_malloc(96+4,32);   /* 4 extra dummy bytes to avoid errors with _mm_loadu_si128((__m128i*)&m[84]). Otherwise use a masked load instead of a standard load. */
        for(i = 0; i < 96; i++) m[i] = i;
    
    //        printf("m_lo  ");print_vec_char(_mm256_load_si256((__m256i*)&m[0]));printf("m_mid ");print_vec_char(_mm256_load_si256((__m256i*)&m[32]));printf("m_hi  ");print_vec_char(_mm256_load_si256((__m256i*)&m[64]));printf("\n");
    //        m_lo   31  30  29  28 |  27  26  25  24 |  23  22  21  20 |  19  18  17  16 ||  15  14  13  12 |  11  10   9   8 |   7   6   5   4 |   3   2   1   0 
    //        m_mid  63  62  61  60 |  59  58  57  56 |  55  54  53  52 |  51  50  49  48 ||  47  46  45  44 |  43  42  41  40 |  39  38  37  36 |  35  34  33  32 
    //        m_hi   95  94  93  92 |  91  90  89  88 |  87  86  85  84 |  83  82  81  80 ||  79  78  77  76 |  75  74  73  72 |  71  70  69  68 |  67  66  65  64 
    
        __m256i t0 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&m[0]));
        __m256i t1 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&m[12]));
        __m256i t2 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&m[24]));
        __m256i t3 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&m[36]));
    
                t0 = _mm256_inserti128_si256(t0,_mm_loadu_si128((__m128i*)&m[48]),1);
                t1 = _mm256_inserti128_si256(t1,_mm_loadu_si128((__m128i*)&m[60]),1);
                t2 = _mm256_inserti128_si256(t2,_mm_loadu_si128((__m128i*)&m[72]),1);
                t3 = _mm256_inserti128_si256(t3,_mm_loadu_si128((__m128i*)&m[84]),1);   /* Use a masked load (_mm_maskload_epi32) here if m[99] is not a valid address; see the sketch below. */
    
    //        printf("t0  ");print_vec_char(t0);printf("t1  ");print_vec_char(t1);printf("t2  ");print_vec_char(t2);printf("t3  ");print_vec_char(t3);printf("\n");
    //        t0   63  62  61  60 |  59  58  57  56 |  55  54  53  52 |  51  50  49  48 ||  15  14  13  12 |  11  10   9   8 |   7   6   5   4 |   3   2   1   0 
    //        t1   75  74  73  72 |  71  70  69  68 |  67  66  65  64 |  63  62  61  60 ||  27  26  25  24 |  23  22  21  20 |  19  18  17  16 |  15  14  13  12 
    //        t2   87  86  85  84 |  83  82  81  80 |  79  78  77  76 |  75  74  73  72 ||  39  38  37  36 |  35  34  33  32 |  31  30  29  28 |  27  26  25  24 
    //        t3    0   0   0   0 |  95  94  93  92 |  91  90  89  88 |  87  86  85  84 ||  51  50  49  48 |  47  46  45  44 |  43  42  41  40 |  39  38  37  36 
    
                t0 = _mm256_shuffle_epi8(t0,p0);
                t1 = _mm256_shuffle_epi8(t1,p1);
                t2 = _mm256_shuffle_epi8(t2,p2);
                t3 = _mm256_shuffle_epi8(t3,p3);
    
    //        printf("t0  ");print_vec_char(t0);printf("t1  ");print_vec_char(t1);printf("t2  ");print_vec_char(t2);printf("t3  ");print_vec_char(t3);printf("\n");
    //        t0    0   0   0   0 |  59  56  53  50 |  58  55  52  49 |  57  54  51  48 ||   0   0   0   0 |  11   8   5   2 |  10   7   4   1 |   9   6   3   0 
    //        t1   71  68  65  62 |  70  67  64  61 |  69  66  63  60 |   0   0   0   0 ||  23  20  17  14 |  22  19  16  13 |  21  18  15  12 |   0   0   0   0 
    //        t2   82  79  76  73 |  81  78  75  72 |   0   0   0   0 |  83  80  77  74 ||  34  31  28  25 |  33  30  27  24 |   0   0   0   0 |  35  32  29  26 
    //        t3   93  90  87  84 |   0   0   0   0 |  95  92  89  86 |  94  91  88  85 ||  45  42  39  36 |   0   0   0   0 |  47  44  41  38 |  46  43  40  37 
    
        __m256i u0 = _mm256_blend_epi32(t0,t1,0b10101010);
        __m256i u1 = _mm256_blend_epi32(t2,t3,0b10101010);
        __m256i u2 = _mm256_blend_epi32(t0,t1,0b01010101);
        __m256i u3 = _mm256_blend_epi32(t2,t3,0b01010101);
    
    //        printf("u0  ");print_vec_char(u0);printf("u1  ");print_vec_char(u1);printf("u2  ");print_vec_char(u2);printf("u3  ");print_vec_char(u3);printf("\n");
    //        u0   71  68  65  62 |  59  56  53  50 |  69  66  63  60 |  57  54  51  48 ||  23  20  17  14 |  11   8   5   2 |  21  18  15  12 |   9   6   3   0 
    //        u1   93  90  87  84 |  81  78  75  72 |  95  92  89  86 |  83  80  77  74 ||  45  42  39  36 |  33  30  27  24 |  47  44  41  38 |  35  32  29  26 
    //        u2    0   0   0   0 |  70  67  64  61 |  58  55  52  49 |   0   0   0   0 ||   0   0   0   0 |  22  19  16  13 |  10   7   4   1 |   0   0   0   0 
    //        u3   82  79  76  73 |   0   0   0   0 |   0   0   0   0 |  94  91  88  85 ||  34  31  28  25 |   0   0   0   0 |   0   0   0   0 |  46  43  40  37 
    
                t0 = _mm256_blend_epi32(u0,u1,0b11001100);
                t1 = _mm256_shufps_epi32(u2,u3,0b00111001);
                t2 = _mm256_shufps_epi32(u0,u1,0b01001110);
    
            printf("t0  ");print_vec_char(t0);printf("t1  ");print_vec_char(t1);printf("t2  ");print_vec_char(t2);printf("\n");
    //        t0   93  90  87  84 |  81  78  75  72 |  69  66  63  60 |  57  54  51  48 ||  45  42  39  36 |  33  30  27  24 |  21  18  15  12 |   9   6   3   0 
    //        t1   94  91  88  85 |  82  79  76  73 |  70  67  64  61 |  58  55  52  49 ||  46  43  40  37 |  34  31  28  25 |  22  19  16  13 |  10   7   4   1 
    //        t2   95  92  89  86 |  83  80  77  74 |  71  68  65  62 |  59  56  53  50 ||  47  44  41  38 |  35  32  29  26 |  23  20  17  14 |  11   8   5   2 
    
        return 0;
    }
    
    
    
    int __attribute__ ((noinline)) print_vec_char(__m256i x){
        char v[32];
        _mm256_storeu_si256((__m256i *)v,x);
        printf("%3hhi %3hhi %3hhi %3hhi | %3hhi %3hhi %3hhi %3hhi | %3hhi %3hhi %3hhi %3hhi | %3hhi %3hhi %3hhi %3hhi || ",
               v[31],v[30],v[29],v[28],v[27],v[26],v[25],v[24],v[23],v[22],v[21],v[20],v[19],v[18],v[17],v[16]);
        printf("%3hhi %3hhi %3hhi %3hhi | %3hhi %3hhi %3hhi %3hhi | %3hhi %3hhi %3hhi %3hhi | %3hhi %3hhi %3hhi %3hhi \n",
               v[15],v[14],v[13],v[12],v[11],v[10],v[9],v[8],v[7],v[6],v[5],v[4],v[3],v[2],v[1],v[0]);
        return 0;
    }
    

    Instruction summary:

        4 vmovdqu
        4 vinserti128-load
        4 vpshufb
        5 vpblendd    (vpblendd is much faster than vpblendvb on most CPU architectures)
        2 vshufps
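
    As noted in the comment on the t3 load above, the four dummy bytes can be avoided with a masked load. A possible sketch (my addition) for the last chunk:

    /* Loads only the first 12 bytes of the chunk at m[84], so nothing past
       m[95] is touched; the upper dword is zeroed, and p3 never selects
       those bytes anyway.                                                  */
    __m128i last = _mm_maskload_epi32((const int*)&m[84],
                                      _mm_set_epi32(0, -1, -1, -1));
            t3   = _mm256_inserti128_si256(t3, last, 1);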
    

    It is easy to adapt the ideas of these methods to the 16-bit case; a sketch of a 16-bit variant of the first method follows.
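
    The sketch below is my adaptation, not from the original answer, so verify it before use. With 8 words per 128-bit lane the blend patterns fit in the immediate of vpblendd-class blends, so the slow vpblendvb is not needed at all: the grouping step uses the fast immediate word blend vpblendw.

    #include <immintrin.h>

    /* Deinterleave 48 16-bit words (stride 3) from m[0..47]:
       *r0 gets words 0,3,6,..., *r1 gets 1,4,7,..., *r2 gets 2,5,8,...     */
    static inline void gather3_epi16(const short *m,
                                     __m256i *r0, __m256i *r1, __m256i *r2){
        /* Same load pattern as above: 16-byte chunk i and chunk i+3.       */
        __m256i t0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)&m[0]));
        __m256i t1 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)&m[8]));
        __m256i t2 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)&m[16]));
                t0 = _mm256_inserti128_si256(t0,_mm_loadu_si128((const __m128i*)&m[24]),1);
                t1 = _mm256_inserti128_si256(t1,_mm_loadu_si128((const __m128i*)&m[32]),1);
                t2 = _mm256_inserti128_si256(t2,_mm_loadu_si128((const __m128i*)&m[40]),1);

        /* Group by residue with immediate word blends (vpblendw):          */
        /* 0x49 = word positions 0,3,6;  0x92 = 1,4,7;  0x24 = 2,5.         */
        __m256i u0 = _mm256_blend_epi16(_mm256_blend_epi16(t0,t1,0x92),t2,0x24);
        __m256i u1 = _mm256_blend_epi16(_mm256_blend_epi16(t0,t1,0x24),t2,0x49);
        __m256i u2 = _mm256_blend_epi16(_mm256_blend_epi16(t0,t1,0x49),t2,0x92);

        /* Sort each register into ascending order with vpshufb; the word   */
        /* permutations [0,3,6,1,4,7,2,5], [1,4,7,2,5,0,3,6] and            */
        /* [2,5,0,3,6,1,4,7] are written out below as byte pairs.           */
        __m256i q0 = _mm256_set_epi8(11,10,5,4,15,14,9,8,3,2,13,12,7,6,1,0,
                                     11,10,5,4,15,14,9,8,3,2,13,12,7,6,1,0);
        __m256i q1 = _mm256_set_epi8(13,12,7,6,1,0,11,10,5,4,15,14,9,8,3,2,
                                     13,12,7,6,1,0,11,10,5,4,15,14,9,8,3,2);
        __m256i q2 = _mm256_set_epi8(15,14,9,8,3,2,13,12,7,6,1,0,11,10,5,4,
                                     15,14,9,8,3,2,13,12,7,6,1,0,11,10,5,4);
        *r0 = _mm256_shuffle_epi8(u0,q0);
        *r1 = _mm256_shuffle_epi8(u1,q1);
        *r2 = _mm256_shuffle_epi8(u2,q2);
    }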
