Fast vectorized conversion from RGB to BGRA

后端 未结 4 708
夕颜
夕颜 2020-12-10 07:32

In a follow-up to some previous questions on converting RGB to RGBA, and ARGB to BGR, I would like to speed up an RGB to BGRA conversion with SSE.

4条回答
  •  無奈伤痛
    2020-12-10 07:49

    This is an example of using SSSE3 intrinsics to perform the requested operation. The input and output pointers must be 16-byte aligned, and it operates on a block of 16 pixels at a time.

    #include <tmmintrin.h>
    
    /*
     * Convert `w` packed 3-byte R,G,B pixels at `in` into 4-byte B,G,R,0xff
     * pixels at `out`.
     *
     *   w    number of pixels to convert
     *   in   source buffer, 3 * w bytes, must be 16-byte aligned
     *   out  destination buffer, 4 * w bytes, must be 16-byte aligned
     *
     * Full groups of 16 pixels (48 bytes in, 64 bytes out) are converted with
     * SSSE3 byte shuffles; the remaining w % 16 pixels are converted one at a
     * time so that no trailing pixel is left unwritten.
     *
     * On GCC/Clang the function carries a per-function SSSE3 target attribute
     * so it builds even when the translation unit is not compiled with
     * -mssse3.  The caller is responsible for only calling it on a CPU that
     * supports SSSE3.
     */
    #if defined(__GNUC__) || defined(__clang__)
    __attribute__((target("ssse3")))
    #endif
    void rgb_to_bgrx_sse(unsigned w, const void *in, void *out)
    {
        const __m128i *in_vec = in;
        __m128i *out_vec = out;
        const unsigned blocks = w / 16;
        unsigned tail = w % 16;

        /*
         * _mm_set_epi8 lists bytes from lane 15 down to lane 0.  A shuffle
         * index with its top bit set (-1 here) makes _mm_shuffle_epi8 write a
         * zero to that lane; the alpha byte is then OR-ed in on top.
         */
        const __m128i alpha = _mm_set_epi8(
            -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0);
        const __m128i hi_half = _mm_set_epi8(
            -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0);
        const __m128i lo_half = _mm_set_epi8(
            0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1);

        /* One shuffle per output vector (4 pixels each). */
        const __m128i shuf_ad = _mm_set_epi8(
            -1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2);
        const __m128i shuf_eh = _mm_set_epi8(
            -1, 5, 6, 7, -1, 2, 3, 4, -1, 15, 0, 1, -1, 12, 13, 14);
        const __m128i shuf_il = _mm_set_epi8(
            -1, 1, 2, 3, -1, 14, 15, 0, -1, 11, 12, 13, -1, 8, 9, 10);
        const __m128i shuf_mp = _mm_set_epi8(
            -1, 13, 14, 15, -1, 10, 11, 12, -1, 7, 8, 9, -1, 4, 5, 6);

        for (unsigned b = 0; b < blocks; b++) {
            /*             0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15
             * in_vec[0]   Ra Ga Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf
             * in_vec[1]   Gf Bf Rg Gg Bg Rh Gh Bh Ri Gi Bi Rj Gj Bj Rk Gk
             * in_vec[2]   Bk Rl Gl Bl Rm Gm Bm Rn Gn Bn Ro Go Bo Rp Gp Bp
             */
            const __m128i v0 = in_vec[0];
            const __m128i v1 = in_vec[1];
            const __m128i v2 = in_vec[2];
            __m128i mix;

            /* Pixels a..d live entirely in v0. */
            out_vec[0] = _mm_or_si128(_mm_shuffle_epi8(v0, shuf_ad), alpha);

            /* Pixels e..h: upper half of v0 merged with lower half of v1. */
            mix = _mm_or_si128(_mm_and_si128(v0, hi_half),
                               _mm_and_si128(v1, lo_half));
            out_vec[1] = _mm_or_si128(_mm_shuffle_epi8(mix, shuf_eh), alpha);

            /* Pixels i..l: upper half of v1 merged with lower half of v2. */
            mix = _mm_or_si128(_mm_and_si128(v1, hi_half),
                               _mm_and_si128(v2, lo_half));
            out_vec[2] = _mm_or_si128(_mm_shuffle_epi8(mix, shuf_il), alpha);

            /* Pixels m..p live entirely in v2. */
            out_vec[3] = _mm_or_si128(_mm_shuffle_epi8(v2, shuf_mp), alpha);

            in_vec += 3;
            out_vec += 4;
        }

        /* Scalar tail: the w % 16 pixels that do not fill a whole block. */
        {
            const unsigned char *src = (const unsigned char *)in_vec;
            unsigned char *dst = (unsigned char *)out_vec;

            while (tail-- > 0) {
                dst[0] = src[2];
                dst[1] = src[1];
                dst[2] = src[0];
                dst[3] = 0xff;
                src += 3;
                dst += 4;
            }
        }
    }
    

提交回复
热议问题