Fast 24-bit array -> 32-bit array conversion?

终归单人心 2020-12-08 22:47

Quick Summary:

I have an array of 24-bit values. Any suggestions on how to quickly expand the individual 24-bit array elements into 32-bit elements?

4 Answers
  • 2020-12-08 23:24

    SSE 4.1 ASM:

    PINSRD  XMM0,  DWORD PTR[ESI],   0   ; gather four packed 24-bit pixels
    PINSRD  XMM0,  DWORD PTR[ESI+3], 1   ; into the four dword lanes
    PINSRD  XMM0,  DWORD PTR[ESI+6], 2
    PINSRD  XMM0,  DWORD PTR[ESI+9], 3   ; note: reads 1 byte past the 12-byte group
    PSLLD   XMM0,  8                     ; shift the stray high byte out...
    PSRLD   XMM0,  8                     ; ...and back in as zero
    MOVNTDQ [EDI], XMM0                  ; non-temporal store of four 32-bit pixels
    add     ESI,   12                    ; source advances 4 * 3 bytes
    add     EDI,   16                    ; destination advances 4 * 4 bytes
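
    For reference, a rough C-intrinsics rendering of the same idea (my sketch, not part of the original answer; the helper name and the memcpy-based unaligned loads are my own choices):

    #include <smmintrin.h>   /* SSE4.1: _mm_insert_epi32 */
    #include <string.h>
    #include <stdint.h>
    
    /* Expand one group of four packed 24-bit pixels (12 bytes) into four
       32-bit pixels. Like the PINSRD code, the last load reads one byte
       past the 12-byte group. */
    static void Expand4(const uint8_t *src, uint8_t *dst)
    {
        uint32_t a, b, c, d;
        memcpy(&a, src + 0, 4);                  /* unaligned dword loads */
        memcpy(&b, src + 3, 4);
        memcpy(&c, src + 6, 4);
        memcpy(&d, src + 9, 4);
    
        __m128i v = _mm_cvtsi32_si128((int)a);   /* lane 0 */
        v = _mm_insert_epi32(v, (int)b, 1);      /* PINSRD */
        v = _mm_insert_epi32(v, (int)c, 2);
        v = _mm_insert_epi32(v, (int)d, 3);
        v = _mm_slli_epi32(v, 8);                /* shift the stray high byte out... */
        v = _mm_srli_epi32(v, 8);                /* ...and back in as zero */
        _mm_storeu_si128((__m128i *)dst, v);     /* or _mm_stream_si128 if dst is 16-byte aligned */
    }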
    
  • 2020-12-08 23:27

    The different input/output sizes are not a barrier to using SIMD, just a speed bump. You would need to chunk the data so that you read and write in full SIMD words (16 bytes).

    In this case, you would read 3 SIMD words (48 bytes == 16 RGB pixels), do the expansion, then write 4 SIMD words.

    I'm just saying you can use SIMD, I'm not saying you should. The middle bit, the expansion, is still tricky since you have non-uniform shift sizes in different parts of the word.
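
    For illustration, here is a plain-C sketch of that chunking (mine, not the answerer's code): each iteration consumes 48 source bytes (16 packed 24-bit pixels) and produces 64 destination bytes. A real SIMD version would replace the inner loop with shuffles, as the next answers show.

    #include <stdint.h>
    #include <stddef.h>
    
    /* Assumes num_pixels is a multiple of 16 (one full chunk per iteration). */
    void expand_24_to_32(const uint8_t *src, uint8_t *dst, size_t num_pixels)
    {
        for (size_t i = 0; i < num_pixels; i += 16) {   /* 48 bytes in, 64 bytes out */
            for (int p = 0; p < 16; p++) {
                dst[4*p + 0] = src[3*p + 0];
                dst[4*p + 1] = src[3*p + 1];
                dst[4*p + 2] = src[3*p + 2];
                dst[4*p + 3] = 0;                       /* zero the high (alpha) byte */
            }
            src += 48;
            dst += 64;
        }
    }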

  • 2020-12-08 23:31

    The code below should be pretty fast. It copies 4 pixels in each iteration, using only 32-bit read/write instructions. The source and destination pointers should be aligned to 32 bits.

    uint32_t *src = ...;
    uint32_t *dst = ...;
    
    for (int i=0; i<num_pixels; i+=4) {
        // 3 source words = 12 bytes = 4 packed 24-bit pixels
        uint32_t sa = src[0];
        uint32_t sb = src[1];
        uint32_t sc = src[2];
    
        // unpack into 4 destination words; the high byte of the first three
        // outputs still holds the first byte of the following pixel
        dst[i+0] = sa;
        dst[i+1] = (sa>>24) | (sb<<8);
        dst[i+2] = (sb>>16) | (sc<<16);
        dst[i+3] = sc>>8;
    
        src += 3;
    }
    

    Edit:

    Here is a way to do this using the SSSE3 instructions PSHUFB and PALIGNR. The code is written using compiler intrinsics, but it shouldn't be hard to translate to assembly if needed. It copies 16 pixels in each iteration. The source and destination pointers must be aligned to 16 bytes, or it will fault. If they aren't aligned, you can make it work by replacing _mm_load_si128 with _mm_loadu_si128 and _mm_store_si128 with _mm_storeu_si128, but this will be slower.

    #include <emmintrin.h>
    #include <tmmintrin.h>
    
    __m128i *src = ...;
    __m128i *dst = ...;
    // The -1 entries make PSHUFB write zero, so each expanded pixel gets a
    // zeroed high (alpha) byte.
    __m128i mask = _mm_setr_epi8(0,1,2,-1, 3,4,5,-1, 6,7,8,-1, 9,10,11,-1);
    
    for (int i=0; i<num_pixels; i+=16) {
        // read 48 bytes = 16 packed 24-bit pixels
        __m128i sa = _mm_load_si128(src);
        __m128i sb = _mm_load_si128(src+1);
        __m128i sc = _mm_load_si128(src+2);
    
        // write 64 bytes = 16 expanded pixels; PALIGNR slides the next group
        // of 12 source bytes down so the same shuffle mask can be reused
        __m128i val = _mm_shuffle_epi8(sa, mask);
        _mm_store_si128(dst, val);
        val = _mm_shuffle_epi8(_mm_alignr_epi8(sb, sa, 12), mask);
        _mm_store_si128(dst+1, val);
        val = _mm_shuffle_epi8(_mm_alignr_epi8(sc, sb, 8), mask);
        _mm_store_si128(dst+2, val);
        val = _mm_shuffle_epi8(_mm_alignr_epi8(sc, sc, 4), mask);
        _mm_store_si128(dst+3, val);
    
        src += 3;
        dst += 4;
    }
    

    SSSE3 (not to be confused with SSE3) will require a relatively new processor: Core 2 or newer, and I believe AMD doesn't support it yet. Performing this with SSE2 instructions only will take a lot more operations, and may not be worth it.
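
    If it helps, here is a quick run-time check for SSSE3 before taking the fast path (my sketch; __builtin_cpu_supports is a GCC/Clang extension, so MSVC users would inspect the __cpuid results instead):

    #include <stdio.h>
    
    int main(void)
    {
        if (__builtin_cpu_supports("ssse3"))
            printf("SSSE3 available: use the PSHUFB/PALIGNR path\n");
        else
            printf("No SSSE3: fall back to the scalar 32-bit loop\n");
        return 0;
    }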

  • 2020-12-08 23:37

    SSSE3 is awesome, but for those who can't use it for whatever reason, here's the conversion in x86 assembler, hand-optimized by yours truly. For completeness, I give the conversion in both directions: RGB32->RGB24 and RGB24->RGB32.

    Note that interjay's C code leaves trash in the MSB (the alpha channel) of the destination pixels. This might not matter in some applications, but it matters in mine, hence my RGB24->RGB32 code forces the MSB to zero. Similarly, my RGB32->RGB24 code ignores the MSB; this avoids garbage output if the source data has a non-zero alpha channel. These features cost almost nothing in terms of performance, as verified by benchmarks.

    For RGB32->RGB24 I was able to beat the VC++ optimizer by about 20%. For RGB24->RGB32 the gain was insignificant. Benchmarking was done on an i5 2500K. I omit the benchmarking code here, but if anyone wants it I'll provide it. The most important optimization was bumping the source pointer as soon as possible (see the ASAP comment). My best guess is that this increases parallelism by allowing the instruction pipeline to prefetch sooner. Other than that I just reordered some instructions to reduce dependencies and overlap memory accesses with bit-bashing.
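
    For illustration, a minimal timing harness of the kind used for such measurements (a generic sketch, not the benchmark code mentioned above); it drives any of the conversion routines below through a function pointer:

    #include <windows.h>
    #include <stdio.h>
    
    /* Returns seconds spent running Fn(Src, Dst, Pixels) Reps times. */
    double TimeIt(void (*Fn)(const UINT *, UINT *, UINT),
                  const UINT *Src, UINT *Dst, UINT Pixels, int Reps)
    {
        LARGE_INTEGER freq, t0, t1;
        QueryPerformanceFrequency(&freq);
        QueryPerformanceCounter(&t0);
        for (int i = 0; i < Reps; i++)
            Fn(Src, Dst, Pixels);
        QueryPerformanceCounter(&t1);
        return (double)(t1.QuadPart - t0.QuadPart) / (double)freq.QuadPart;
    }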

    void ConvRGB32ToRGB24(const UINT *Src, UINT *Dst, UINT Pixels)
    {
    #if !USE_ASM
        for (UINT i = 0; i < Pixels; i += 4) {
            UINT    sa = Src[i + 0] & 0xffffff;
            UINT    sb = Src[i + 1] & 0xffffff;
            UINT    sc = Src[i + 2] & 0xffffff;
            UINT    sd = Src[i + 3];
            Dst[0] = sa | (sb << 24);
            Dst[1] = (sb >> 8) | (sc << 16);
            Dst[2] = (sc >> 16) | (sd << 8);
            Dst += 3;
        }
    #else
        __asm {
            mov     ecx, Pixels
            shr     ecx, 2              // 4 pixels at once
            jz      ConvRGB32ToRGB24_$2
            mov     esi, Src
            mov     edi, Dst
    ConvRGB32ToRGB24_$1:
            mov     ebx, [esi + 4]      // sb
            and     ebx, 0ffffffh       // sb & 0xffffff
            mov     eax, [esi + 0]      // sa
            and     eax, 0ffffffh       // sa & 0xffffff
            mov     edx, ebx            // copy sb
            shl     ebx, 24             // sb << 24
            or      eax, ebx            // sa | (sb << 24)
            mov     [edi + 0], eax      // Dst[0]
            shr     edx, 8              // sb >> 8
            mov     eax, [esi + 8]      // sc
            and     eax, 0ffffffh       // sc & 0xffffff
            mov     ebx, eax            // copy sc
            shl     eax, 16             // sc << 16
            or      eax, edx            // (sb >> 8) | (sc << 16)
            mov     [edi + 4], eax      // Dst[1]
            shr     ebx, 16             // sc >> 16
            mov     eax, [esi + 12]     // sd
            add     esi, 16             // Src += 4 (ASAP)
            shl     eax, 8              // sd << 8
            or      eax, ebx            // (sc >> 16) | (sd << 8)
            mov     [edi + 8], eax      // Dst[2]
            add     edi, 12             // Dst += 3
            dec     ecx
            jnz     SHORT ConvRGB32ToRGB24_$1
    ConvRGB32ToRGB24_$2:
        }
    #endif
    }
    
    void ConvRGB24ToRGB32(const UINT *Src, UINT *Dst, UINT Pixels)
    {
    #if !USE_ASM
        for (UINT i = 0; i < Pixels; i += 4) {
            UINT    sa = Src[0];
            UINT    sb = Src[1];
            UINT    sc = Src[2];
            Dst[i + 0] = sa & 0xffffff;
            Dst[i + 1] = ((sa >> 24) | (sb << 8)) & 0xffffff;
            Dst[i + 2] = ((sb >> 16) | (sc << 16)) & 0xffffff;
            Dst[i + 3] = sc >> 8;
            Src += 3;
        }
    #else
        __asm {
            mov     ecx, Pixels
            shr     ecx, 2              // 4 pixels at once
            jz      SHORT ConvRGB24ToRGB32_$2
            mov     esi, Src
            mov     edi, Dst
            push    ebp
    ConvRGB24ToRGB32_$1:
            mov     ebx, [esi + 4]      // sb
            mov     edx, ebx            // copy sb
            mov     eax, [esi + 0]      // sa
            mov     ebp, eax            // copy sa
            and     ebx, 0ffffh         // sb & 0xffff
            shl     ebx, 8              // (sb & 0xffff) << 8
            and     eax, 0ffffffh       // sa & 0xffffff
            mov     [edi + 0], eax      // Dst[0]
            shr     ebp, 24             // sa >> 24
            or      ebx, ebp            // (sa >> 24) | ((sb & 0xffff) << 8)
            mov     [edi + 4], ebx      // Dst[1]
            shr     edx, 16             // sb >> 16
            mov     eax, [esi + 8]      // sc
            add     esi, 12             // Src += 3 (ASAP)
            mov     ebx, eax            // copy sc
            and     eax, 0ffh           // sc & 0xff
            shl     eax, 16             // (sc & 0xff) << 16
            or      eax, edx            // (sb >> 16) | ((sc & 0xff) << 16)
            mov     [edi + 8], eax      // Dst[2]
            shr     ebx, 8              // sc >> 8
            mov     [edi + 12], ebx     // Dst[3]
            add     edi, 16             // Dst += 4
            dec     ecx
            jnz     SHORT ConvRGB24ToRGB32_$1
            pop     ebp
    ConvRGB24ToRGB32_$2:
        }
    #endif
    }
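
    Example call (my sketch, not from the answer): both routines handle four pixels per iteration, so Pixels is assumed to be a multiple of 4, and the buffer names and sizes here are only illustrative.

    #include <windows.h>   /* UINT */
    #include <stdlib.h>
    
    int main(void)
    {
        const UINT Pixels = 1024;                      /* multiple of 4 */
        UINT *rgb32 = (UINT *)calloc(Pixels, 4);       /* 4 bytes per pixel */
        UINT *rgb24 = (UINT *)calloc(Pixels, 3);       /* 3 bytes per pixel */
    
        ConvRGB32ToRGB24(rgb32, rgb24, Pixels);        /* pack   32 -> 24 */
        ConvRGB24ToRGB32(rgb24, rgb32, Pixels);        /* expand 24 -> 32 */
    
        free(rgb24);
        free(rgb32);
        return 0;
    }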
    

    And while we're at it, here are the same conversions in actual SSSE3 assembly. This only works if you have an assembler (FASM is free) and a CPU that supports SSSE3 (likely, but it's better to check). Note that the intrinsics don't necessarily compile to something this efficient; it depends entirely on the tools you use and what platform you're compiling for. Here, it's straightforward: what you see is what you get. This code generates the same output as the x86 code above, and it's about 1.5x faster (on an i5 2500K).

    format MS COFF
    
    section '.text' code readable executable
    
    public _ConvRGB32ToRGB24SSE3
    
    ;   ebp + 8     Src (*RGB32, 16-byte aligned)
    ;   ebp + 12    Dst (*RGB24, 16-byte aligned)
    ;   ebp + 16    Pixels
    
    _ConvRGB32ToRGB24SSE3:
        push    ebp
        mov     ebp, esp
        mov     eax, [ebp + 8]
        mov     edx, [ebp + 12]
        mov     ecx, [ebp + 16]
        shr     ecx, 4
        jz      done1
        movupd  xmm7, [mask1]
    
    top1:
        movupd  xmm0, [eax + 0]     ; sa = Src[0]
        pshufb  xmm0, xmm7          ; sa = _mm_shuffle_epi8(sa, mask)
        movupd  xmm1, [eax + 16]    ; sb = Src[1]
        pshufb  xmm1, xmm7          ; sb = _mm_shuffle_epi8(sb, mask)
        movupd  xmm2, xmm1          ; sb1 = sb
        pslldq  xmm1, 12            ; sb = _mm_slli_si128(sb, 12)
        por     xmm0, xmm1          ; sa = _mm_or_si128(sa, sb)
        movupd  [edx + 0], xmm0     ; Dst[0] = sa
        psrldq  xmm2, 4             ; sb1 = _mm_srli_si128(sb1, 4)
        movupd  xmm0, [eax + 32]    ; sc = Src[2]
        pshufb  xmm0, xmm7          ; sc = _mm_shuffle_epi8(sc, mask)
        movupd  xmm1, xmm0          ; sc1 = sc
        pslldq  xmm0, 8             ; sc = _mm_slli_si128(sc, 8)
        por     xmm0, xmm2          ; sc = _mm_or_si128(sb1, sc)
        movupd  [edx + 16], xmm0    ; Dst[1] = sc
        psrldq  xmm1, 8             ; sc1 = _mm_srli_si128(sc1, 8)
        movupd  xmm0, [eax + 48]    ; sd = Src[3]
        pshufb  xmm0, xmm7          ; sd = _mm_shuffle_epi8(sd, mask)
        pslldq  xmm0, 4             ; sd = _mm_slli_si128(sd, 4)
        por     xmm0, xmm1          ; sd = _mm_or_si128(sc1, sd)
        movupd  [edx + 32], xmm0    ; Dst[2] = sd
        add     eax, 64
        add     edx, 48
        dec     ecx
        jnz     top1
    
    done1:
        pop     ebp
        ret
    
    public _ConvRGB24ToRGB32SSE3
    
    ;   ebp + 8     Src (*RGB24, 16-byte aligned)
    ;   ebp + 12    Dst (*RGB32, 16-byte aligned)
    ;   ebp + 16    Pixels
    
    _ConvRGB24ToRGB32SSE3:
        push    ebp
        mov     ebp, esp
        mov     eax, [ebp + 8]
        mov     edx, [ebp + 12]
        mov     ecx, [ebp + 16]
        shr     ecx, 4
        jz      done2
        movupd  xmm7, [mask2]
    
    top2:
        movupd  xmm0, [eax + 0]     ; sa = Src[0]
        movupd  xmm1, [eax + 16]    ; sb = Src[1]
        movupd  xmm2, [eax + 32]    ; sc = Src[2]
        movupd  xmm3, xmm0          ; sa1 = sa
        pshufb  xmm0, xmm7          ; sa = _mm_shuffle_epi8(sa, mask)
        movupd  [edx], xmm0         ; Dst[0] = sa
        movupd  xmm4, xmm1          ; sb1 = sb
        palignr xmm1, xmm3, 12      ; sb = _mm_alignr_epi8(sb, sa1, 12)
        pshufb  xmm1, xmm7          ; sb = _mm_shuffle_epi8(sb, mask);
        movupd  [edx + 16], xmm1    ; Dst[1] = sb
        movupd  xmm3, xmm2          ; sc1 = sc
        palignr xmm2, xmm4, 8       ; sc = _mm_alignr_epi8(sc, sb1, 8)
        pshufb  xmm2, xmm7          ; sc = _mm_shuffle_epi8(sc, mask)
        movupd  [edx + 32], xmm2    ; Dst[2] = sc
        palignr xmm3, xmm3, 4       ; sc1 = _mm_alignr_epi8(sc1, sc1, 4)
        pshufb  xmm3, xmm7          ; sc1 = _mm_shuffle_epi8(sc1, mask)
        movupd  [edx + 48], xmm3    ; Dst[3] = sc1
        add     eax, 48
        add     edx, 64
        dec     ecx
        jnz     top2
    
    done2:
        pop     ebp
        ret
    
    section '.data' data readable writeable align 16
    
    label mask1 dqword 
        db  0,1,2,4, 5,6,8,9, 10,12,13,14, -1,-1,-1,-1
    label mask2 dqword 
        db  0,1,2,-1, 3,4,5,-1, 6,7,8,-1, 9,10,11,-1
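
    To call this from C/C++, the matching declarations would look roughly like the following (my sketch; the file names are made up). The leading underscores in the assembly are the usual 32-bit cdecl name decoration, and since each loop handles 16 pixels per iteration, Pixels is assumed to be a multiple of 16. Assemble with FASM (e.g. "fasm rgbconv.asm rgbconv.obj") and link the resulting object into the project.

    #ifdef __cplusplus
    extern "C" {
    #endif
    void ConvRGB32ToRGB24SSE3(const void *Src, void *Dst, unsigned int Pixels);
    void ConvRGB24ToRGB32SSE3(const void *Src, void *Dst, unsigned int Pixels);
    #ifdef __cplusplus
    }
    #endif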
    