How to populate a 64 bit register with duplicate byte values

后端 未结 3 867
灰色年华
灰色年华 2020-12-21 00:11

I'm doing some x64 assembly with Visual C++ 2010 and masm ('fast call' calling convention).

So let's say I have a function in C++:

extern "C" v


        
3条回答
  •  悲&欢浪女
    2020-12-21 00:51

    Because you called your procedure 'fillArray', I assumed you would like to fill a whole memory block with a byte value, so I compared several approaches. It is 32-bit MASM code, but the results should be similar in 64-bit mode. Each approach is tested with both aligned and unaligned buffers. Here are the results:

    Simple REP STOSB - aligned....: 192
    Simple REP STOSB - not aligned: 192
    Simple REP STOSD - aligned....: 191
    Simple REP STOSD - not aligned: 222
    Simple while loop - aligned....: 267
    Simple while loop - not aligned: 261
    Simple while loop with different addressing - aligned....: 271
    Simple while loop with different addressing - not aligned: 262
    Loop with 16-byte SSE write - aligned....: 192
    Loop with 16-byte SSE write - not aligned: 205
    Loop with 16-byte SSE write non-temporal hint - aligned....: 126 (EDIT)
    

    The most naive variant using the following code seems to perform best in both scenarios and has the smallest code size as well:

    ; Fill the buffer at lpDst with a single byte value using one REP STOSB.
    cld                     ; clear the direction flag so the string store walks EDI upward
    mov al, 44h   ; byte value
    mov edi, lpDst          ; EDI = destination pointer for STOSB
    mov ecx, 256000*4  ; buf size
    rep stosb               ; store AL at [EDI], ECX times (one byte per iteration)
    

    EDIT: It's not the fastest for aligned data. Added MOVNTDQ version which performs best, see below.

    For the sake of completeness, here are excerpts from the other routines. In each of them the byte value is assumed to have already been expanded into all of EAX:

    Rep Stosd:

    ; Fill the buffer at lpDst one dword at a time with REP STOSD.
    ; EAX is assumed to already hold the expanded fill value.
    ; NOTE(review): this excerpt does not set the direction flag; presumably it
    ; is cleared earlier in the full routine -- confirm.
    mov edi, lpDst          ; EDI = destination pointer for STOSD
    mov ecx, 256000         ; ECX = number of dwords to store
    rep stosd               ; store EAX at [EDI], ECX times (four bytes per iteration)
    

    Simple While:

    ; Fill the buffer at lpDst with an explicit loop: one dword store per
    ; iteration, advancing the pointer and counting ECX down to zero.
    ; EAX is assumed to already hold the expanded fill value.
    mov edi, lpDst          ; EDI = current write position
    mov ecx, 256000         ; ECX = dwords remaining
    .while ecx>0
        mov [edi],eax       ; write four bytes of fill value
        add edi,4           ; step to the next dword
        dec ecx             ; one fewer dword remaining
    .endw
    

    Different simple while:

    ; Same dword fill as the simple loop, but EDI stays fixed at the buffer
    ; start and ECX counts upward as a scaled index.
    ; EAX is assumed to already hold the expanded fill value.
    mov edi, lpDst          ; EDI = buffer base (never modified in the loop)
    xor ecx, ecx            ; ECX = dword index, starting at 0
    .while ecx<256000 ; loop until every dword index has been written
        mov [edi+ecx*4],eax ; write dword number ECX relative to the base
        inc ecx             ; next dword index
    .endw
    

    SSE(both):

    ; Fill the buffer at lpDst with 16-byte SSE stores.
    ; EAX is assumed to already hold the expanded fill value; it is replicated
    ; across all four dwords of XMM0 before the loop.
    movd xmm0,eax          ; low dword of XMM0 = EAX
    punpckldq xmm0,xmm0    ; xxxxxxxxGGGGHHHH -> xxxxxxxxHHHHHHHH
    punpcklqdq xmm0,xmm0   ; xxxxxxxxHHHHHHHH -> HHHHHHHHHHHHHHHH
    mov ecx, 256000/4   ; 16 byte
    mov edi, lpDst         ; EDI = current write position
    .while ecx>0 ; ECX = 16-byte stores remaining
        movdqa xmmword ptr [edi],xmm0    ; movdqu for unaligned
        add edi,16         ; step to the next 16-byte chunk
        dec ecx            ; one fewer store remaining
    .endw
    

    SSE(NT,aligned,EDIT):

    ; Fill the buffer at lpDst with 16-byte non-temporal SSE stores (MOVNTDQ).
    ; This is the aligned-only variant. EAX is assumed to already hold the
    ; expanded fill value; it is replicated across all four dwords of XMM0.
    ; NOTE(review): there is no SFENCE after the loop in this excerpt; if other
    ; threads read the buffer afterwards one is presumably needed -- confirm
    ; against the full source.
    movd xmm0,eax          ; low dword of XMM0 = EAX
    punpckldq xmm0,xmm0    ; xxxxxxxxGGGGHHHH -> xxxxxxxxHHHHHHHH
    punpcklqdq xmm0,xmm0   ; xxxxxxxxHHHHHHHH -> HHHHHHHHHHHHHHHH
    mov ecx, 256000/4   ; 16 byte
    mov edi, lpDst         ; EDI = current write position
    .while ecx>0 ; ECX = 16-byte stores remaining
        movntdq xmmword ptr [edi],xmm0   ; non-temporal 16-byte store
        add edi,16         ; step to the next 16-byte chunk
        dec ecx            ; one fewer store remaining
    .endw
    

    I uploaded the complete code to http://pastie.org/9831404; the MASM package from hutch is required to assemble it.


    If SSSE3 is available, you can use pshufb to broadcast a byte to all positions of a register instead of a chain of punpck instructions.

    ; Broadcast the low byte of EDX to all 16 bytes of XMM0 (requires SSSE3).
    ; An all-zero shuffle mask makes PSHUFB copy source byte 0 to every position.
    movd    xmm0, edx      ; low dword of XMM0 = EDX (byte 0 = DL)
    xorps   xmm1,xmm1      ; xmm1 = 0
    pshufb  xmm0, xmm1     ; xmm0 = _mm_set1_epi8(dl)
    

提交回复
热议问题