How to populate a 64 bit register with duplicate byte values

后端 未结 3 865
灰色年华
灰色年华 2020-12-21 00:11

I'm doing some x64 assembly with Visual C++ 2010 and masm ('fast call' calling convention).

So let\'s say I have a function in C++:

extern "C" v         


        
相关标签:
3条回答
  • 2020-12-21 00:32

    Naïve way

    xor ebx, ebx
    mov bl, dl   ; input in dl
    mov bh, dl
    mov eax, ebx
    shl ebx, 16
    or  ebx, eax
    mov eax, ebx
    shl rax, 32
    or  rax, rbx ; output in rax
    

    So it might be slower than harold's solution

    You can also look at the compiler's assembly output for the following code

    uint64_t s;
    s = (s << 8)  | s;
    s = (s << 16) | s;
    s = (s << 32) | s;
    

    gcc 8.2 generates the following output with the result in rax

    movzx   edi, dil      # s, c
    mov     rax, rdi  # _1, s
    sal     rax, 8    # _1,
    or      rdi, rax    # s, _1
    mov     rax, rdi  # _2, s
    sal     rax, 16   # _2,
    or      rax, rdi    # s, s
    mov     rdi, rax  # _3, s
    sal     rdi, 32   # _3,
    or      rax, rdi    # s, _3
    ret 
    

    As you can see, it's not very efficient since the compiler always uses 64-bit registers even when not necessary. So to make it easier for compilers to optimize just modify it like this

    uint32_t s = (c << 8) | c;
    s = (s << 16) | s;
    return ((uint64_t)s << 32) | s;
    

    Note that this is only for filling a single register like RAX with values using DL as you mentioned. For a big array it's better to use SIMD or other specialized instructions such as rep stos, like how std::fill or memset is implemented. Both gcc and Clang are also smart enough to recognize memset(&int64, bytevalue, sizeof int64) and convert it to a multiplication by 0x0101010101010101 like above

    0 讨论(0)
  • 2020-12-21 00:51

    Because you called your procedure 'fillArray', I assumed you like to fill a whole memory block with a byte value. So I did a comparison of different approaches. It is 32 bit masm code, but the results should be similar in 64 bit mode. Each approach is tested with both aligned and unaligned buffers. Here are the results:

    Simple REP STOSB - aligned....: 192
    Simple REP STOSB - not aligned: 192
    Simple REP STOSD - aligned....: 191
    Simple REP STOSD - not aligned: 222
    Simple while loop - aligned....: 267
    Simple while loop - not aligned: 261
    Simple while loop with different addressing - aligned....: 271
    Simple while loop with different addressing - not aligned: 262
    Loop with 16-byte SSE write - aligned....: 192
    Loop with 16-byte SSE write - not aligned: 205
    Loop with 16-byte SSE write non-temporal hint - aligned....: 126 (EDIT)
    

    The most naive variant using the following code seems to perform best in both scenarios and has the smallest code size as well:

    cld
    mov al, 44h   ; byte value
    mov edi, lpDst
    mov ecx, 256000*4  ; buf size
    rep stosb
    

    EDIT: It's not the fastest for aligned data. Added MOVNTDQ version which performs best, see below.

    For the sake of completeness, here are excerpts from the other routines - the value is assumed to be expanded into EAX before:

    Rep Stosd:

    mov edi, lpDst
    mov ecx, 256000
    rep stosd
    

    Simple While:

    mov edi, lpDst
    mov ecx, 256000
    .while ecx>0
        mov [edi],eax
        add edi,4
        dec ecx
    .endw
    

    Different simple while:

    mov edi, lpDst
    xor ecx, ecx
    .while ecx<256000 
        mov [edi+ecx*4],eax
        inc ecx
    .endw
    

    SSE(both):

    movd xmm0,eax
    punpckldq xmm0,xmm0    ; xxxxxxxxGGGGHHHH -> xxxxxxxxHHHHHHHH
    punpcklqdq xmm0,xmm0   ; xxxxxxxxHHHHHHHH -> HHHHHHHHHHHHHHHH
    mov ecx, 256000/4   ; 16 byte
    mov edi, lpDst
    .while ecx>0 
        movdqa xmmword ptr [edi],xmm0    ; movdqu for unaligned
        add edi,16
        dec ecx
    .endw
    

    SSE(NT,aligned,EDIT):

    movd xmm0,eax
    punpckldq xmm0,xmm0    ; xxxxxxxxGGGGHHHH -> xxxxxxxxHHHHHHHH
    punpcklqdq xmm0,xmm0   ; xxxxxxxxHHHHHHHH -> HHHHHHHHHHHHHHHH
    mov ecx, 256000/4   ; 16 byte
    mov edi, lpDst
    .while ecx>0 
        movntdq xmmword ptr [edi],xmm0
        add edi,16
        dec ecx
    .endw
    

    I uploaded the whole code here http://pastie.org/9831404 --- the MASM package from hutch is required for assembling.


    If SSSE3 is available, you can use pshufb to broadcast a byte to all positions of a register instead of a chain of punpck instructions.

    movd    xmm0, edx
    xorps   xmm1,xmm1      ; xmm1 = 0
    pshufb  xmm0, xmm1     ; xmm0 = _mm_set1_epi8(dl)
    
    0 讨论(0)
  • 2020-12-21 00:52

    You can multiply by 0x0101010101010101 to copy the lowest byte into all other bytes (assuming the rest were all zero to begin with); it's slightly annoying because there is no imul r64, r64, imm64, but you could do this:

    mov  rax, 0x0101010101010101 
    imul rax, rdx                 ; at least as fast as  mul rdx  on all CPUs
    

    If rdx is not of the required form (in other words, if it has some extra bits set), just add a
    movzx eax, dl in front, and move the constant into RDX or another register. (movzx edx,dl can't benefit from mov-elimination on Intel CPUs.)

    If you don't like the code size (mov r64, imm64 is already 10 bytes by itself), just stick that constant in your data segment.

    0 讨论(0)
提交回复
热议问题