Embedded broadcasts with intrinsics and assembly

谁说胖子不能爱 提交于 2019-11-29 14:40:14

As Peter Cordes notes GCC doesn't let you specify a different template for different constraint alternatives. So instead my solution has the assembler choose the correct instruction according to the operands chosen.

I don't have a version of GCC that supports the ZMM registers, so this following example uses XMM registers and a couple of nonexistent instructions to demonstrate how you can achieve what you're looking for.

typedef __attribute__((vector_size(16))) float v4sf;

v4sf
foo(v4sf a, float b) {
    v4sf ret;
    asm(".ifndef isxmm\n\t"
        ".altmacro\n\t"
        ".macro ifxmm operand, rnum\n\t"
        ".ifc \"\\operand\",\"%%xmm\\rnum\"\n\t"
        ".set isxmm, 1\n\t"
        ".endif\n\t"
        ".endm\n\t"
        ".endif\n\t"
        ".set isxmm, 0\n\t"
        ".set regnum, 0\n\t"
        ".rept 8\n\t"
        "ifxmm <%2>, %%regnum\n\t"
        ".set regnum, regnum + 1\n\t"
        ".endr\n\t"
        ".if isxmm\n\t"
        "alt-1 %1, %2, %0\n\t"
        ".else\n\t"
        "alt-2 %1, %2, %0\n\t"
        ".endif\n\t"
        : "=x,x" (ret)
        : "x,x" (a), "x,m" (b));
    return ret;
}


v4sf
bar(v4sf a, v4sf b) {
    return foo(a, b[0]);
}

This example should be compiled with gcc -m32 -msse -O3 and should generate two assembler error messages similar to the following:

t103.c: Assembler messages:
t103.c:24: Error: no such instruction: `alt-2 %xmm0,4(%esp),%xmm0'
t103.c:22: Error: no such instruction: `alt-1 %xmm0,%xmm1,%xmm0'

The basic idea here is the assembler checks to see whether the second operand (%2) is an XMM register or something else, presumably a memory location. Since the GNU assembler doesn't support much in the way of operations on strings, the second operand is compared to every possible XMM register one at a time in a .rept loop. The isxmm macro is used to paste %xmm and a register number together.

For your specific problem you'd probably need to rewrite it something like this:

__m512
mul_broad(__m512 a, float b) {
    __m512 ret;
    __m512 dummy;
    asm(".ifndef isxmm\n\t"
        ".altmacro\n\t"
        ".macro ifxmm operand, rnum\n\t"
        ".ifc \"\\operand\",\"%%zmm\\rnum\"\n\t"
        ".set isxmm, 1\n\t"
        ".endif\n\t"
        ".endm\n\t"
        ".endif\n\t"
        ".set isxmm, 0\n\t"
        ".set regnum, 0\n\t"
        ".rept 32\n\t"
        "ifxmm <%[b]>, %%regnum\n\t"
        ".set regnum, regnum + 1\n\t"
        ".endr\n\t"
        ".if isxmm\n\t"
        "vbroadcastss %x[b], %[b]\n\t"
        "vmulps %[a], %[b], %[ret]\n\t"
        ".else\n\t"
        "vmulps %[b] %{1to16%}, %[a], %[ret]\n\t"
        "# dummy = %[dummy]\n\t"
        ".endif\n\t"
        : [ret] "=x,x" (ret), [dummy] "=xm,x" (dummy)
        : [a] "x,xm" (a), [b] "m,[dummy]" (b));
    return ret;
}
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!