I was just reading:
Efficiently dividing unsigned value by a power of two, rounding up
and I was wondering what the fastest way to do this in CUDA would be.
Here's an adaptation of a well-performing answer for the CPU:
/**
 * Divides `dividend` by `divisor`, rounding the quotient up, using only
 * a mask, two shifts and two additions instead of an integer division.
 *
 * The result equals ceil(dividend / divisor) provided that `divisor` is a
 * power of two and `lg(divisor)` yields its exact base-2 logarithm.
 * NOTE(review): `lg` is defined outside this snippet; it is presumably an
 * integer log2 -- confirm. Behavior for a zero or non-power-of-two divisor
 * depends on `lg` and is not meaningful as a division.
 *
 * @tparam T        integer type of the operands (intended to be unsigned)
 * @param dividend  value to divide
 * @param divisor   power-of-two divisor
 * @return          dividend / divisor, rounded up
 */
template <typename T>
__device__ T div_by_power_of_2_rounding_up(T dividend, T divisor)
{
    const auto log_2_of_divisor = lg(divisor);
    const auto mask = divisor - 1;
    // (dividend & mask) is the remainder, in [0, mask]. Adding mask reaches
    // divisor only when the remainder is non-zero, so the shift yields
    // exactly 0 (exact division) or 1 (round up).
    const auto correction_for_rounding_up = ((dividend & mask) + mask) >> log_2_of_divisor;
    // Applying the correction to the already-shifted quotient avoids the
    // wrap-around that (dividend + mask) would suffer for large dividends.
    return (dividend >> log_2_of_divisor) + correction_for_rounding_up;
}
I wonder whether one can do much better.
The SASS code (using @RobertCrovella's test kernel) for SM_61 is:
code for sm_61
Function : test(unsigned int, unsigned int)
.headerflags @"EF_CUDA_SM61 EF_CUDA_PTX_SM(EF_CUDA_SM61)"
/* 0x001fd400fe2007f6 */
/*0008*/ MOV R1, c[0x0][0x20]; /* 0x4c98078000870001 */
/*0010*/ IADD R0, RZ, -c[0x0][0x144]; /* 0x4c1100000517ff00 */
/*0018*/ MOV R2, c[0x0][0x144]; /* 0x4c98078005170002 */
/* 0x003fc40007a007f2 */
/*0028*/ LOP.AND R0, R0, c[0x0][0x144]; /* 0x4c47000005170000 */
/*0030*/ FLO.U32 R3, R0; /* 0x5c30000000070003 */
/*0038*/ IADD32I R0, R2, -0x1; /* 0x1c0ffffffff70200 */
/* 0x001fc400fcc017f5 */
/*0048*/ IADD32I R3, -R3, 0x1f; /* 0x1d00000001f70303 */
/*0050*/ LOP.AND R2, R0, c[0x0][0x140]; /* 0x4c47000005070002 */
/*0058*/ IADD R2, R0, R2; /* 0x5c10000000270002 */
/* 0x001fd000fe2007f1 */
/*0068*/ IADD32I R0, -R3, 0x1f; /* 0x1d00000001f70300 */
/*0070*/ MOV R3, c[0x0][0x140]; /* 0x4c98078005070003 */
/*0078*/ MOV32I R6, 0x0; /* 0x010000000007f006 */
/* 0x001fc400fc2407f1 */
/*0088*/ SHR.U32 R4, R2, R0.reuse; /* 0x5c28000000070204 */
/*0090*/ SHR.U32 R5, R3, R0; /* 0x5c28000000070305 */
/*0098*/ MOV R2, R6; /* 0x5c98078000670002 */
/* 0x0003c400fe4007f4 */
/*00a8*/ MOV32I R3, 0x0; /* 0x010000000007f003 */
/*00b0*/ IADD R0, R4, R5; /* 0x5c10000000570400 */
/*00b8*/ STG.E [R2], R0; /* 0xeedc200000070200 */
/* 0x001f8000ffe007ff */
/*00c8*/ EXIT; /* 0xe30000000007000f */
/*00d0*/ BRA 0xd0; /* 0xe2400fffff87000f */
/*00d8*/ NOP; /* 0x50b0000000070f00 */
/* 0x001f8000fc0007e0 */
/*00e8*/ NOP; /* 0x50b0000000070f00 */
/*00f0*/ NOP; /* 0x50b0000000070f00 */
/*00f8*/ NOP; /* 0x50b0000000070f00 */
with FLO
being the "find leading 1" instruction (thanks @tera). Anyway, that is a lot of instructions, even if you ignore the loads from (what looks like) constant memory... The CPU function inspiring this one compiles into just:
tzcnt rax, rsi
lea rcx, [rdi - 1]
shrx rax, rcx, rax
add rax, 1
test rdi, rdi
cmove rax, rdi
(with clang 3.9.0).