In C, why is signed int
faster than unsigned int
? True, I know that this has been asked and answered multiple times on this website (links below). However, most people said that there is no difference. I have written code and accidentally found a significant performance difference.
Why would the "unsigned" version of my code be slower than the "signed" version (even when testing the same number)? (I have a x86-64 Intel processor).
Similar links
Compile Command: gcc -Wall -Wextra -pedantic -O3 -Wl,-O3 -g0 -ggdb0 -s -fwhole-program -funroll-loops -pthread -pipe -ffunction-sections -fdata-sections -std=c11 -o ./test ./test.c && strip --strip-all --strip-unneeded --remove-section=.note --remove-section=.comment ./test
signed int
version
NOTE: There is no difference if I explicitly declare signed int
on all numbers.
int isprime(int num) {
// Test if a signed int is prime
int i;
if (num % 2 == 0 || num % 3 == 0)
return 0;
else if (num % 5 == 0 || num % 7 == 0)
return 0;
else {
for (i = 11; i < num; i += 2) {
if (num % i == 0) {
if (i != num)
return 0;
else
return 1;
}
}
}
return 1;
}
unsigned int
version
int isunsignedprime(unsigned int num) {
// Test if an unsigned int is prime
unsigned int i;
if (num % (unsigned int)2 == (unsigned int)0 || num % (unsigned int)3 == (unsigned int)0)
return 0;
else if (num % (unsigned int)5 == (unsigned int)0 || num % (unsigned int)7 == (unsigned int)0)
return 0;
else {
for (i = (unsigned int)11; i < num; i += (unsigned int)2) {
if (num % i == (unsigned int)0) {
if (i != num)
return 0;
else
return 1;
}
}
}
return 1;
}
Test this in a file with the below code:
int main(void) {
printf("%d\n", isprime(294967291));
printf("%d\n", isprime(294367293));
printf("%d\n", isprime(294967293));
printf("%d\n", isprime(294967241)); // slow
printf("%d\n", isprime(294967251));
printf("%d\n", isprime(294965291));
printf("%d\n", isprime(294966291));
printf("%d\n", isprime(294963293));
printf("%d\n", isprime(294927293));
printf("%d\n", isprime(294961293));
printf("%d\n", isprime(294917293));
printf("%d\n", isprime(294167293));
printf("%d\n", isprime(294267293));
printf("%d\n", isprime(294367293)); // slow
printf("%d\n", isprime(294467293));
return 0;
}
Results (time ./test
):
Signed - real 0m0.949s
Unsigned - real 0m1.174s
Your question is genuinely intriguing as the unsigned version consistently produces code that is 10 to 20% slower. Yet there are multiple problems in the code:
- Both functions return
0
for2
,3
,5
and7
, which is incorrect. - The test
if (i != num) return 0; else return 1;
is completely useless as the loop body is only run fori < num
. Such a test would be useful for the small prime tests but special casing them is not really useful. - the casts in the unsigned version are redundant.
- benchmarking code that produces textual output to the terminal is unreliable, you should use the
clock()
function to time CPU intensive functions without any intervening I/O. - the algorithm for prime testing is utterly inefficient as the loop runs
num / 2
times instead ofsqrt(num)
.
Let's simplify the code and run some precise benchmarks:
#include <stdio.h>
#include <time.h>
int isprime_slow(int num) {
if (num % 2 == 0)
return num == 2;
for (int i = 3; i < num; i += 2) {
if (num % i == 0)
return 0;
}
return 1;
}
int unsigned_isprime_slow(unsigned int num) {
if (num % 2 == 0)
return num == 2;
for (unsigned int i = 3; i < num; i += 2) {
if (num % i == 0)
return 0;
}
return 1;
}
int isprime_fast(int num) {
if (num % 2 == 0)
return num == 2;
for (int i = 3; i * i <= num; i += 2) {
if (num % i == 0)
return 0;
}
return 1;
}
int unsigned_isprime_fast(unsigned int num) {
if (num % 2 == 0)
return num == 2;
for (unsigned int i = 3; i * i <= num; i += 2) {
if (num % i == 0)
return 0;
}
return 1;
}
int main(void) {
int a[] = {
294967291, 0, 294367293, 0, 294967293, 0, 294967241, 1, 294967251, 0,
294965291, 0, 294966291, 0, 294963293, 0, 294927293, 1, 294961293, 0,
294917293, 0, 294167293, 0, 294267293, 0, 294367293, 0, 294467293, 0,
};
struct testcase { int (*fun)(); const char *name; int t; } test[] = {
{ isprime_slow, "isprime_slow", 0 },
{ unsigned_isprime_slow, "unsigned_isprime_slow", 0 },
{ isprime_fast, "isprime_fast", 0 },
{ unsigned_isprime_fast, "unsigned_isprime_fast", 0 },
};
for (int n = 0; n < 4; n++) {
clock_t t = clock();
for (int i = 0; i < 30; i += 2) {
if (test[n].fun(a[i]) != a[i + 1]) {
printf("%s(%d) != %d\n", test[n].name, a[i], a[i + 1]);
}
}
test[n].t = clock() - t;
}
for (int n = 0; n < 4; n++) {
printf("%21s: %4d.%03dms\n", test[n].name, test[n].t / 1000), test[n].t % 1000);
}
return 0;
}
The code compiled with clang -O2
on OS/X produces this output:
isprime_slow: 788.004ms
unsigned_isprime_slow: 965.381ms
isprime_fast: 0.065ms
unsigned_isprime_fast: 0.089ms
These timings are consistent with the OP's observed behavior on a different system, but show the dramatic improvement caused by the more efficient iteration test: 10000 times faster!
Regarding the question Why is the function slower with unsigned?, let's look at the generated code (gcc 7.2 -O2):
isprime_slow(int):
...
.L5:
movl %edi, %eax
cltd
idivl %ecx
testl %edx, %edx
je .L1
.L4:
addl $2, %ecx
cmpl %esi, %ecx
jne .L5
.L6:
movl $1, %edx
.L1:
movl %edx, %eax
ret
unsigned_isprime_slow(unsigned int):
...
.L19:
xorl %edx, %edx
movl %edi, %eax
divl %ecx
testl %edx, %edx
je .L22
.L18:
addl $2, %ecx
cmpl %esi, %ecx
jne .L19
.L20:
movl $1, %eax
ret
...
.L22:
xorl %eax, %eax
ret
The inner loops are very similar, same number of instructions, similar instructions. Here are however some potential explanations:
cltd
extends the sign of theeax
register into theedx
register, which may be causing an instruction delay becauseeax
is modified by the immediately preceeding instructionmovl %edi, %eax
. Yet this would make the signed version slower than the unsigned one, not faster.- the loops' initial instructions might be misaligned for the unsigned version, but it is unlikely as changing the order in the source code has no effect on the timings.
- Although the register contents are identical for the signed and unsigned division opcodes, it is possible that the
idivl
instruction take fewer cycles than thedivl
instruction. Indeed the signed division operates on one less bit of precision than the unsigned division, but the difference seems quite large for this small change. - I suspect more effort was put into the silicon implementation of
idivl
because signed divisions are more common that unsigned divisions (as measured by years of coding statistics at Intel). - as commented by rcgldr, looking at instruction tables for Intel process, for Ivy Bridge, DIV 32 bit takes 10 micro ops, 19 to 27 cycles, IDIV 9 micro ops, 19 to 26 cycles. The benchmark times are consistent with these timings. The extra micro-op may be due to the longer operands in DIV (64/32 bits) as opposed to IDIV (63/31 bits).
This surprising result should teach us a few lessons:
- optimizing is a difficult art, be humble and procrastinate.
- correctness is often broken by optimizations.
- choosing a better algorithm beats optimization by a long shot.
- always benchmark code, do not trust your instincts.
Because signed integer overflow is undefined, the compiler can make a lot of assumptions and optimizations on code involving signed integers. Unsigned integer overflow is defined to wrap around, so the compiler won't be able to optimize as much. See also http://blog.llvm.org/2011/05/what-every-c-programmer-should-know.html#signed_overflow and http://www.airs.com/blog/archives/120.
From Instruction specification on AMD/Intel we have (for K7):
Instruction Ops Latency Throughput
DIV r32/m32 32 24 23
IDIV r32 81 41 41
IDIV m32 89 41 41
For i7, latency and throughput are the same for IDIVL
and DIVL
, a slight difference exists for the µops.
This may explain the difference as -O3 assembly codes only differ by signedness (DIVL vs IDIVL) on my machine.
Alternative wiki candidate test that may/may not show a significant time difference.
#include <stdio.h>
#include <time.h>
#define J 10
#define I 5
int main(void) {
clock_t c1,c2,c3;
for (int j=0; j<J; j++) {
c1 = clock();
for (int i=0; i<I; i++) {
isprime(294967241);
isprime(294367293);
}
c2 = clock();
for (int i=0; i<I; i++) {
isunsignedprime(294967241);
isunsignedprime(294367293);
}
c3 = clock();
printf("%d %d %d\n", (int)(c2-c1), (int)(c3-c2), (int)((c3-c2) - (c2-c1)));
fflush(stdout);
}
return 0;
}
Sample output
2761 2746 -15
2777 2777 0
2761 2745 -16
2793 2808 15
2792 2730 -62
2746 2730 -16
2746 2730 -16
2776 2793 17
2823 2808 -15
2793 2823 30
来源:https://stackoverflow.com/questions/34165099/in-c-why-is-signed-int-faster-than-unsigned-int