问题
I'm implementing a program that parses tape archives. Part of the parser logic is checking for an end-of-archive marker which is a 512-byte block full of NUL bytes. I wrote the following code for this purpose, expecting gcc to optimize this well:
int is_eof_block(const char usth[static 512])
{
size_t i;
for (i = 0; i < 512; i++)
if (usth[i] != '\0')
return 0;
return 1;
}
But to my surprise, gcc still generates terrible code for that, even though I explicitly allow it to access the whole 512 bytes in the buffer:
is_eof_block:
leaq 512(%rdi), %rax
jmp .L239
.p2align 4,,10
.L243:
addq $1, %rdi
cmpq %rax, %rdi
je .L242
.L239:
cmpb $0, (%rdi)
je .L243
xorl %eax, %eax
ret
.p2align 4,,10
.L242:
movl $1, %eax
ret
I expected gcc to generate something like this or even SIMD code:
is_eof_block:
mov $64,%ecx
xor %eax,%eax
repz scasq
setz %al
ret
How can I rewrite the code such that it is still portable (as in: does not use non-C99 language extensions and works on architectures that do not support misaligned memory access) but compiles to better machine code on common architectures such as amd64 and AArch32?
Benchmark
I wrote the following microbenchmark to demonstrate the time difference. You can define MISALIGNED
to a positive integer to test with misaligned buffers.
benchmark.c
#include <stdio.h>
#include <time.h>
#define TESTS 10000000
#ifndef MISALIGNED
# define MISALIGNED 0
#endif
char testarray[512 + MISALIGNED];
extern int is_eof_block(const char[static 512]);
int main()
{
size_t i, j;
clock_t begin, end;
fprintf(stderr, "testing %d times\n", TESTS);
fprintf(stderr, "no byte set to 1... ");
begin = clock();
for (i = 0; i < TESTS; i++)
if (!is_eof_block(testarray + MISALIGNED)) {
fprintf(stderr, "\nWrong test result in iteration %zu!\n", i);
return EXIT_FAILURE;
}
end = clock();
fprintf(stderr, "%fs\n", (end - begin) / (double)CLOCKS_PER_SEC);
fprintf(stderr, "with non-null byte... ");
begin = clock();
for (i = j = 0; i < TESTS; i++) {
testarray[MISALIGNED + j] = '\0';
j = (j + 47) & 511;
testarray[MISALIGNED + j] = '1';
if (is_eof_block(testarray + MISALIGNED)) {
fprintf(stderr, "\nWrong test result in iteration %zu!\n", i);
return EXIT_FAILURE;
}
}
end = clock();
fprintf(stderr, "%fs\n", (end - begin) / (double)CLOCKS_PER_SEC);
return EXIT_SUCCESS;
}
is_eof_block_c.c
#include <stddef.h>
int is_eof_block(const char test[static 512])
{
size_t i;
for (i = 0; i < 512; i++)
if (test[i] != '\0')
return 0;
return 1;
}
is_eof_block_asm.s
.text
.globl is_eof_block
.type is_eof_block,@function
.align 16
is_eof_block:
mov $64,%ecx
xor %eax,%eax
repz scasq
setz %al
ret
.size is_eof_block,.-is_eof_block
Here is the output with the C implementation of is_eof_block
linked in:
testing 10000000 times
no byte set to 1... 2.281250s
with non-null byte... 1.195312s
and here is the assembly version:
testing 10000000 times
no byte set to 1... 0.476562s
with non-null byte... 0.320312s
Both have been compiled with a gcc 5 with the sole optimization option being -O3
. Passing various -march=...
flags didn't change the code. The difference is about a factor of four. With a misaligned buffer, the assembly implementation is roughly 3% slower whereas there is no difference with the C implementation.
回答1:
Here's a version that touches every byte and seems to be 2-3x faster than the original function in your test harness (I'm not convinced it reflects reality accurately):
int
is_eof_block1(const char usth[static 512])
{
unsigned int i;
int res = 0;
for (i = 0; i < 512; i++)
res |= usth[i];
return res == 0;
}
Here's a version that optimizes for readability and not wasting peoples time and trying to outclever the people who wrote your compiler/libc (it's much faster than your assembler, at least on my machine):
int
is_eof_block2(const char usth[static 512])
{
const static char foo[512];
return !memcmp(usth, foo, sizeof(foo));
}
回答2:
Here is one version which (naively) believes that the compiler will do the best possible job if you give it one of the stdint.h _fast
types:
#include <stdint.h>
#include <stdio.h>
typedef uint_fast16_t fast_t; // 16 since 512 can't fit in 8 bits
#define FAST_SIZE (512/sizeof(fast_t))
typedef union // union to guarantee there's no aliasing mishaps
{
char usth [512];
fast_t fast [FAST_SIZE];
} block_t;
// misc sanity checks:
_Static_assert(512%sizeof(fast_t) == 0, "This should never happen");
_Static_assert(sizeof(block_t) == 512, "Padding gone crazy");
int is_eof_block(const block_t* block)
{
for(const fast_t* i=&block->fast[0]; i<block->fast+FAST_SIZE; i++)
{
if(*i != 0)
return 0;
}
return 1;
}
int main (void)
{
block_t block = {0};
printf("%d", is_eof_block(&block));
}
The loop can be replaced with array + iterator instead of pointer arithmetic. Might be faster or slower, I haven't benchmarked it.
EDIT:
Array + iterator version. Which is why I used uint_fast16_t
- I was hoping that "fast_t
" would do a better job than size_t
and then it has to be at least large enough to contain the value 512.
int is_eof_block(const block_t* block)
{
for(fast_t i=0; i<FAST_SIZE; i++)
{
if(block->fast[i] != 0)
return 0;
}
return 1;
}
回答3:
Since the block is known to be 512 bytes fetch each 16 byte group into an UInt64, then test against zero. That should cut down on the loop overhead.
A possible workaround for your alignment issue would be to copy the buffer into a local struct.
struct x
{
unsigned long long :0;
char buffer[512];
};
That would give you an aligned buffer to work with.
回答4:
Due to the genuinely helpful comments to the question, I have decided to go with the original C code. Thanks all of you for your help!
来源:https://stackoverflow.com/questions/35132492/how-to-get-gcc-to-generate-decent-code-that-checks-if-a-buffer-is-full-of-nul-by