How to get gcc to generate decent code that checks if a buffer is full of NUL bytes?

问题

I'm implementing a program that parses tape archives. Part of the parser logic is checking for an end-of-archive marker which is a 512-byte block full of NUL bytes. I wrote the following code for this purpose, expecting gcc to optimize this well:

/*
 * Report whether the 512-byte block at usth is an end-of-archive marker,
 * i.e. consists solely of NUL bytes.
 *
 * Returns 1 if every byte is '\0', otherwise 0. Scanning stops at the
 * first non-NUL byte.
 */
int is_eof_block(const char usth[static 512])
{
    const char *cursor = usth;
    const char *const limit = usth + 512;

    while (cursor != limit) {
        if (*cursor != '\0')
            return 0;
        cursor++;
    }

    return 1;
}

But to my surprise, gcc still generates terrible code for that, even though I explicitly allow it to access the whole 512 bytes in the buffer:

# Compiler output for the byte-wise C loop: one byte is compared per iteration.
is_eof_block:
    # %rax = address one past the last byte (%rdi + 512)
    leaq    512(%rdi), %rax
    # enter the loop at the compare
    jmp .L239
    .p2align 4,,10
.L243:
    # advance to the next byte; leave via .L242 once the end is reached
    addq    $1, %rdi
    cmpq    %rax, %rdi
    je  .L242
.L239:
    # current byte is zero -> keep scanning
    cmpb    $0, (%rdi)
    je  .L243
    # non-zero byte found: return 0
    xorl    %eax, %eax
    ret
    .p2align 4,,10
.L242:
    # all 512 bytes were zero: return 1
    movl    $1, %eax
    ret

I expected gcc to generate something like this or even SIMD code:

# Hand-written variant: scans the block eight bytes at a time.
# NOTE(review): presumably relies on the direction flag being clear so the
# scan runs forward -- confirm against the target ABI.
is_eof_block:
    # repeat count: 64 quadwords = 512 bytes
    mov $64,%ecx
    # %rax = 0: the value to compare against, and a cleared return register
    xor %eax,%eax
    # compare %rax with successive quadwords at (%rdi) while they are equal
    repz scasq
    # %al = 1 if the last comparison was equal (no non-zero quadword seen)
    setz %al
    ret

How can I rewrite the code such that it is still portable (as in: does not use non-C99 language extensions and works on architectures that do not support misaligned memory access) but compiles to better machine code on common architectures such as amd64 and AArch32?

Benchmark

I wrote the following microbenchmark to demonstrate the time difference. You can define MISALIGNED to a positive integer to test with misaligned buffers.

benchmark.c

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Number of is_eof_block() calls timed in each phase of the benchmark. */
#define TESTS 10000000
/* Byte offset into testarray at which the tested block starts. Define it to
 * a positive integer at build time to benchmark a misaligned buffer. */
#ifndef MISALIGNED
# define MISALIGNED 0
#endif

/* Backing storage: the 512-byte block plus MISALIGNED bytes of lead-in.
 * Being a file-scope object, it starts out zero-filled. */
char testarray[512 + MISALIGNED];

/* Function under test; its definition is linked in from a separate C or
 * assembly file. */
extern int is_eof_block(const char[static 512]);

/*
 * Benchmark driver. Times TESTS calls of is_eof_block() on an all-NUL block,
 * then TESTS calls on a block that holds exactly one non-NUL byte whose
 * position changes on every iteration. Timings are written to stderr.
 *
 * Returns EXIT_FAILURE as soon as is_eof_block() gives a wrong answer,
 * EXIT_SUCCESS otherwise.
 */
int main()
{
    size_t iter, pos;
    clock_t start, stop;

    fprintf(stderr, "testing %d times\n", TESTS);
    fprintf(stderr, "no byte set to 1... ");
    start = clock();

    /* Phase 1: the block is entirely NUL, so every call must report "EOF". */
    iter = 0;
    while (iter < TESTS) {
        if (is_eof_block(testarray + MISALIGNED) == 0) {
            fprintf(stderr, "\nWrong test result in iteration %zu!\n", iter);
            return EXIT_FAILURE;
        }
        iter++;
    }

    stop = clock();
    fprintf(stderr, "%fs\n", (stop - start) / (double)CLOCKS_PER_SEC);

    fprintf(stderr, "with non-null byte... ");
    start = clock();

    /* Phase 2: clear the previously marked byte, step the position by 47
     * (wrapping within the 512-byte block) and mark the new byte, so each
     * call sees a single non-NUL byte and must report "not EOF". */
    pos = 0;
    for (iter = 0; iter < TESTS; iter++) {
        testarray[MISALIGNED + pos] = '\0';
        pos = (pos + 47) & 511;
        testarray[MISALIGNED + pos] = '1';

        if (is_eof_block(testarray + MISALIGNED) != 0) {
            fprintf(stderr, "\nWrong test result in iteration %zu!\n", iter);
            return EXIT_FAILURE;
        }
    }

    stop = clock();
    fprintf(stderr, "%fs\n", (stop - start) / (double)CLOCKS_PER_SEC);

    return EXIT_SUCCESS;
}

is_eof_block_c.c

#include <stddef.h>

/*
 * Check a 512-byte block for the end-of-archive pattern (all NUL bytes).
 *
 * Bytes are examined in ascending order; the function returns 0 at the
 * first non-NUL byte and 1 if none is found.
 */
int is_eof_block(const char test[static 512])
{
    const char *next = test;
    size_t remaining = 512;

    while (remaining-- > 0) {
        if (*next++ != '\0')
            return 0;
    }

    return 1;
}

is_eof_block_asm.s

    # Assembly implementation of is_eof_block used by the benchmark:
    # scans the 512-byte block eight bytes at a time.
    .text
    .globl is_eof_block
    .type is_eof_block,@function

    .align 16
is_eof_block:
    # repeat count: 64 quadwords = 512 bytes
    mov $64,%ecx
    # %rax = 0: the value to compare against, and a cleared return register
    xor %eax,%eax
    # compare %rax with successive quadwords at (%rdi) while they are equal
    repz scasq
    # %al = 1 if the last comparison was equal (no non-zero quadword seen)
    setz %al
    ret
    .size is_eof_block,.-is_eof_block

Here is the output with the C implementation of is_eof_block linked in:

testing 10000000 times
no byte set to 1... 2.281250s
with non-null byte... 1.195312s

and here is the assembly version:

testing 10000000 times
no byte set to 1... 0.476562s
with non-null byte... 0.320312s

Both have been compiled with gcc 5, with -O3 as the sole optimization option. Passing various -march=... flags didn't change the code. The difference is about a factor of four. With a misaligned buffer, the assembly implementation is roughly 3% slower, whereas the C implementation shows no difference.

回答1:

Here's a version that touches every byte and seems to be 2-3x faster than the original function in your test harness (I'm not convinced it reflects reality accurately):

/*
 * Branch-free variant: ORs all 512 bytes together and tests the result once,
 * so every byte is always read (there is no early exit).
 *
 * Returns 1 if the block contains only NUL bytes, 0 otherwise.
 */
int
is_eof_block1(const char usth[static 512])
{
        const char *p = usth;
        const char *const end = usth + 512;
        int acc = 0;

        while (p < end)
                acc |= *p++;

        return !acc;
}

Here's a version that optimizes for readability and for not wasting people's time, instead of trying to outclever the people who wrote your compiler/libc (it's much faster than your assembler version, at least on my machine):

/*
 * Report whether the 512-byte block at usth consists solely of NUL bytes by
 * comparing it against a zero-filled reference block with memcmp()
 * (declared in <string.h>).
 *
 * Returns 1 if every byte is NUL, 0 otherwise.
 */
int
is_eof_block2(const char usth[static 512])
{
        /* Static storage duration, so the array is implicitly zero-filled.
         * The storage-class specifier comes first: writing it after a
         * qualifier ("const static") is an obsolescent form in C. */
        static const char zero_block[512];

        return memcmp(usth, zero_block, sizeof zero_block) == 0;
}

回答2:

Here is one version which (naively) believes that the compiler will do the best possible job if you give it one of the stdint.h _fast types:

#include <stdint.h>
#include <stdio.h>

// Word type used to scan the block. It is at least 16 bits wide because it
// also has to hold values up to 512, which does not fit in 8 bits.
typedef uint_fast16_t fast_t;

// Number of fast_t words that make up one 512-byte block.
#define FAST_SIZE (512/sizeof(fast_t))

// One 512-byte block, viewable either as raw bytes or as fast_t words.
// A union is used so that reading the bytes as words does not run into
// aliasing problems.
typedef union
{
  char   usth [512];
  fast_t fast [FAST_SIZE];

} block_t;


// Compile-time sanity checks: the block must divide evenly into fast_t
// words, and the union must not be padded beyond 512 bytes.
_Static_assert(512%sizeof(fast_t) == 0, "This should never happen");
_Static_assert(sizeof(block_t) == 512,  "Padding gone crazy");


// Returns 1 when every fast_t word of *block is zero, and 0 as soon as a
// non-zero word is found. Walks the word view of the block with a pointer.
int is_eof_block(const block_t* block)
{
  const fast_t* word = block->fast;
  const fast_t* const stop = block->fast + FAST_SIZE;

  while(word < stop)
  {
    if(*word++ != 0)
      return 0;
  }

  return 1;
}


// Smoke test: prints the result of is_eof_block() for a zero-initialized block.
int main (void)
{
  block_t zeroed = {0};
  const int verdict = is_eof_block(&zeroed);

  printf("%d", verdict);
  return 0;
}

The loop can be replaced with array + iterator instead of pointer arithmetic. Might be faster or slower, I haven't benchmarked it.

EDIT:

Array + iterator version. This is why I used uint_fast16_t: I was hoping that "fast_t" would do a better job than size_t as the index type, and it has to be at least large enough to hold the value 512.

// Returns 1 when every fast_t word of *block is zero, and 0 as soon as a
// non-zero word is found. Indexes the word view of the block.
int is_eof_block(const block_t* block)
{
  fast_t idx = 0;

  while(idx < FAST_SIZE)
  {
    if(block->fast[idx])
      return 0;
    ++idx;
  }

  return 1;
}

回答3:

Since the block is known to be 512 bytes, fetch each 8-byte group into a uint64_t, then test it against zero. That should cut down on the loop overhead.

A possible workaround for your alignment issue would be to copy the buffer into a local struct.

/*
 * Local copy target for a 512-byte block, aligned so that the buffer can be
 * read in unsigned long long sized words.
 *
 * The alignment is requested explicitly with _Alignas (C11). An unnamed
 * zero-width bit-field does not achieve this reliably: whether it affects
 * the alignment of the enclosing struct is ABI-specific, and unsigned long
 * long is not a bit-field type that every implementation has to accept.
 */
struct x
{
    _Alignas(unsigned long long) char buffer[512];
};

That would give you an aligned buffer to work with.

回答4:

Due to the genuinely helpful comments to the question, I have decided to go with the original C code. Thanks all of you for your help!

来源：https://stackoverflow.com/questions/35132492/how-to-get-gcc-to-generate-decent-code-that-checks-if-a-buffer-is-full-of-nul-by

标签

gcc

x86

micro-optimization