I've had to do this many times in the past, and I've never been satisfied with the results.
Can anyone suggest a fast way of copying a contiguous bit array from a source to a destination, where both may start at arbitrary bit offsets?
Your inner loop takes pieces of two bytes and moves them to a destination byte. That's almost optimal. Here are a few more hints in no particular order:
This is what I ended up doing. (Edit: changed on 8/21/2014 to fix a single-bit copy bug.)
#include <limits.h>
#include <string.h>
#include <stddef.h>
/*
 * PREPARE_FIRST_COPY - make room in the first destination byte and account
 * for the bits that are about to be written into it.
 *
 * The macro takes no arguments; it reads and writes the following names,
 * which must be in scope where it is expanded:
 *
 *   dst                - pointer to the first destination byte (not advanced)
 *   dst_offset_modulo  - bit position inside *dst where writing starts
 *   src_len            - number of bits still to copy (updated)
 *   c                  - candidate byte holding the source bits, already
 *                        shifted to their destination position (may be
 *                        trimmed)
 *   reverse_mask[n]    - expected to be a byte with the n most significant
 *                        bits set
 *   reverse_mask_xor[n]- expected to be the complement of reverse_mask[n]
 *
 * If the remaining length reaches at least to the end of the byte, every
 * bit of *dst from dst_offset_modulo onwards is cleared and src_len is
 * reduced by the number of bits that fit.  Otherwise only the window
 * [dst_offset_modulo, dst_offset_modulo + src_len) is cleared in *dst, the
 * bits of c at or beyond the end of that window are cleared, and src_len
 * becomes 0.
 *
 * The macro does not clear the bits of c below dst_offset_modulo, and it
 * does not store c; the expansion site is expected to do both (the store
 * by OR-ing c into *dst afterwards).
 */
#define PREPARE_FIRST_COPY()                                      \
    do {                                                          \
    if (src_len >= (CHAR_BIT - dst_offset_modulo)) {              \
        *dst     &= reverse_mask[dst_offset_modulo];              \
        src_len -= CHAR_BIT - dst_offset_modulo;                  \
    } else {                                                      \
        *dst     &= reverse_mask[dst_offset_modulo]               \
              | reverse_mask_xor[dst_offset_modulo + src_len];    \
         c       &= reverse_mask[dst_offset_modulo + src_len];    \
        src_len = 0;                                              \
    } } while (0)
/*
 * Replace `count` bits of *dst, starting at bit `first`, with the bits of
 * `bits` found at the same positions.  Bit 0 is the most significant bit
 * of the byte.  All other bits of *dst are preserved; bits of `bits`
 * outside the window (including any above the byte width) are ignored.
 *
 * Requires 0 <= first, 0 < count and first + count <= CHAR_BIT.
 */
static void
bitarray_put_bits(unsigned char *dst, unsigned int bits, int first, int count)
{
    unsigned int field = ((unsigned int)UCHAR_MAX >> first)
                       & ~((unsigned int)UCHAR_MAX >> (first + count));

    *dst = (unsigned char)((*dst & ~field) | (bits & field));
}

/*
 * Copy `src_len` contiguous bits from one bit array to another.
 *
 * Bits are numbered from the most significant bit of each byte, so bit
 * offset 0 is the top bit of byte 0 and offset CHAR_BIT is the top bit of
 * byte 1.  Destination bits outside the copied range are left untouched.
 *
 * src_org     - base of the source bit array
 * src_offset  - bit offset of the first bit to read (must be >= 0)
 * src_len     - number of bits to copy; values <= 0 copy nothing
 * dst_org     - base of the destination bit array
 * dst_offset  - bit offset of the first bit to write (must be >= 0)
 *
 * Only source bytes that actually contain bits of the requested range are
 * read, and only destination bytes that receive bits are written (or
 * read).  The source and destination ranges must not overlap.
 */
static void
bitarray_copy(const unsigned char *src_org, int src_offset, int src_len,
                    unsigned char *dst_org, int dst_offset)
{
    const unsigned char *src;
    unsigned char       *dst;
    int                  src_bit, dst_bit;
    int                  first_len, byte_len, tail_len;

    /* Nothing to do; also rejects negative lengths. */
    if (src_len <= 0)
        return;

    src = src_org + (src_offset / CHAR_BIT);
    dst = dst_org + (dst_offset / CHAR_BIT);
    src_bit = src_offset % CHAR_BIT;
    dst_bit = dst_offset % CHAR_BIT;

    /* Number of bits that go into the first destination byte. */
    first_len = CHAR_BIT - dst_bit;
    if (first_len > src_len)
        first_len = src_len;

    if (src_bit == dst_bit) {
        /*
         * Same alignment: after an optional partial first byte the bulk
         * can be copied byte-wise.
         */
        if (src_bit) {
            bitarray_put_bits(dst++, *src++, dst_bit, first_len);
            src_len -= first_len;
        }

        byte_len = src_len / CHAR_BIT;
        tail_len = src_len % CHAR_BIT;

        if (byte_len) {
            memcpy(dst, src, (size_t)byte_len);
            src += byte_len;
            dst += byte_len;
        }
        if (tail_len)
            bitarray_put_bits(dst, *src, 0, tail_len);
    } else {
        int          shift_l, shift_r;  /* shift_l + shift_r == CHAR_BIT */
        unsigned int c;

        /*
         * Begin: fill the first destination byte so that all following
         * writes start on a destination byte boundary.
         */
        if (src_bit > dst_bit) {
            shift_l = src_bit - dst_bit;
            shift_r = CHAR_BIT - shift_l;
            c = (unsigned int)*src++ << shift_l;
            /*
             * The current source byte only holds CHAR_BIT - src_bit bits
             * of the range; touch the next one only if more are needed.
             */
            if (first_len > CHAR_BIT - src_bit)
                c |= (unsigned int)*src >> shift_r;
        } else {
            shift_r = dst_bit - src_bit;
            shift_l = CHAR_BIT - shift_r;
            c = (unsigned int)*src >> shift_r;
        }
        bitarray_put_bits(dst++, c, dst_bit, first_len);
        src_len -= first_len;

        /*
         * Middle: every destination byte is assembled from the low
         * shift_r bits of one source byte and the high shift_l bits of
         * the next.  Both source bytes always contribute here.
         */
        byte_len = src_len / CHAR_BIT;
        while (byte_len-- > 0) {
            c = (unsigned int)*src++ << shift_l;
            c |= (unsigned int)*src >> shift_r;
            *dst++ = (unsigned char)c;
        }

        /*
         * End: copy the remaining bits.  The current source byte still
         * holds shift_r unread bits; read the following byte only when
         * the tail is longer than that.
         */
        tail_len = src_len % CHAR_BIT;
        if (tail_len) {
            c = (unsigned int)*src << shift_l;
            if (tail_len > shift_r)
                c |= (unsigned int)src[1] >> shift_r;
            bitarray_put_bits(dst, c, 0, tail_len);
        }
    }
}
What is optimal will depend upon the target platform. On some platforms without barrel shifters, shifting the whole vector right or left one bit, n times, for n<3, will be the fastest approach (on the PIC18 platform, an 8x-unrolled byte loop to shift left one bit will cost 11 instruction cycles per eight bytes). Otherwise, I like the pattern (note src2 will have to be initialized depending upon what you want done with the end of your buffer)
src1 = *src++; src2 = (src1 shl shiftamount1) | (src2 shr shiftamount2); *dest++ = src2; src2 = *src++; src1 = (src2 shl shiftamount1) | (src1 shr shiftamount2); *dest++ = src1;
That should lend itself to very efficient implementation on an ARM (eight instructions every two words, if registers are available for src, dest, src1, src2, shiftamount1, and shiftamount2). Using more registers would allow faster operation via multi-word load/store instructions. Handling four words would be something like the following (one machine instruction per line, except that the first four lines would together be one instruction, as would the last four lines):
src0 = *src++;
src1 = *src++;
src2 = *src++;
src3 = *src++;
tmp = src0;
src0 = src0 shr shiftamount1
src0 = src0 | src1 shl shiftamount2
src1 = src1 shr shiftamount1
src1 = src1 | src2 shl shiftamount2
src2 = src2 shr shiftamount1
src2 = src2 | src3 shl shiftamount2
src3 = src3 shr shiftamount1
src3 = src3 | tmp shl shiftamount2
*dest++ = src0;
*dest++ = src1;
*dest++ = src2;
*dest++ = src3;
Eleven instructions per 16 bytes rotated.
Your solution looks similar to most I've seen: basically do some unaligned work at the start and end, with the main loop in the middle using aligned accesses. If you really need efficiency and do this on very long bitstreams, I would suggest using something architecture-specific like SSE2 in the main loop.