Radix Sort Optimization

拟墨画扇 提交于 2019-11-29 08:35:58

The edit 4 version is good enough if the original and temp arrays fit in cache. If the array size is much greater than the cache size, most of the overhead is due to the random-order writes to the arrays. A hybrid MSB/LSB radix sort can avoid this issue. For example, split the array into 256 bins according to the most significant byte, then do an LSB radix sort on each of the 256 bins. The idea here is that a pair (original and temp) of bins will fit within the cache, where random-order writes are not an issue (for most cache implementations).

For an 8 MB cache, the goal is for each of the bins to be < 4 MB in size, i.e. 1 million 32-bit integers, if the integers distribute evenly into the bins. This strategy would work for array sizes up to 256 million 32-bit integers. For larger arrays, the MSB phase could split the array into 1024 bins, for up to 1 billion 32-bit integers. On my system, sorting 16,777,216 (2^24) 32-bit integers with a classic 8,8,8,8 LSB radix sort took 0.45 seconds, while the hybrid 8 MSB : 8,8,8 LSB took 0.24 seconds.

// Sorts one bin by its 3 least significant bytes (defined further down).
// Forward-declared here because RadixSort() calls it before its definition.
void RadixSort3(uint32_t * a, uint32_t *b, size_t count);

// Hybrid MSB/LSB radix sort: sorts the `count` keys in a[] into ascending
// order, in place.
//
// The array is first split into 256 bins according to the most significant
// byte (the bins land in a temporary array of the same size), then each bin
// is sorted on its remaining 3 bytes by RadixSort3(), which writes the
// result back into the matching range of a[].
//
//   a      keys to sort; holds the sorted keys on return
//   count  number of keys in a[] (0 is allowed)
void RadixSort(uint32_t * a, size_t count)
{
size_t aIndex[257] = {0};               // aIndex[k] = start of bin k, aIndex[256] = count
uint32_t * b = new uint32_t [count];    // allocate temp array
    for(size_t i = 0; i < count; i++)   // generate histogram, shifted up one slot
        aIndex[1+((size_t)(a[i] >> 24))]++;
    for(size_t i = 2; i < 257; i++)     // convert to indices (start of each bin)
        aIndex[i] += aIndex[i-1];
    for(size_t i = 0; i < count; i++)   //  sort by msb; advances aIndex[k] to end of bin k
        b[aIndex[a[i]>>24]++] = a[i];
    for(size_t i = 256; i; i--)         // restore aIndex: end of bin k-1 is start of bin k
        aIndex[i] = aIndex[i-1];
    aIndex[0] = 0;
    for(size_t i = 0; i < 256; i++)     // radix sort the 256 bins, from b back into a
        RadixSort3(&b[aIndex[i]], &a[aIndex[i]], aIndex[i+1]-aIndex[i]);
    delete[] b;
}

// Sort a bin by its 3 least significant bytes (the top byte is ignored).
//
//   a      source keys; used as scratch space and overwritten
//   b      destination, at least `count` elements; receives the sorted keys
//   count  number of keys in the bin
//
// Three passes are made, ping-ponging a -> b -> a -> b, so the final pass
// leaves the sorted keys in the caller's b.
void RadixSort3(uint32_t * a, uint32_t *b, size_t count)
{
size_t mIndex[3][256] = {0};            // count / index matrix, one row per byte
    if(count == 0)
        return;
    for(size_t i = 0; i < count; i++){  // generate histograms for all 3 bytes at once
        uint32_t u = a[i];
        for(size_t j = 0; j < 3; j++){
            mIndex[j][(size_t)(u & 0xff)]++;
            u >>= 8;
        }
    }
    for(size_t j = 0; j < 3; j++){      // convert counts to starting indices
        size_t m = 0;
        for(size_t i = 0; i < 256; i++){
            size_t n = mIndex[j][i];
            mIndex[j][i] = m;
            m += n;
        }
    }
    for(size_t j = 0; j < 3; j++){      // radix sort
        for(size_t i = 0; i < count; i++){  //  sort by current lsb
            uint32_t u = a[i];
            size_t m = (size_t)(u>>(j<<3))&0xff;
            b[mIndex[j][m]++] = u;
        }
        std::swap(a, b);                //  swap ptrs (local copies only)
    }
}

Example code for classic lsb radix sorts:

Example C++ lsb radix sort using 8,8,8,8 bit fields:

typedef unsigned int uint32_t;

// Classic LSB radix sort on 8,8,8,8 bit fields.
// Sorts the `count` keys in a[] into ascending order, in place, using one
// temporary buffer of the same size. Four passes ping-pong the data between
// a[] and the buffer, so the last pass writes back into a[].
void RadixSort(uint32_t * a, size_t count)
{
    size_t offsets[4][256] = {0};               // one row of bucket counters per byte
    uint32_t * scratch = new uint32_t [count];  // second buffer for the ping-pong

    // Count every byte of every key in a single sweep.
    for(size_t pos = 0; pos < count; pos++){
        const uint32_t key = a[pos];
        offsets[0][key & 0xff]++;
        offsets[1][(key >> 8) & 0xff]++;
        offsets[2][(key >> 16) & 0xff]++;
        offsets[3][(key >> 24) & 0xff]++;
    }

    // Turn each row of counts into exclusive prefix sums (bucket start positions).
    for(size_t pass = 0; pass < 4; pass++){
        size_t running = 0;
        for(size_t bucket = 0; bucket < 256; bucket++){
            const size_t occupancy = offsets[pass][bucket];
            offsets[pass][bucket] = running;
            running += occupancy;
        }
    }

    // Distribute by each byte in turn, least significant first.
    uint32_t * src = a;
    uint32_t * dst = scratch;
    for(size_t pass = 0; pass < 4; pass++){
        const size_t shift = pass * 8;
        for(size_t pos = 0; pos < count; pos++){
            const uint32_t key = src[pos];
            const size_t bucket = (size_t)(key >> shift) & 0xff;
            dst[offsets[pass][bucket]++] = key;
        }
        std::swap(src, dst);                    // output of this pass feeds the next
    }

    delete[] scratch;
}

Example C++ code using 16,16 bit fields:

typedef unsigned int uint32_t;

// LSB radix sort on 16,16 bit fields.
// Sorts the `count` keys in a[] into ascending order and returns a pointer
// to the sorted data. Two passes ping-pong a -> temp -> a, so the returned
// pointer is the caller's own a.
//
// The count / index matrix has 2 x 65536 entries (1 MiB with an 8-byte
// size_t). It is allocated on the heap rather than as a local array, since a
// table that size can exhaust a small thread stack.
uint32_t * RadixSort(uint32_t * a, size_t count)
{
size_t (*mIndex)[65536] = new size_t[2][65536]();   // count / index matrix, zeroed
uint32_t * b;
    try {
        b = new uint32_t [count];       // allocate temp array
    } catch(...) {
        delete[] mIndex;                // do not leak the matrix if this allocation fails
        throw;
    }
    for(size_t i = 0; i < count; i++){  // generate histograms for both halves
        uint32_t u = a[i];
        for(size_t j = 0; j < 2; j++){
            mIndex[j][(size_t)(u & 0xffff)]++;
            u >>= 16;
        }
    }
    for(size_t j = 0; j < 2; j++){      // convert counts to starting indices
        size_t m = 0;
        for(size_t i = 0; i < 65536; i++){
            size_t n = mIndex[j][i];
            mIndex[j][i] = m;
            m += n;
        }
    }
    for(size_t j = 0; j < 2; j++){      // radix sort
        for(size_t i = 0; i < count; i++){  //  sort by current 16-bit field
            uint32_t u = a[i];
            size_t m = (size_t)(u>>(j<<4))&0xffff;
            b[mIndex[j][m]++] = u;
        }
        std::swap(a, b);                //  swap ptrs
    }
    // After two swaps, a is the caller's array again and b is the temp array.
    delete[] b;
    delete[] mIndex;
    return(a);
}

N & 15 , N & 31 , N & 63 .... and so on , which of these bitwise operations takes least time?

They are the same. Do not take this badly, but optimizing for speed without knowing how long things take may end up quite badly. And even when you know the timing, hardware is very complicated nowadays and quite unpredictable. You program in Java, which is another layer of an insanely complex system. The same code may be faster today and slower tomorrow. You say approximately 2.232891909840167 times faster. In reality, you have a measurement on one hardware and software configuration with one set of data, and you can only hope the measurement is representative enough. Unfortunately, that is not always the case.

I rewrote your function. It is shorter and simpler, yet does not seem to be slower. Compilers tend to like code that is not too clever, as there are many optimizations for simple cases. The correction for negative numbers is not particularly nice; you can delete it if you do not like it. It seems to work best for 8 bits and 11 bits, probably due to cache sizes; have a look at the comments of rcgldr.

EDIT

@ytoamn you are right: if everything is in the first bucket the loop should continue, not break. That was a bug. As for the other changes, I would rather avoid the contract you have now. I think there are three natural contracts for a sorting function. The first is sorting the original array and returning null. The second is sorting the original array and returning it. The third is returning a new sorted array and keeping the original array intact. I like the first one, as its behaviour is unambiguous. The way you have it now, you should add a big warning to the documentation that the original array is changed, and that it is returned from the function in some cases and not in others. The second thing I would avoid is the old C code style. You should define a loop variable in the loop if you need it only there. Defining it globally injects a dependency that may lead to bugs. And it has no advantages here, as properly defined loop variables would share the space in the end anyway. The compiler is well aware of the scope; you should use the smallest scope you need.

EDIT2

Feel free to comment directly under my post :-) Local variables are just addresses on the stack. You allocate memory only when constructing an object, which is not the case here. As for the array, think about this code:

// Demonstrates that assigning to an array parameter only rebinds the local
// reference: the write made before the reassignment goes to the caller's
// array, the write made after it goes to the newly allocated one.
public static void Tst(int[] A) {
    int[] tmp = new int[A.length];
    A[0] = 6; // A still refers to the array that was passed in
    A = tmp; // changes what parameter A contains
    A[0] = 7; // A now refers to tmp, so this write lands in tmp
}

// Driver for the demonstration: passes a one-element array holding 5 to
// Tst and prints its first element afterwards.
public static void main(String[] args) {
    int[] A = {5};
    Tst(A);
    System.out.println(A[0]); //prints 6
}

It prints 6. Number 7 is written into tmp array only. Array A in main is not affected.

/**
 * Sorts {@code A} in place into ascending (signed) order with an LSD radix
 * sort that consumes {@code bits} bits of the key per pass.
 *
 * A pass in which every key falls into bucket 0 is skipped. The pass that
 * covers the sign bit shifts the bucket offsets so that negative keys end up
 * ahead of non-negative ones. If the sorted data finishes in the scratch
 * buffer, it is copied back into the caller's array.
 *
 * @param A    array to sort; holds the sorted values on return
 * @param bits number of key bits handled per pass
 */
protected static void ASC2(int A[], int bits) {
    final int[] original = A;
    final int n = A.length;
    int[] scratch = new int[n];
    final int bucketCount = 1 << bits;
    int[] offsets = new int[bucketCount];
    final int mask = bucketCount - 1;

    for (int shift = 0; shift < 32; shift += bits) {

        // The counters are fresh on the first pass and must be cleared afterwards.
        if (shift != 0) {
            Arrays.fill(offsets, 0);
        }
        for (int value : A) {
            offsets[(value >> shift) & mask]++;
        }

        if (offsets[0] == n) {
            continue; // all in first bucket
        }

        // Walk the buckets from the top down, turning counts into start positions.
        int start = n;
        for (int bucket = bucketCount - 1; bucket >= 0; --bucket) {
            start -= offsets[bucket];
            offsets[bucket] = start;
        }

        if (shift + bits > 31) { // negative numbers correction
            final int half = bucketCount / 2;
            final int nonNegative = offsets[half]; // keys in the lower half of the buckets
            final int negative = n - nonNegative;
            if (negative > 0) {
                // Upper-half buckets (sign bit set) move to the front,
                // lower-half buckets move behind them.
                for (int bucket = 0; bucket < bucketCount; ++bucket) {
                    if (bucket < half) {
                        offsets[bucket] += negative;
                    } else {
                        offsets[bucket] -= nonNegative;
                    }
                }
            }
        }

        for (int value : A) {
            scratch[offsets[(value >> shift) & mask]++] = value;
        }

        // The freshly written buffer becomes the input of the next pass.
        final int[] filled = scratch;
        scratch = A;
        A = filled;
    }

    if (A != original) {
        System.arraycopy(A, 0, original, 0, n);
    }
}

EDIT3

Loop unrolling is a valid technique, and improving the short-circuiting is really nice. But by using array lengths as constants you definitely start to be too clever. If you hard-coded the base size, why not hard-code it all like this:

/**
 * Sorts {@code A} into descending (signed) order with an LSD radix sort,
 * one byte per pass, with all four passes written out by hand.
 *
 * A pass is skipped when every key has the same value in that byte. On the
 * last pass the sign bit of the byte is flipped so that negative keys sort
 * below non-negative ones.
 *
 * WARNING: the passed-in array is used as a working buffer and is modified.
 * The sorted data is in the RETURNED array, which is either the passed-in
 * array or an internally allocated one, depending on how many passes were
 * executed. Always use the return value.
 *
 * @param A array to sort (modified)
 * @return the array holding the values in descending order
 */
protected static int[] DSC2(int A[])// sorts in descending order
{
    if (A.length == 0) {
        return A; // nothing to sort, and A[0] below would be out of bounds
    }

    int tmp[] = new int[A.length];
    int Z[] = new int[256];
    int sample, swap[];

    // 1st LSB byte extraction
    sample = A[0] & 255;
    for (int i = 0; i < A.length; ++i) {
        Z[A[i] & 255]++;
    }

    if (Z[sample] != A.length) {
        // counts -> start positions, highest bucket first (descending order)
        Z[0] = A.length - Z[0];
        for (int i = 1; i < Z.length; ++i) {
            Z[i] = Z[i - 1] - Z[i];
        }

        for (int i = 0; i < A.length; ++i) {
            tmp[Z[A[i] & 255]++] = A[i];
        }

        swap = A;
        A = tmp;
        tmp = swap;
        Arrays.fill(Z, 0);
    } else {
        Z[sample] = 0; // only this counter is non-zero; reset it for the next pass
    }

    // 2nd LSB byte extraction
    sample = (A[0] >> 8) & 255;
    for (int i = 0; i < A.length; ++i) {
        Z[(A[i] >> 8) & 255]++;
    }

    if (Z[sample] != A.length) {
        Z[0] = A.length - Z[0];
        for (int i = 1; i < Z.length; ++i) {
            Z[i] = Z[i - 1] - Z[i];
        }

        for (int i = 0; i < A.length; ++i) {
            tmp[Z[(A[i] >> 8) & 255]++] = A[i];
        }

        swap = A;
        A = tmp;
        tmp = swap;
        Arrays.fill(Z, 0);
    } else {
        Z[sample] = 0;
    }

    // 3rd LSB byte extraction
    sample = (A[0] >> 16) & 255;
    for (int i = 0; i < A.length; ++i) {
        Z[(A[i] >> 16) & 255]++;
    }

    if (Z[sample] != A.length) {
        Z[0] = A.length - Z[0];
        for (int i = 1; i < Z.length; ++i) {
            Z[i] = Z[i - 1] - Z[i];
        }

        for (int i = 0; i < A.length; ++i) {
            tmp[Z[(A[i] >> 16) & 255]++] = A[i];
        }

        swap = A;
        A = tmp;
        tmp = swap;
        Arrays.fill(Z, 0);
    } else {
        Z[sample] = 0;
    }

    // 4th LSB byte extraction
    // This byte carries the sign bit. XOR with 128 flips it, so that bytes of
    // negative keys (128..255) map to buckets 0..127 and bytes of non-negative
    // keys (0..127) map to buckets 128..255; descending bucket order then
    // matches descending signed order.
    sample = ((A[0] >> 24) & 255) ^ 128;
    for (int i = 0; i < A.length; ++i) {
        Z[((A[i] >> 24) & 255) ^ 128]++;
    }

    if (Z[sample] != A.length) {
        Z[0] = A.length - Z[0];
        for (int i = 1; i < Z.length; ++i) {
            Z[i] = Z[i - 1] - Z[i];
        }

        for (int i = 0; i < A.length; ++i) {
            tmp[Z[((A[i] >> 24) & 255) ^ 128]++] = A[i];
        }

        A = tmp;
    }

    return A;
}
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!