atomicCAS for bool implementation

霸气de小男生 提交于 2020-06-09 05:49:12

问题


I'm trying to figure out whether there is a bug in an answer (now deleted) about implementing a CUDA-style atomicCAS for bools. The code from that answer (reformatted):

// Attempted compare-and-swap for a single bool, emulated with the 32-bit
// atomicCAS on the 4-byte-aligned word that contains *address.
//
// NOTE(review): `compare` is never read in this body, so the target byte is
// updated from `val` regardless of its current value. The function therefore
// does not implement compare-and-swap semantics.
//
// Returns whether the bool's byte was non-zero in the word observed by the
// successful 32-bit CAS (i.e. the value before the update).
static __inline__ __device__ bool atomicCAS(bool *address, bool compare, bool val)
{
    unsigned long long addr = (unsigned long long)address;
    unsigned pos = addr & 3;  // byte position within the int
    int *int_addr = (int *)(addr - pos);  // int-aligned address
    int old = *int_addr, assumed, ival;

    // Retry until the word is swapped without another thread having changed
    // it between our snapshot (`assumed`) and the CAS.
    do
    {
        assumed = old;
        if(val)
            ival = old | (1 << (8 * pos));  // set bit 0 of the bool's byte
        else
            ival = old & (~((0xFFU) << (8 * pos)));  // clear the bool's byte
        old = atomicCAS(int_addr, assumed, ival);
    } while(assumed != old);

    // Extract the bool's byte from the pre-update word.
    return (bool)(old & ((0xFFU) << (8 * pos)));
}

According to the documentation, atomicCAS should set *address to (*address == compare ? val : *address), but in the implementation above the compare argument is never used!

The code I use to reproduce the bug:

#include <cstdio>

// atomicCAS definition here

// Device-global flag that the repro kernel operates on.
__device__ bool b;


// Repro kernel: resets b to false, then issues a bool atomicCAS whose
// `compare` value (true) does not match b, and prints b afterwards.
// Written for the single-thread launch in main; the plain store to b is not
// synchronized against other threads.
__global__ void kernel()
{
    b = false;
    atomicCAS(&b, true, true); // CAS semantics: b = (b == true ? true : b); b is false, so it must stay false
    printf("%d\n", b); // a correct atomicCAS leaves b false => expected output is 0
}


// Launches the repro kernel with a single thread and waits for it to finish.
// Returns 0 on success, or 1 (after printing the CUDA error to stderr) if the
// launch or the kernel execution failed.
int main()
{
    kernel<<<1, 1>>>();

    // A kernel launch does not return a status: launch-configuration errors
    // are reported by cudaGetLastError(), and faults raised while the kernel
    // runs are reported by the next synchronizing call.
    cudaError_t err = cudaGetLastError();
    if(err == cudaSuccess)
        err = cudaDeviceSynchronize();  // also flushes the device-side printf output

    if(err != cudaSuccess)
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }

    return 0;
}

The expected output is 0, but the actual output is 1.

I have a suggestion about how to fix it but am not 100% sure it's thread-safe:

// Proposed compare-and-swap for a single bool, emulated with the 32-bit
// atomicCAS on the 4-byte-aligned word that contains *address.
//
// NOTE(review): the `compare` check below reads *address directly, which is a
// separate memory read from the `old` snapshot that the 32-bit CAS validates.
// The decision to swap and the swap itself are therefore not based on the
// same observation of the byte.
//
// Returns whether the bool's byte is non-zero in the last word held in `old`.
static __inline__ __device__ bool atomicCAS(bool *address, bool compare, bool val)
{
    unsigned long long addr = (unsigned long long)address;
    unsigned pos = addr & 3;  // byte position within the int
    int *int_addr = (int *)(addr - pos);  // int-aligned address
    int old = *int_addr, assumed, ival;

    do
    {
        if(*address != compare) // If the bool does not hold the expected value, then
            break; // stop trying to update it and just return its current value

        assumed = old;
        if(val)
            ival = old | (1 << (8 * pos));  // set bit 0 of the bool's byte
        else
            ival = old & (~((0xFFU) << (8 * pos)));  // clear the bool's byte
        old = atomicCAS(int_addr, assumed, ival);
    } while(assumed != old);

    // Extract the bool's byte from the last observed word.
    return (bool)(old & ((0xFFU) << (8 * pos)));
}

My questions are

  1. Is there a bug in the first code sample from the answer? If there is,
  2. Does the last code sample fix it thread-safely?

回答1:


Many many thanks to @RobertCrovella; the first code sample does contain a bug, the second does fix it, but is not thread-safe (see question comments for details). The thread-safe fix:

// Atomic compare-and-swap for a single bool.
//
// Hardware atomicCAS works on 32-bit words at the narrowest here, so the
// operation is emulated with a CAS loop on the 4-byte-aligned word that
// contains *address; the other three bytes of that word are written back
// unchanged.
//
// Parameters:
//   address - the bool to update
//   compare - the value *address is expected to hold
//   val     - the value to store if *address == compare
//
// Returns the value of *address observed by the operation: equal to `compare`
// when the swap was performed, different from `compare` when it was not.
static __inline__ __device__ bool atomicCAS(bool *address, bool compare, bool val)
{
    unsigned long long addr = (unsigned long long)address;
    unsigned pos = addr & 3;  // byte position within the 32-bit word
    unsigned int *word_addr = (unsigned int *)(addr - pos);  // word-aligned address

    const unsigned int shift = 8 * pos;
    const unsigned int byte_mask = 0xFFU << shift;  // selects the bool's byte
    // Always store a canonical 0/1 byte rather than OR-ing a bit into
    // whatever the byte previously contained.
    const unsigned int new_byte = (val ? 1U : 0U) << shift;

    // Volatile load: when the mismatch path below is taken no atomic is
    // executed, so a plain load could be reused by the compiler if this
    // inline function is called repeatedly in a polling loop.
    unsigned int old = *(volatile unsigned int *)word_addr;
    unsigned int assumed;

    bool current_value;

    do
    {
        // Decide from the same snapshot that the CAS below validates.
        current_value = (old & byte_mask) != 0;

        if(current_value != compare) // The bool does not hold the expected value:
            break; // leave it untouched and report what was observed

        assumed = old;
        // Fails (and returns the fresh word) if any byte of the word changed
        // since the snapshot; in that case re-evaluate with the new word.
        old = atomicCAS(word_addr, assumed, (assumed & ~byte_mask) | new_byte);
    } while(assumed != old);

    return current_value;
}


来源:https://stackoverflow.com/questions/62091548/atomiccas-for-bool-implementation

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!