Insight into the first argument (mask) of __shfl_sync()
// Question: here is the test code for broadcasting a variable.
#include <stdio.h>
#include <cuda_runtime.h>

// Warp-shuffle demo kernel.
//
// Each thread starts with value = 31 - lane_id and then replaces it with the
// value held by the lane whose ID is 2 lower, using __shfl_up_sync.
// Lanes 0 and 1 have no source lane 2 below them, so the shuffle hands them
// back their own value unchanged.
//
// Resulting values per lane:
//   lane 0 -> 31, lane 1 -> 30, lane L (L >= 2) -> 31 - (L - 2) = 33 - L
//
// The mask 0xffffffff names all 32 lanes of the warp as participants; main()
// launches exactly one block of 32 threads, so every named lane is present.
__global__ void broadcast() {
    const unsigned int full_mask = 0xffffffffu;

    // Lane index within the warp (low 5 bits of the thread index).
    int lane_id = threadIdx.x & 0x1f;
    int value = 31 - lane_id;

    // Fetch the value from lane (lane_id - 2); delta = 2.
    int broadcasted_value = __shfl_up_sync(full_mask, value, 2);
    value = broadcasted_value;

    printf("thread %d final value = %d\n", threadIdx.x, value);
}

// Launches the demo kernel on a single 32-thread block and waits for it to
// finish so the device-side printf output is flushed.
// Returns 0 on success, 1 if the launch or the kernel execution failed.
int main() {
    broadcast<<<1, 32>>>();

    // Kernel launches do not return a status; launch-configuration errors are
    // reported through cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Errors raised while the kernel runs surface at the next synchronizing call.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel execution failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    return 0;
}