Question
I use NVIDIA Visual Profiler to analyze my code. The test kernels are:
//////////////////////////////////////////////////////////////// Group 1
static __global__ void gpu_test_divergency_0(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < 0)
    {
        a[tid] = tid;
    }
    else
    {
        b[tid] = tid;
    }
}
static __global__ void gpu_test_divergency_1(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid == 0)
    {
        a[tid] = tid;
    }
    else
    {
        b[tid] = tid;
    }
}
static __global__ void gpu_test_divergency_2(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= 0)
    {
        a[tid] = tid;
    }
    else
    {
        b[tid] = tid;
    }
}
static __global__ void gpu_test_divergency_3(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid > 0)
    {
        a[tid] = tid;
    }
    else
    {
        b[tid] = tid;
    }
}
//////////////////////////////////////////////////////////////// Group 2
static __global__ void gpu_test_divergency_4(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < 0)
    {
        a[tid] = tid + 1;
    }
    else
    {
        b[tid] = tid + 2;
    }
}
static __global__ void gpu_test_divergency_5(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid == 0)
    {
        a[tid] = tid + 1;
    }
    else
    {
        b[tid] = tid + 2;
    }
}
static __global__ void gpu_test_divergency_6(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= 0)
    {
        a[tid] = tid + 1;
    }
    else
    {
        b[tid] = tid + 2;
    }
}
static __global__ void gpu_test_divergency_7(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid > 0)
    {
        a[tid] = tid + 1;
    }
    else
    {
        b[tid] = tid + 2;
    }
}
//////////////////////////////////////////////////////////////// Group 3
static __global__ void gpu_test_divergency_8(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < 0)
    {
        a[tid] = tid + 1.0;
    }
    else
    {
        b[tid] = tid + 2.0;
    }
}
static __global__ void gpu_test_divergency_9(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid == 0)
    {
        a[tid] = tid + 1.0;
    }
    else
    {
        b[tid] = tid + 2.0;
    }
}
static __global__ void gpu_test_divergency_10(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= 0)
    {
        a[tid] = tid + 1.0;
    }
    else
    {
        b[tid] = tid + 2.0;
    }
}
static __global__ void gpu_test_divergency_11(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid > 0)
    {
        a[tid] = tid + 1.0;
    }
    else
    {
        b[tid] = tid + 2.0;
    }
}
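(For completeness, a minimal host-side harness along these lines is enough to reproduce the launches below; this is only a sketch, with an assumed array size, no error checking, and placed in the same .cu file as the kernels since they are declared static.)
#include <cuda_runtime.h>

int main()
{
    const int N = 64;                                // large enough for both <<< 1, 32 >>> and <<< 1, 64 >>>
    float *d_a = 0, *d_b = 0;
    cudaMalloc((void**)&d_a, N * sizeof(float));
    cudaMalloc((void**)&d_b, N * sizeof(float));

    gpu_test_divergency_0<<< 1, 32 >>>(d_a, d_b);    // likewise for the other kernels / configurations
    cudaDeviceSynchronize();

    cudaFree(d_a);
    cudaFree(d_b);
    return 0;
}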
When I launched the test kernels with <<< 1, 32 >>>, I got the following results from the profiler:
gpu_test_divergency_0 : Branch Efficiency = 100% branch = 1 divergent branch = 0
gpu_test_divergency_1 : Branch Efficiency = 100% branch = 1 divergent branch = 0
gpu_test_divergency_2 : Branch Efficiency = 100% branch = 1 divergent branch = 0
gpu_test_divergency_3 : Branch Efficiency = 100% branch = 1 divergent branch = 0
gpu_test_divergency_4 : Branch Efficiency = 100% branch = 3 divergent branch = 0
gpu_test_divergency_5 : Branch Efficiency = 100% branch = 3 divergent branch = 0
gpu_test_divergency_6 : Branch Efficiency = 100% branch = 2 divergent branch = 0
gpu_test_divergency_7 : Branch Efficiency = 100% branch = 3 divergent branch = 0
gpu_test_divergency_8 : Branch Efficiency = 100% branch = 3 divergent branch = 0
gpu_test_divergency_9 : Branch Efficiency = 75% branch = 4 divergent branch = 1
gpu_test_divergency_10 : Branch Efficiency = 100% branch = 2 divergent branch = 0
gpu_test_divergency_11 : Branch Efficiency = 75% branch = 4 divergent branch = 1
And when I launched the test kernels with <<< 1, 64 >>>, I got the following results from the profiler:
gpu_test_divergency_0 : Branch Efficiency = 100% branch = 2 divergent branch = 0
gpu_test_divergency_1 : Branch Efficiency = 100% branch = 2 divergent branch = 0
gpu_test_divergency_2 : Branch Efficiency = 100% branch = 2 divergent branch = 0
gpu_test_divergency_3 : Branch Efficiency = 100% branch = 2 divergent branch = 0
gpu_test_divergency_4 : Branch Efficiency = 100% branch = 6 divergent branch = 0
gpu_test_divergency_5 : Branch Efficiency = 100% branch = 6 divergent branch = 0
gpu_test_divergency_6 : Branch Efficiency = 100% branch = 4 divergent branch = 0
gpu_test_divergency_7 : Branch Efficiency = 100% branch = 5 divergent branch = 0
gpu_test_divergency_8 : Branch Efficiency = 100% branch = 6 divergent branch = 0
gpu_test_divergency_9 : Branch Efficiency = 85.7% branch = 7 divergent branch = 1
gpu_test_divergency_10 : Branch Efficiency = 100% branch = 4 divergent branch = 0
gpu_test_divergency_11 : Branch Efficiency = 83.3% branch = 6 divergent branch = 1
I use "GeForce GTX 570" with the CUDA Capability of 2.0 and NVIDIA Visual Profiler v4.2 on Linux. According to the documents:
"branch" - "Number of branches taken by threads executing a kernel. This counter will be incremented by one if at least one thread in a warp takes the branch."
"divergent branch" - "Number of divergent branches within a warp. This counter will be incremented by one if at least one tread in a warp diverges (that is, follows a different execution path) via a data dependent conditional branch."
But I am really confused by the results. Why are the "branch" counts different for each test group? And why does only the third test group seem to have the right "divergent branch" count?
@JackOLantern: I compiled in release mode and disassembled it your way. The result for "gpu_test_divergency_4" is exactly the same as yours, but the result for "gpu_test_divergency_0" is different:
Function : _Z21gpu_test_divergency_0PfS_
/*0000*/ /*0x00005de428004404*/ MOV R1, c [0x1] [0x100];
/*0008*/ /*0x94001c042c000000*/ S2R R0, SR_CTAid_X;
/*0010*/ /*0x84009c042c000000*/ S2R R2, SR_Tid_X;
/*0018*/ /*0x20009ca320044000*/ IMAD R2, R0, c [0x0] [0x8], R2;
/*0020*/ /*0xfc21dc23188e0000*/ ISETP.LT.AND P0, pt, R2, RZ, pt;
/*0028*/ /*0x0920de0418000000*/ I2F.F32.S32 R3, R2;
/*0030*/ /*0x9020204340004000*/ @!P0 ISCADD R0, R2, c [0x0] [0x24], 0x2;
/*0038*/ /*0x8020804340004000*/ @P0 ISCADD R2, R2, c [0x0] [0x20], 0x2;
/*0040*/ /*0x0000e08590000000*/ @!P0 ST [R0], R3;
/*0048*/ /*0x0020c08590000000*/ @P0 ST [R2], R3;
/*0050*/ /*0x00001de780000000*/ EXIT;
I guess, as you said, conversion instructions (I2F in this case) do not add an extra branch.
But I cannot see the relationship between this disassembled code and the profiler results. I learned from another post (https://devtalk.nvidia.com/default/topic/463316/branch-divergent-branches/) that the divergent branch counter reflects how the warps actually execute on the SMs at run time. So I guess we cannot deduce the branch divergence of each actual run just from this disassembled code. Am I right?
Answer 1:
FOLLOW UP - USING VOTE INTRINSICS TO CHECK THREAD DIVERGENCE
I think the best way to check for thread divergence within warps is to use the vote intrinsics, in particular __ballot and __popc. A good explanation of __ballot and __popc is available in the book by Shane Cook, CUDA Programming, Morgan Kaufmann.
The prototype of __ballot is the following:
unsigned int __ballot(int predicate);
If predicate is nonzero, __ballot returns a value with the Nth bit set, where N is threadIdx.x. On the other hand, __popc returns the number of bits set within a 32-bit parameter.
So, by jointly using __ballot, __popc and atomicAdd, one can check whether a warp is divergent or not.
To this end, I have set up the following code:
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <iostream>
#include <conio.h>

// Non-intrinsic equivalent of __ballot for devices of compute capability < 2.0:
// each thread contributes only its own bit of the warp mask.
__device__ unsigned int __ballot_non_atom(int predicate)
{
    if (predicate != 0) return (1 << (threadIdx.x % 32));
    else                return 0;
}

__global__ void gpu_test_divergency_0(unsigned int* d_ballot, int Num_Warps_per_Block)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int warp_num = threadIdx.x >> 5;

    // Each warp's entry of d_ballot ends up holding the number of threads in that warp
    // for which the predicate is true.
    atomicAdd(&d_ballot[warp_num + blockIdx.x * Num_Warps_per_Block], __popc(__ballot_non_atom(tid > 2)));
    // atomicAdd(&d_ballot[warp_num + blockIdx.x * Num_Warps_per_Block], __popc(__ballot(tid > 2)));
}

int main(int argc, char *argv[])
{
    unsigned int Num_Threads_per_Block = 64;
    unsigned int Num_Blocks_per_Grid   = 1;
    unsigned int Num_Warps_per_Block   = Num_Threads_per_Block / 32;
    unsigned int Num_Warps_per_Grid    = (Num_Threads_per_Block * Num_Blocks_per_Grid) / 32;

    unsigned int* h_ballot = (unsigned int*)malloc(Num_Warps_per_Grid * sizeof(unsigned int));
    unsigned int* d_ballot; cudaMalloc((void**)&d_ballot, Num_Warps_per_Grid * sizeof(unsigned int));

    for (unsigned int i = 0; i < Num_Warps_per_Grid; i++) h_ballot[i] = 0;

    cudaMemcpy(d_ballot, h_ballot, Num_Warps_per_Grid * sizeof(unsigned int), cudaMemcpyHostToDevice);

    gpu_test_divergency_0<<<Num_Blocks_per_Grid, Num_Threads_per_Block>>>(d_ballot, Num_Warps_per_Block);

    cudaMemcpy(h_ballot, d_ballot, Num_Warps_per_Grid * sizeof(unsigned int), cudaMemcpyDeviceToHost);

    for (unsigned int i = 0; i < Num_Warps_per_Grid; i++) {
        if ((h_ballot[i] == 0) || (h_ballot[i] == 32))
            std::cout << "Warp " << i << " IS NOT divergent - Predicate true for " << h_ballot[i] << " threads\n";
        else
            std::cout << "Warp " << i << " IS divergent - Predicate true for " << h_ballot[i] << " threads\n";
    }

    getch();

    return EXIT_SUCCESS;
}
Please note that I'm currently running the code on a compute capability 1.2 card, so in the example above I'm using __ballot_non_atom, a non-intrinsic equivalent of __ballot, since __ballot is available only for compute capability >= 2.0. In other words, if you have a card with compute capability >= 2.0, please uncomment the line using __ballot in the kernel function.
With this code, you can play with all of your kernel functions above by simply changing the relevant predicate in the kernel function.
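For example (just a sketch; the kernel name is made up, and the expectation assumes a <<< 1, 64 >>> launch), checking the condition from gpu_test_divergency_9 would look like this:
__global__ void gpu_check_divergency_9(unsigned int* d_ballot, int Num_Warps_per_Block)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int warp_num = threadIdx.x >> 5;

    // Same predicate as in gpu_test_divergency_9: only thread 0 of the grid satisfies tid == 0,
    // so warp 0 should be reported as divergent (1 thread true) and warp 1 as uniform (0 threads true).
    atomicAdd(&d_ballot[warp_num + blockIdx.x * Num_Warps_per_Block], __popc(__ballot_non_atom(tid == 0)));
}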
PREVIOUS ANSWER
I compiled your code for compute capability 2.0 in release mode, used -keep to retain the intermediate files, and ran the cuobjdump utility to produce the disassembly of two of your kernels, namely:
static __global__ void gpu_test_divergency_0(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < 0) a[tid] = tid;
    else         b[tid] = tid;
}
and
static __global__ void gpu_test_divergency_4(float *a, float *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < 0) a[tid] = tid + 1;
    else         b[tid] = tid + 2;
}
The results are the following:
gpu_test_divergency_0
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ S2R R0, SR_CTAID.X; /* 0x2c00000094001c04 */
/*0010*/ S2R R2, SR_TID.X; /* 0x2c00000084009c04 */
/*0018*/ IMAD R2, R0, c[0x0][0x8], R2; /* 0x2004400020009ca3 */
/*0020*/ ISETP.LT.AND P0, PT, R2, RZ, PT; /* 0x188e0000fc21dc23 */
/*0028*/ I2F.F32.S32 R0, R2; /* 0x1800000009201e04 */
/*0030*/ @!P0 ISCADD R3, R2, c[0x0][0x24], 0x2; /* 0x400040009020e043 */
/*0038*/ @P0 ISCADD R2, R2, c[0x0][0x20], 0x2; /* 0x4000400080208043 */
/*0040*/ @!P0 ST [R3], R0; /* 0x9000000000302085 */
/*0048*/ @P0 ST [R2], R0; /* 0x9000000000200085 */
/*0050*/ EXIT ; /* 0x8000000000001de7 */
and
gpu_test_divergency_4
/*0000*/ MOV R1, c[0x1][0x100]; /* 0x2800440400005de4 */
/*0008*/ S2R R0, SR_CTAID.X; /* 0x2c00000094001c04 */ R0 = BlockIdx.x
/*0010*/ S2R R2, SR_TID.X; /* 0x2c00000084009c04 */ R2 = ThreadIdx.x
/*0018*/ IMAD R0, R0, c[0x0][0x8], R2; /* 0x2004400020001ca3 */ R0 = R0 * c + R2
/*0020*/ ISETP.LT.AND P0, PT, R0, RZ, PT; /* 0x188e0000fc01dc23 */ If statement
/*0028*/ @P0 BRA.U 0x58; /* 0x40000000a00081e7 */ Branch 1 - Jump to 0x58
/*0030*/ @!P0 IADD R2, R0, 0x2; /* 0x4800c0000800a003 */ Branch 2 - R2 = R0 + 2
/*0038*/ @!P0 ISCADD R0, R0, c[0x0][0x24], 0x2; /* 0x4000400090002043 */ Branch 2 - Calculate gmem address
/*0040*/ @!P0 I2F.F32.S32 R2, R2; /* 0x180000000920a204 */ Branch 2 - R2 = R2 after int to float cast
/*0048*/ @!P0 ST [R0], R2; /* 0x900000000000a085 */ Branch 2 - gmem store
/*0050*/ @!P0 BRA.U 0x78; /* 0x400000008000a1e7 */ Branch 2 - Jump to 0x78 (exit)
/*0058*/ @P0 IADD R2, R0, 0x1; /* 0x4800c00004008003 */ Branch 1 - R2 = R0 + 1
/*0060*/ @P0 ISCADD R0, R0, c[0x0][0x20], 0x2; /* 0x4000400080000043 */ Branch 1 - Calculate gmem address
/*0068*/ @P0 I2F.F32.S32 R2, R2; /* 0x1800000009208204 */ Branch 1 - R2 = R2 after int to float cast
/*0070*/ @P0 ST [R0], R2; /* 0x9000000000008085 */ Branch 1 - gmem store
/*0078*/ EXIT ; /* 0x8000000000001de7 */
From the above disassembly, I would expect the results of your branch divergence tests to be the same.
Are you compiling in debug or release mode?
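As a side note, the disassembly above can be reproduced roughly as follows (the source file name is just a placeholder; any .cu file containing the kernels will do):
nvcc -O3 -arch=sm_20 -keep -c divergency_test.cu -o divergency_test.o
cuobjdump -sass divergency_test.o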
Source: https://stackoverflow.com/questions/19334589/cuda-confusion-about-the-visual-profiler-results-of-branch-and-divergent-br