-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathsync2.cu
135 lines (106 loc) · 3.12 KB
/
sync2.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include "repeat.h"
// Microbenchmark kernel: records clock() readings around __syncthreads()
// calls that are issued from different branches by different groups of
// threads, so the host can inspect how the barriers were paired up.
//
// Parameters:
//   ts   - output array, three elements per thread (see the stores at the
//          end of the kernel): a count2[] value followed by a start/stop
//          clock() pair.
//   out  - single-element sink; every thread stores t1 + t2 into out[0].
//          Presumably this only exists to keep the compiler from removing
//          the repeatN arithmetic -- TODO confirm.
//   p1, p2 - seed values for the dummy arithmetic on t1 / t2.
//   its  - not referenced anywhere in this kernel.
//
// The kernel hard-codes 32 as the group size (tid/32, tid%32); the visible
// host code launches it with one block of 2 * warpSize threads.
//
// NOTE(review): the __syncthreads() calls below sit inside branches that are
// not taken uniformly by the block. The CUDA programming guide only defines
// __syncthreads() in conditional code when the condition is identical across
// the whole block, so the outcome here is hardware dependent. The host-side
// banner ("...two warps, one of which is diverged") suggests this is the
// behaviour under test -- confirm before "fixing" it.
__global__ void ksync2_test4 (unsigned int *ts, unsigned int* out, int p1, int p2, int its)
{
// Operands for the filler arithmetic executed via the repeatN macros.
unsigned int t1 = p1;
unsigned int t2 = p2;
// Timestamps default to 0, so a thread that never reaches a given barrier
// reports 0 for that start/stop pair.
unsigned int start_time = 0 , stop_time = 0, start_time2 = 0, stop_time2 = 0;
// Global thread index, and its position within a group of 32 threads.
unsigned int tid = (blockIdx.x*blockDim.x + threadIdx.x);
unsigned int tid2 = tid%32;
// count[] is written by threads with tid >= 32 and read by threads with
// tid < 32; count2[] holds the value each of the first 32 threads observed.
__shared__ volatile unsigned int count[32];
__shared__ volatile unsigned int count2[32];
// Threads whose indices differ by a multiple of 32 clear the same slot.
count[tid2] = 0;
count2[tid2] = 0;
// Block-wide barrier executed unconditionally by every thread.
__syncthreads();
if (tid/32 == 0)
{
// First 32 threads (warp 0 when the warp size is 32): split into two halves
// that each time one __syncthreads() at a different point in their path.
if (tid < 16)
{
// Threads 0..15: time the barrier first, then run the filler work.
start_time = clock();
__syncthreads();
stop_time = clock();
// repeat16 comes from "repeat.h" (not visible here); presumably it expands
// its argument 16 times -- TODO confirm.
repeat16(t1&=t2;t2&=t1;)
// Snapshot whatever the other group has stored in count[] at this point.
count2[tid2] = count[tid];
}
else
{
// Threads 16..31: run the filler work first, then time the barrier.
repeat16(t1|=t2;t2|=t1;)
start_time = clock();
__syncthreads();
stop_time = clock();
// Snapshot count[] after this half's barrier.
count2[tid2] = count[tid];
}
}
else
{
// Threads with tid >= 32: publish a value, time a barrier, publish a second
// value (+100), and time a second barrier.
repeat32(t1+=t2;t2+=t1;)
count[tid2] = tid;
start_time = clock();
__syncthreads();
stop_time = clock();
repeat32(t1+=t2;t2+=t1;)
count[tid2] = tid + 100;
start_time2 = clock();
__syncthreads();
stop_time2 = clock();
}
// All threads store to the same location.
out[0] = (t1 + t2);
// Element 0: this thread's slot of count2[]. There is no barrier between the
// branch above and this read, so threads with tid >= 32 read a slot that
// another thread may still be writing.
ts[(blockIdx.x*blockDim.x + threadIdx.x)*3] = count2[tid2];
// Elements 1 and 2: odd threadIdx.x reports the second timed barrier
// (start_time2/stop_time2), even threadIdx.x reports the first. The second
// pair is only assigned in the tid >= 32 branch, so odd threads below 32
// store 0 here.
ts[(blockIdx.x*blockDim.x + threadIdx.x)*3 + 1] = (threadIdx.x&1) ? start_time2 : start_time;
ts[(blockIdx.x*blockDim.x + threadIdx.x)*3 + 2] = (threadIdx.x&1) ? stop_time2 : stop_time;
}
// Host driver for the ksync2_test4 experiment.
//
// Launches ksync2_test4 with one block of 2 * warpSize threads on device 0,
// copies the per-thread results back, and prints to stdout:
//   - the count2[] value observed by each of the first 32 threads, and
//   - the start/stop clock() pair recorded around selected barriers.
//
// Takes no arguments and returns nothing. Every CUDA call is checked; on any
// failure a diagnostic is printed, device memory allocated so far is
// released, and the function returns without printing results.
void measure_sync2()
{
    unsigned int ts[1024];       // ts, output from kernel. Three elements used per thread.
    unsigned int *d_ts = NULL;
    unsigned int *d_out = NULL;  // Unused memory for storing output
    const size_t ts_elems = sizeof(ts) / sizeof(ts[0]);

    //run two warps
    dim3 Db = dim3(32 * 2);
    dim3 Dg = dim3(1,1,1);

    cudaError_t errcode;
    cudaDeviceProp deviceProp;
    if (cudaSuccess != (errcode = cudaGetDeviceProperties(&deviceProp, 0)))
    {
        printf ("cudaGetDeviceProperties failed %s:%d\n", __FILE__, __LINE__);
        printf (" %s\n", cudaGetErrorString(errcode));
        return;
    }
    int warpsize = deviceProp.warpSize;

    // The kernel stores three values per thread into d_ts, which is sized
    // like ts; refuse to launch if that would overrun the buffer.
    if (warpsize <= 0 || (size_t)warpsize * 2 * 3 > ts_elems)
    {
        printf ("unsupported warp size %d %s:%d\n", warpsize, __FILE__, __LINE__);
        return;
    }

    // Allocate device array.
    if (cudaSuccess != (errcode = cudaMalloc((void**)&d_ts, sizeof(ts))))
    {
        printf ("cudaMalloc failed %s:%d\n", __FILE__, __LINE__);
        printf (" %s\n", cudaGetErrorString(errcode));
        return;
    }
    if (cudaSuccess != (errcode = cudaMalloc((void**)&d_out, 4)))
    {
        printf ("cudaMalloc failed %s:%d\n", __FILE__, __LINE__);
        printf (" %s\n", cudaGetErrorString(errcode));
        cudaFree(d_ts);  // do not leak the first allocation
        return;
    }

    // Clear any stale error state so the checks below refer to this launch.
    cudaGetLastError();

    printf ("Running __syncthreads() tests...\n");
    Db.x = warpsize * 2;
    printf("\nksync2_test4: Shows syncthreads in two warps, one of which is diverged.");
    ksync2_test4 <<<Dg, Db>>>(d_ts, d_out, 4, 6, 2);

    // Launch-configuration errors surface through cudaGetLastError();
    // execution errors surface at the synchronise or the blocking copy.
    errcode = cudaGetLastError();
    if (cudaSuccess == errcode)
        errcode = cudaDeviceSynchronize();
    if (cudaSuccess == errcode)
        errcode = cudaMemcpy(ts, d_ts, sizeof(ts), cudaMemcpyDeviceToHost);
    if (cudaSuccess != errcode)
    {
        // ts was never filled in, so there is nothing meaningful to print.
        printf("\nfailed.\n");
        printf (" %s\n", cudaGetErrorString(errcode));
        cudaFree(d_ts);
        cudaFree(d_out);
        return;
    }

    // Element 0 of each thread's triple is the count2[] value it observed.
    printf("\n count[0..15]: {");
    for (int i = 0; i < warpsize/2 - 1; i++)
    {
        printf("%3u, ", ts[i*3]);
    }
    printf("%3u}\n", ts[15 * 3]);

    printf(" count[16..31]: {");
    for (int i = 16; i < warpsize - 1; i++)
    {
        printf("%3u, ", ts[i*3]);
    }
    printf("%3u}\n", ts[31 * 3]);

    // Elements 1 and 2 are the start/stop clock() values. The kernel makes
    // even threads report their first timed barrier and odd threads their
    // second, hence thread 33 for the "second sync" line.
    printf (" Warp 0 thread 0 sync time: %5u - %5u\n", ts[0*3+1], ts[0*3+2]);
    printf (" Warp 0 thread 16 sync time: %5u - %5u\n", ts[16*3+1], ts[16*3+2]);
    printf (" Warp 1 thread 0 first sync time: %5u - %5u\n", ts[32*3+1], ts[32*3+2]);
    printf (" Warp 1 thread 1 second sync time: %5u - %5u\n", ts[33*3+1], ts[33*3+2]);

    cudaFree(d_ts);
    cudaFree(d_out);
}