-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmergeSortCUDA.cu
162 lines (133 loc) · 4.62 KB
/
mergeSortCUDA.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#include <cstdlib>
#include <iostream>
#include <sys/time.h>
/**
* mergesort.cu
* a one-file c++ / cuda program for performing mergesort on the GPU
* While the program execution is fairly slow, most of its runnning time
* is spent allocating memory on the GPU.
* For a more complex program that performs many calculations,
* running on the GPU may provide a significant boost in performance
*/
// helper for main() — declared here; definition not visible in this file
long readList(long**);
// data[], size, threads, blocks
void mergesort(long*, long, dim3, dim3);
// A[], B[], size, width, slices, threads, blocks
__global__ void gpu_mergesort(long*, long*, long, long, long, dim3*, dim3*);
__device__ void gpu_bottomUpMerge(long*, long*, long, long, long);
// Fully parenthesized so arguments containing operators expand correctly:
// the original `(a < b ? a : b)` mis-parses e.g. min(1 & 3, 2) as 1 & (3 < 2).
#define min(a, b) ((a) < (b) ? (a) : (b))
// Entry point: builds a 64-element list of pseudo-random values, sorts it on
// the GPU with a single block of 16 threads, and prints the list before and
// after. Returns 0 on success, 1 if the host buffer cannot be allocated.
int main(int argc, char** argv) {
    dim3 threadsPerBlock;
    dim3 blocksPerGrid;
    threadsPerBlock.x = 16;
    threadsPerBlock.y = 1;
    threadsPerBlock.z = 1;
    blocksPerGrid.x = 1;
    blocksPerGrid.y = 1;
    blocksPerGrid.z = 1;
    long size = 64;
    long* data = (long *)malloc(size * sizeof(long));
    if (data == NULL) {
        // malloc can fail; the original dereferenced the pointer unchecked.
        std::cerr << "Failed to allocate host buffer\n";
        return 1;
    }
    // rand() is deliberately left unseeded so runs are reproducible.
    for (long i = 0; i < size; ++i) {
        data[i] = rand() % 1000;
    }
    std::cout << "Initial array: ";
    for (long i = 0; i < size; i++) {
        std::cout << data[i] << ' ';
    }
    std::cout << "\n\n";
    mergesort(data, size, threadsPerBlock, blocksPerGrid);
    std::cout << "Output: ";
    for (long i = 0; i < size; i++) {
        std::cout << data[i] << ' ';
    }
    std::cout << '\n';
    free(data);  // the original leaked this buffer
    return 0;
}
// Abort with a diagnostic if a CUDA API call failed. Kernel launches return
// no status directly, so the caller checks cudaGetLastError() after a launch
// and cudaDeviceSynchronize() for asynchronous execution errors.
static void cudaCheck(cudaError_t err, const char* what) {
    if (err != cudaSuccess) {
        std::cerr << "CUDA error (" << what << "): "
                  << cudaGetErrorString(err) << '\n';
        std::exit(EXIT_FAILURE);
    }
}

//
// Host-side driver: copies `data` to the GPU, runs a bottom-up mergesort by
// repeatedly doubling the merge width, then copies the sorted result back
// into `data`.
//
// data            - host array, sorted in place on return
// size            - number of elements in `data`
// threadsPerBlock / blocksPerGrid - launch configuration; their product is
//                   the total number of worker threads the list is sliced across
//
void mergesort(long* data, long size, dim3 threadsPerBlock, dim3 blocksPerGrid) {
    //
    // Allocate two arrays on the GPU; we ping-pong between them during the
    // sort instead of copying intermediate results around.
    //
    long* D_data;
    long* D_swp;
    dim3* D_threads;
    dim3* D_blocks;
    cudaCheck(cudaMalloc((void**) &D_data, size * sizeof(long)), "cudaMalloc D_data");
    cudaCheck(cudaMalloc((void**) &D_swp, size * sizeof(long)), "cudaMalloc D_swp");
    // Copy from our input list into the first array
    cudaCheck(cudaMemcpy(D_data, data, size * sizeof(long), cudaMemcpyHostToDevice),
              "cudaMemcpy host -> D_data");
    //
    // Copy the thread / block info to the GPU as well; the kernel reads it to
    // compute each thread's global index.
    //
    cudaCheck(cudaMalloc((void**) &D_threads, sizeof(dim3)), "cudaMalloc D_threads");
    cudaCheck(cudaMalloc((void**) &D_blocks, sizeof(dim3)), "cudaMalloc D_blocks");
    cudaCheck(cudaMemcpy(D_threads, &threadsPerBlock, sizeof(dim3), cudaMemcpyHostToDevice),
              "cudaMemcpy threadsPerBlock");
    cudaCheck(cudaMemcpy(D_blocks, &blocksPerGrid, sizeof(dim3), cudaMemcpyHostToDevice),
              "cudaMemcpy blocksPerGrid");
    long* A = D_data;
    long* B = D_swp;
    long nThreads = threadsPerBlock.x * threadsPerBlock.y * threadsPerBlock.z *
                    blocksPerGrid.x * blocksPerGrid.y * blocksPerGrid.z;
    //
    // Slice up the list and give pieces of it to each thread, letting the
    // pieces grow bigger and bigger until the whole list is sorted.
    // `width` is long (the original used int, which could overflow the
    // `width < (size << 1)` comparison for large sizes).
    //
    for (long width = 2; width < (size << 1); width <<= 1) {
        long slices = size / (nThreads * width) + 1;
        // Merge width-sized runs from A into B, then swap roles for the next pass.
        gpu_mergesort<<<blocksPerGrid, threadsPerBlock>>>(A, B, size, width, slices, D_threads, D_blocks);
        cudaCheck(cudaGetLastError(), "gpu_mergesort launch");
        cudaCheck(cudaDeviceSynchronize(), "gpu_mergesort execution");
        // Switch the input / output arrays instead of copying them around
        A = A == D_data ? D_swp : D_data;
        B = B == D_data ? D_swp : D_data;
    }
    // After the final swap, A points at the buffer holding the sorted list.
    cudaCheck(cudaMemcpy(data, A, size * sizeof(long), cudaMemcpyDeviceToHost),
              "cudaMemcpy result -> host");
    // Free ALL the GPU memory — the original leaked D_threads and D_blocks.
    cudaFree(D_data);
    cudaFree(D_swp);
    cudaFree(D_threads);
    cudaFree(D_blocks);
}
// GPU helper function: flatten the calling thread's 6-D (threadIdx, blockIdx)
// coordinates into a single linear id, using the launch dimensions passed in
// via device memory.
// NOTE(review): the multiplier chain scales blockIdx.y by blocks->z before
// scaling blockIdx.z by blocks->y, which looks swapped relative to the usual
// ordering; it is unobservable with the 1x1x1 grid this program launches —
// confirm before reusing with larger grids. Preserved as-is here.
__device__ unsigned int getIdx(dim3* threads, dim3* blocks) {
    unsigned int id = threadIdx.x;
    int scale = threads->x;
    id += threadIdx.y * scale;
    scale *= threads->y;
    id += threadIdx.z * scale;
    scale *= threads->z;
    id += blockIdx.x * scale;
    scale *= blocks->z;
    id += blockIdx.y * scale;
    scale *= blocks->y;
    id += blockIdx.z * scale;
    return id;
}
//
// Kernel: each thread performs up to `slices` consecutive width-sized merges
// on its own contiguous section of the data, reading runs from `source` and
// writing merged runs into `dest`. Launch geometry must match the dim3
// structs pointed to by `threads` / `blocks`.
//
__global__ void gpu_mergesort(long* source, long* dest, long size, long width, long slices, dim3* threads, dim3* blocks) {
    unsigned int idx = getIdx(threads, blocks);
    long start = width * idx * slices;
    // Stop early once this thread's section runs past the end of the array.
    for (long slice = 0; slice < slices && start < size; slice++) {
        // Clamp both run boundaries so the final partial run is still merged.
        long middle = min(start + (width >> 1), size);
        long end = min(start + width, size);
        gpu_bottomUpMerge(source, dest, start, middle, end);
        start += width;
    }
}
//
// Merge the two sorted runs source[start..middle) and source[middle..end)
// into dest[start..end). Called by gpu_mergesort() once per slice.
//
__device__ void gpu_bottomUpMerge(long* source, long* dest, long start, long middle, long end) {
    long left = start;    // cursor into the first run
    long right = middle;  // cursor into the second run
    long out = start;     // next write position in dest
    while (out < end) {
        // Take from the left run while it has elements and its head is
        // smaller (or the right run is exhausted).
        bool takeLeft = left < middle && (right >= end || source[left] < source[right]);
        if (takeLeft) {
            dest[out++] = source[left++];
        } else {
            dest[out++] = source[right++];
        }
    }
}