-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbmm.cu
70 lines (56 loc) · 1.72 KB
/
bmm.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
//ONLY MODIFY THIS FILE!
//YOU CAN MODIFY EVERYTHING IN THIS FILE!
#include "bmm.h"
// Short aliases for the components of the CUDA built-in thread index
// (position of a thread inside its block) ...
#define tx threadIdx.x
#define ty threadIdx.y
#define tz threadIdx.z
// ... and of the built-in block index (position of a block inside the grid).
#define bx blockIdx.x
#define by blockIdx.y
#define bz blockIdx.z
// TILEX and TILEY are used to set the number of threads in a CUDA block
// (TILEX threads along x and TILEY threads along y, see getDimBlock).
#define TILEX 32
#define TILEY 16
// TILEMAX is the larger and TILEMIN the smaller of TILEX and TILEY.
// The kernel uses TILEMAX as the depth of each shared-memory tile; its load
// loops run TILEMAX / TILEX and TILEMAX / TILEY times, so TILEMAX has to be an
// exact multiple of TILEMIN for the tiles to be filled completely.
#if (TILEX > TILEY)
#define TILEMAX TILEX
#define TILEMIN TILEY
#else
#define TILEMAX TILEY
#define TILEMIN TILEX
#endif // (TILEX > TILEY)
// you may define other parameters here!
// you may define other macros here!
// you may define other functions here!
// Returns the grid dimensions used to launch kernelFunc.
//
// The grid has one block per TILEX x TILEY tile of the n x n output. The
// division rounds up so that a trailing partial tile (n not a multiple of the
// tile size, or n smaller than one tile) still gets a block; kernelFunc masks
// the threads of such a block that fall outside the matrix.
//
// m is not used here; it is kept so the signature stays unchanged.
dim3 getDimGrid(const int m, const int n) {
	dim3 dimGrid((n + TILEX - 1) / TILEX, (n + TILEY - 1) / TILEY);
	return dimGrid;
}

// Returns the block dimensions used to launch kernelFunc: TILEX threads along
// x (output columns) and TILEY threads along y (output rows).
//
// Neither m nor n is used; they are kept so the signature stays unchanged.
dim3 getDimBlock(const int m, const int n) {
	dim3 dimBlock(TILEX, TILEY);
	return dimBlock;
}

// Tiled matrix multiplication: each thread accumulates one element of the
// output, cd(row, col) = sum over k of ad(row, k) * bd(k, col), k in [0, n).
//
// Launch layout: blocks of TILEX x TILEY threads (getDimBlock) on a grid that
// covers an n x n output (getDimGrid). Static shared memory:
// (TILEY + TILEX) * TILEMAX floats per block.
//
// ad, bd : input matrices, cd : output matrix. All three are accessed through
//          mem2d(ptr, m, row, col), which comes from bmm.h.
//          NOTE(review): mem2d is not visible in this file; m is presumably
//          its row stride / shift parameter - confirm against bmm.h.
// n      : extent of the rows, the columns and the reduction dimension.
//
// Precondition: TILEMAX is a multiple of both TILEX and TILEY (true for the
// 32 x 16 configuration), otherwise the load loops leave part of a tile
// unwritten.
__global__ void kernelFunc(float* ad, float* bd, float* cd, const int m, const int n)
{
	__shared__ float As[TILEY][TILEMAX]; // rows [by*TILEY, (by+1)*TILEY) of A, current k-slice
	__shared__ float Bs[TILEMAX][TILEX]; // current k-slice of B, columns [bx*TILEX, (bx+1)*TILEX)

	// Output element owned by this thread.
	const int row = by * TILEY + ty;
	const int col = bx * TILEX + tx;

	// Round up so the tail of the reduction is not dropped when n is not a
	// multiple of TILEMAX.
	const int numTiles = (n + TILEMAX - 1) / TILEMAX;

	float temp = 0.0f;
	for (int index = 0; index < numTiles; index++)
	{
		const int kBase = index * TILEMAX;

		// Each thread loads TILEMAX / TILEX elements of the A tile. Elements
		// outside the matrix are replaced by 0 so they add nothing to the sum.
		for (int asIndex = 0; asIndex < (TILEMAX / TILEX); asIndex++)
		{
			const int kLocal = TILEX * asIndex + tx;
			const int kGlobal = kBase + kLocal;
			As[ty][kLocal] = (row < n && kGlobal < n) ? mem2d(ad, m, row, kGlobal) : 0.0f;
		}

		// Each thread loads TILEMAX / TILEY elements of the B tile, with the
		// same zero padding outside the matrix.
		for (int bsIndex = 0; bsIndex < (TILEMAX / TILEY); bsIndex++)
		{
			const int kLocal = TILEY * bsIndex + ty;
			const int kGlobal = kBase + kLocal;
			Bs[kLocal][tx] = (kGlobal < n && col < n) ? mem2d(bd, m, kGlobal, col) : 0.0f;
		}

		// Both tiles must be completely written before any thread reads them.
		__syncthreads();

		for (int k = 0; k < TILEMAX; k++)
		{
			temp += As[ty][k] * Bs[k][tx];
		}

		// All threads must be done reading before the next iteration
		// overwrites the tiles. Both barriers are reached by every thread of
		// the block: the bounds checks only mask loads and the final store.
		__syncthreads();
	}

	// Threads of a partial edge tile that lie outside the matrix store nothing.
	if (row < n && col < n)
	{
		mem2d(cd, m, row, col) = temp;
	}
}