gen_gpu.cu
#include "gen_gpu.h"
// ******************** General Mat-Vec Kernels ********************
// Computes y = A * x, where A is an m x n matrix stored in column-major
// order. One thread handles one row of A, i.e. one entry of y.
__global__ void gen_matvec(float *A, float *x, float *y, const int m, const int n)
{
  unsigned int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
  if (xIndex < m) {
    float c = 0.0f;
    for (int i = 0; i < n; i++)
      c += x[i] * A[xIndex + m * i];  // A(xIndex, i) in column-major layout
    y[xIndex] = c;
  }
}
// Computes x = A^T * y for the same column-major m x n matrix A.
// One thread handles one column of A, i.e. one entry of x.
__global__ void gen_matvecT(float *A, float *x, float *y, const int m, const int n)
{
  unsigned int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
  if (xIndex < n) {
    float c = 0.0f;
    for (int i = 0; i < m; i++)
      c += y[i] * A[xIndex * m + i];  // A(i, xIndex) in column-major layout
    x[xIndex] = c;
  }
}
/*
 **************************************
 ** The matrix-vector multiplication **
 **************************************
 */
void A_gen(float *out, float *in, float *A, const int m, const int n, dim3 numBlocksm, dim3 threadsPerBlockm)
{
  // perform the multiplication: out = A * in
  gen_matvec <<< numBlocksm, threadsPerBlockm >>>(A, in, out, m, n);
  cudaDeviceSynchronize();  // cudaThreadSynchronize() is deprecated
  return;
}
/*
 ************************************************
 ** The transpose matrix-vector multiplication **
 ************************************************
 */
void AT_gen(float *out, float *in, float *A, const int m, const int n, dim3 numBlocks, dim3 threadsPerBlock)
{
  // perform the transpose multiplication: out = A^T * in
  gen_matvecT <<< numBlocks, threadsPerBlock >>>(A, out, in, m, n);
  cudaDeviceSynchronize();  // cudaThreadSynchronize() is deprecated
  return;
}
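// ---------------------------------------------------------------------
// Example host-side driver (illustrative sketch, not part of the
// original file): it allocates device buffers, derives a 1-D launch
// configuration large enough for both kernels, and calls the wrappers
// above. The buffer names, the 256-thread block size, and the function
// name example_usage are assumptions; error checking is omitted.
// ---------------------------------------------------------------------
void example_usage(const float *hA, const float *hx, float *hy,
                   const int m, const int n)
{
  float *dA, *dx, *dy;
  cudaMalloc((void**)&dA, (size_t)m * n * sizeof(float));
  cudaMalloc((void**)&dx, (size_t)n * sizeof(float));
  cudaMalloc((void**)&dy, (size_t)m * sizeof(float));

  cudaMemcpy(dA, hA, (size_t)m * n * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(dx, hx, (size_t)n * sizeof(float), cudaMemcpyHostToDevice);

  // gen_matvec needs one thread per row (m) and gen_matvecT one thread
  // per column (n), so size the grid for the larger of the two.
  const int threads = 256;
  const int work = (m > n) ? m : n;
  dim3 threadsPerBlock(threads);
  dim3 numBlocks((work + threads - 1) / threads);

  A_gen(dy, dx, dA, m, n, numBlocks, threadsPerBlock);   // dy = A   * dx
  AT_gen(dx, dy, dA, m, n, numBlocks, threadsPerBlock);  // dx = A^T * dy

  cudaMemcpy(hy, dy, (size_t)m * sizeof(float), cudaMemcpyDeviceToHost);

  cudaFree(dA);
  cudaFree(dx);
  cudaFree(dy);
}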