-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmat_nn_dpcpp.hpp
119 lines (101 loc) · 4.24 KB
/
mat_nn_dpcpp.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
// Intel DPCPP implementation
#include <sycl/sycl.hpp>
#define THREADS_PER_SITE 36
// Sycl requires that kernels be named
class k_mat_nn;
double su3_mat_nn(const std::vector<site> &a, const std::vector<su3_matrix> &b, std::vector<site> &c,
const size_t total_sites, const size_t iterations, size_t wgsize, const int target, Profile* profile)
{
// build a list of devices
std::vector<sycl::platform> platforms = sycl::platform::get_platforms();
std::vector<sycl::device> devices;
for (size_t i=0, d=0; i < platforms.size(); ++i) {
std::vector<sycl::device> pdevices = platforms[i].get_devices();
for (size_t j=0; j < pdevices.size(); ++j, ++d) {
devices.insert(devices.end(), pdevices[j]);
if (verbose >= 3)
std::cout << "Appending device " << d << ": " << pdevices[j].get_info<sycl::info::device::name>() << std::endl;
}
}
sycl::queue queue{sycl::device::get_devices()[target]};
if (verbose >= 2)
std::cout << "Using device: " << queue.get_device().get_info<sycl::info::device::name>() << "\n";
// FYI, look at device maximums
if (verbose >= 3) {
std::cout << "max compute units = "
<< queue.get_device().get_info<sycl::info::device::max_compute_units>() << "\n";
std::cout << "max workgroup size = "
<< queue.get_device().get_info<sycl::info::device::max_work_group_size>() << "\n";
}
// check to make sure the workgroup size is sufficient for the algorithm
if (wgsize == 0)
wgsize = THREADS_PER_SITE;
// set the total number of work items
size_t total_wi = total_sites * THREADS_PER_SITE;
if (verbose >= 3) {
std::cout << "Setting number of work items " << total_wi << std::endl;
std::cout << "Workgroup size is " << wgsize << std::endl;
}
std::cout << std::flush;
auto tprofiling = Clock::now();
// allocate device memory
site* d_a = (site*) malloc_device(total_sites * sizeof(site), queue);
su3_matrix* d_b = (su3_matrix*) malloc_device(4 * sizeof(su3_matrix), queue);
site* d_c = (site*) malloc_device(total_sites * sizeof(site), queue);
if (d_a == NULL || d_b == NULL || d_c == NULL) {
std::cout << "Unable to allocate device memory " << std::endl;
exit(1);
}
// Move host side memory to device allocated buffers
queue.memcpy(d_a, a.data(), a.size() * sizeof(site));
queue.memcpy(d_b, b.data(), b.size() * sizeof(su3_matrix));
queue.wait();
profile->host_to_device_time = (std::chrono::duration_cast<std::chrono::microseconds>(Clock::now()-tprofiling).count())/1.0e6;
// benchmark loop
auto tstart = Clock::now();
tprofiling = tstart;
for (size_t iters=0; iters<iterations+warmups; ++iters) {
if (iters == warmups) {
queue.wait();
tstart = Clock::now();
tprofiling = tstart;
}
// create a command_group to issue commands
queue.submit([&](sycl::handler& cgh) {
// Lambda function defines the kernel scope
cgh.parallel_for<class k_mat_nn>(
sycl::nd_range<1> {total_wi, wgsize}, [=](sycl::nd_item<1> item) {
size_t myThread = item.get_global_id(0);
size_t mySite = myThread/36;
if (mySite < total_sites) {
int j = (myThread%36)/9;
int k = (myThread%9)/3;
int l = myThread%3;
Complx cc = {0.0, 0.0};
for (int m=0;m<3;m++) {
const auto aa = d_a[mySite].link[j].e[k][m];
const auto bb = d_b[j].e[m][l];
#ifndef MILC_COMPLEX
cc += aa * bb;
#else
CMULSUM(aa, bb, cc);
#endif
}
d_c[mySite].link[j].e[k][l] = cc;
}
}); // end of the kernel lambda function
}); // end of command group
queue.wait();
} // end of iteration loop
double ttotal = std::chrono::duration_cast<std::chrono::microseconds>(Clock::now()-tstart).count();
profile->kernel_time = (std::chrono::duration_cast<std::chrono::microseconds>(Clock::now()-tprofiling).count())/1.0e6;
// Move the result back to the host side vector
tprofiling = Clock::now();
queue.memcpy(c.data(), d_c, c.size() * sizeof(site));
queue.wait();
profile->device_to_host_time= (std::chrono::duration_cast<std::chrono::microseconds>(Clock::now()-tprofiling).count())/1.0e6;
free(d_a, queue);
free(d_b, queue);
free(d_c, queue);
return (ttotal /= 1.0e6);
} // end of SYCL block