-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsu3_nn_bench.cpp
300 lines (273 loc) · 8.24 KB
/
su3_nn_bench.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/resource.h>
#include <math.h>
#include <vector>
#include <iostream>
#include <string>
#include <cassert>
#include <cmath>
#include <complex>
#include <chrono>
typedef std::chrono::system_clock Clock;
#ifdef USE_OPENMP
#include <omp.h>
#endif
#ifndef ITERATIONS
# define ITERATIONS 100
#endif
#ifndef LDIM
# define LDIM 32 // Lattice size = LDIM^4
#endif
#ifndef PRECISION
# define PRECISION 2 // 1->single, 2->double
#endif
// Global variables
unsigned int verbose=1;
size_t warmups=1;
// global argc and argv for parsing model specific parameters
int g_argc;
char **g_argv;
#include "lattice.hpp"
// validation function used by main()
template<class T>
bool almost_equal(T x, T y, double tol)
{
if (std::isnan(x) || std::isnan(y))
return (0);
return std::abs( x - y ) < tol ;
}
// std::isnan() lacks complex support, so need a complex template
template<class T>
bool almost_equal(std::complex<T> x, std::complex<T> y, double tol)
{
if (std::isnan(x.real()) || std::isnan(x.imag())
|| std::isnan(y.real()) || std::isnan(y.imag()) )
return (0);
return std::abs( x - y ) < tol ;
}
// Thrust version
#ifdef USE_THRUST
template<class T>
bool almost_equal(thrust::complex<T> x, thrust::complex<T> y, double tol)
{
if (std::isnan(x.real()) || std::isnan(x.imag())
|| std::isnan(y.real()) || std::isnan(y.imag()) )
return (0);
return thrust::abs( x - y ) < tol ;
}
#endif
// Kokkos version
#if defined(USE_KOKKOS)
template <class T>
bool almost_equal(Kokkos::complex<T> x, Kokkos::complex<T> y, double tol) {
if (std::isnan(x.real()) || std::isnan(x.imag()) || std::isnan(y.real()) ||
std::isnan(y.imag()))
return (0);
return Kokkos::abs(x - y) < tol;
}
#endif
#ifdef RANDOM_INIT
#include <random>
std::random_device rd; //Will be used to obtain a seed for the random number engine
std::mt19937 gen(rd()); //Standard mersenne_twister_engine seeded with rd()
std::uniform_real_distribution<> dis(-1.f, 1.f);
#endif
// initializes su3_matrix to a given value
void init_link(su3_matrix *s, Complx val) {
for(int j=0; j<4; ++j) for(int k=0; k<3; ++k) for(int l=0; l<3; ++l) {
#ifndef RANDOM_INIT
s[j].e[k][l] = val;
#elif !defined MILC_COMPLEX
s[j].e[k][l]=dis(gen);
#else
s[j].e[k][l].real=dis(gen);
s[j].e[k][l].imag=dis(gen);
#endif
}
}
// initializes a lattice site
void make_lattice(site *s, size_t n, Complx val) {
int nx=n;
int ny=n;
int nz=n;
int nt=n;
#pragma omp parallel for
for(int t=0;t<nt;t++) {
int i=t*nz*ny*nx;
for(int z=0;z<nz;z++)for(int y=0;y<ny;y++)for(int x=0;x<nx;x++,i++){
s[i].x=x; s[i].y=y; s[i].z=z; s[i].t=t;
s[i].index = x+nx*(y+ny*(z+nz*t));
if( (x+y+z+t)%2 == 0)
s[i].parity=EVEN;
else
s[i].parity=ODD;
init_link(&s[i].link[0], val);
}
}
}
// Include the programming model specific function for su3_mat_nn()
#ifdef USE_CUDA
#include "mat_nn_cuda.hpp"
#elif USE_OPENMP
#include "mat_nn_openmp.hpp"
#elif USE_OPENMP_CPU
#include "mat_nn_openmp2.hpp"
#elif USE_OPENACC
#include "mat_nn_openacc.hpp"
#elif USE_OPENCL
// OpenCL 1.2 doesn't support complex data types
#define MILC_COMPLEX
#include "mat_nn_opencl.hpp"
#elif USE_SYCL
#include "mat_nn_sycl.hpp"
#elif USE_DPCPP
#include "mat_nn_dpcpp.hpp"
#elif USE_HIP
#include "mat_nn_hip.hpp"
#elif USE_KOKKOS
#include "mat_nn_kokkos.hpp"
#elif USE_RAJA
#include "mat_nn_raja.hpp"
#else
#error Unknown programming model
#endif
// Main
int main(int argc, char **argv)
{
Profile profile;
size_t iterations = ITERATIONS;
size_t ldim = LDIM;
size_t threads_per_group = 128; // nominally works well across implementations
#ifdef USE_DPCPP
int device = 0; // DPCPP seg faults when device not provided
#else
int device = -1; // Let implementation choose the device
#endif
std::string csv_filename = "";
int opt;
g_argc = argc;
g_argv = argv;
// parse command line for parameters
// the options list must include flags used by the various
// su3_mat_nn() implementations internally,
// as getopt rearrages the order of arguments and
// can screw things up for unknown options
while ((opt=getopt(argc, argv, ":hi:l:t:v:d:w:n:c:")) != -1) {
switch (opt) {
case 'i':
iterations = atoi(optarg);
break;
case 'l':
ldim = atoi(optarg);
break;
case 't':
threads_per_group = atoi(optarg);
break;
case 'v':
verbose = atoi(optarg);
break;
case 'd':
device = atoi(optarg);
break;
case 'w':
warmups = atoi(optarg);
break;
case 'c':
csv_filename = optarg;
break;
case 'h':
fprintf(stderr, "Usage: %s [-i iterations] [-l lattice dimension] \
[-t threads per workgroup] [-d device] [-v verbosity level [0,1,2,3]] [-w warmups] [-c csv-file]\n", argv[0]);
exit (EXIT_SUCCESS);
}
}
// allocate and initialize the working lattices and B su3 matrices
size_t total_sites = ldim*ldim*ldim*ldim;
#ifdef USE_KOKKOS
Kokkos::ScopeGuard scope(argc, argv);
printf("Kokkos::ExecutionSpace = %s\n", typeid(ExecSpace).name());
h_site_view a("a", total_sites);
h_site_view c("c", total_sites);
h_su3_matrix_view b("b", 4);
#else
std::vector<site> a(total_sites);
std::vector<su3_matrix> b(4);
std::vector<site> c(total_sites);
#endif
#ifdef USE_OPENMP_CPU
first_touch(a.data(), b.data(), c.data(), total_sites);
#endif
// initialize the lattices
make_lattice(a.data(), ldim, Complx{1.0,0.0});
init_link(b.data(), Complx{1.0/3.0,0.0});
if (verbose >= 1) {
printf("Number of sites = %zu^4\n", ldim);
printf("Executing %zu iterations with %zu warmups\n", iterations, warmups);
}
// benchmark call
const double ttotal = su3_mat_nn(a, b, c, total_sites, iterations, threads_per_group, device, &profile);
if (verbose >= 1) {
printf("Total execution time = %f secs\n", ttotal);
printf("host_to_device_ms,kernel_ms,device_to_host_ms,num_iterations,num_warmups\n");
printf("%f,%f,%f,%lu,%lu\n",
profile.host_to_device_time*1000,
profile.kernel_time*1000,
profile.device_to_host_time*1000,
iterations,
warmups);
}
if (csv_filename != "") {
FILE* output = fopen(csv_filename.c_str(), "w");
fprintf(output, "host_to_device_ms,kernel_ms,device_to_host_ms,num_iterations,num_warmups\n");
fprintf(output, "%f,%f,%f,%lu,%lu\n",
profile.host_to_device_time*1000,
profile.kernel_time*1000,
profile.device_to_host_time*1000,
iterations,
warmups);
fclose(output);
}
// calculate flops/s, etc.
// each matrix multiply is (3*3)*4*(12 mult + 12 add) = 4*(108 mult + 108 add) = 4*216 ops
const double tflop = (double)total_sites * 864.0;
printf("Total GFLOP/s = %.3f\n", iterations * tflop / ttotal / 1.0e9);
const double memory_usage = (double)sizeof(site) * (a.size() + c.size()) + sizeof(su3_matrix) * b.size();
printf("Total GByte/s (GPU memory) = %.3f\n", iterations * memory_usage / ttotal / 1.0e9);
fflush(stdout);
// Verification of the result
bool result = true;
for (size_t i=0;i<total_sites;++i) for(int j=0;j<4;++j) for(int k=0;k<3;++k) for(int l=0;l<3;++l) {
Complx cc = {0.0, 0.0};
for(int m=0;m<3;m++) {
#ifdef MILC_COMPLEX
CMULSUM( a[i].link[j].e[k][m], b[j].e[m][l], cc)
#elif USE_KOKKOS
cc += a(i).link[j].e[k][m] * b[j].e[m][l];
#else
cc += a[i].link[j].e[k][m] * b[j].e[m][l];
#endif
}
#ifdef MILC_COMPLEX
result = almost_equal(c[i].link[j].e[k][l].real, cc.real, 1E-6);
result = almost_equal(c[i].link[j].e[k][l].imag, cc.imag, 1E-6);
#elif USE_KOKKOS
result = almost_equal(c(i).link[j].e[k][l], cc, 1E-6);
#else
result = almost_equal(c[i].link[j].e[k][l], cc, 1E-6);
#endif
if (!result) {
fprintf(stderr, "Verification Failed!\n");
return EXIT_FAILURE;
}
}
// check memory usage
if (verbose >= 2) {
printf("Total allocation for matrices = %.3f MiB\n", memory_usage / 1048576.0);
struct rusage usage;
if (getrusage(RUSAGE_SELF, &usage) == 0)
printf("Approximate memory usage = %.3f MiB\n", (float)usage.ru_maxrss/1024.0);
}
return EXIT_SUCCESS;
}