-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsingle_column_coalesced_seq.py
483 lines (389 loc) · 19.5 KB
/
single_column_coalesced_seq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
# (C) Copyright 2018- ECMWF.
# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
# In applying this licence, ECMWF does not waive the privileges and immunities
# granted to it by virtue of its status as an intergovernmental organisation
# nor does it submit to any jurisdiction.
from more_itertools import split_at
from loki.expression import symbols as sym
from loki.transform import resolve_associates
from loki import ir
from loki import (
Transformation, FindNodes, FindScopes, FindVariables,
FindExpressions, Transformer, NestedTransformer,
SubstituteExpressions, SymbolAttributes, BasicType, DerivedType,
pragmas_attached, CaseInsensitiveDict, as_tuple, flatten,
demote_variables
)
__all__ = ['SingleColumnCoalescedTransformationSeq']
def get_integer_variable(routine, name):
    """
    Look up a variable by name in a routine, or build an integer-typed one.

    Parameters
    ----------
    routine : :any:`Subroutine`
        The subroutine whose ``variable_map`` is searched.
    name : string
        Name of the variable to look up in the routine.

    Returns
    -------
    The entry stored under ``name`` in ``routine.variable_map`` if there
    is one; otherwise a new :any:`Variable` of type ``BasicType.INTEGER``
    created with ``routine`` as its scope.
    """
    known_variables = routine.variable_map
    if name not in known_variables:
        # Unknown name: create a fresh integer symbol scoped to the routine
        return sym.Variable(
            name=name, type=SymbolAttributes(BasicType.INTEGER), scope=routine
        )
    return known_variables[name]
def kernel_remove_vector_loops(routine, horizontal):
    """
    Strip all loops over the horizontal index, keeping their bodies in place.

    Parameters
    ----------
    routine : :any:`Subroutine`
        The subroutine from which the vector loops should be removed.
    horizontal : :any:`Dimension`
        The dimension specifying the horizontal vector dimension; loops
        whose variable equals ``horizontal.index`` are removed.
    """
    vector_loops = [
        loop for loop in FindNodes(ir.Loop).visit(routine.body)
        if loop.variable == horizontal.index
    ]

    # Mapping a loop onto its own body unwraps the loop
    unwrap_map = {loop: loop.body for loop in vector_loops}
    routine.body = Transformer(unwrap_map).visit(routine.body)
def kernel_get_locals_to_demote(routine, horizontal):
    """
    Collect the local arrays from which the horizontal dimension can be
    removed.

    A variable found in ``routine.body`` is selected if it is an array,
    its name is not the name of a routine argument, its first shape entry
    equals ``horizontal.size``, all its remaining shape entries are
    constants, and its name does not compare equal to any argument or
    keyword-argument value of a call statement in the routine body.

    Parameters
    ----------
    routine : :any:`Subroutine`
        The subroutine in which to look for demotable arrays.
    horizontal : :any:`Dimension`
        The dimension specifying the horizontal vector dimension.

    Returns
    -------
    set
        The selected array symbols.
    """
    dummy_names = [arg.name for arg in routine.arguments]

    def _is_constant(dim):
        """Tell whether a shape entry is a compile-time constant"""
        if isinstance(dim, sym.IntLiteral):
            return True
        if isinstance(dim, sym.RangeIndex):
            if not dim.lower:
                return _is_constant(dim.upper)
            return _is_constant(dim.lower) and _is_constant(dim.upper)
        # Scalars count as constant only when initialised with an integer literal
        return isinstance(dim, sym.Scalar) and isinstance(dim.initial, sym.IntLiteral)

    def _is_candidate(var):
        """Local array, horizontal as first dimension, constant remaining dimensions"""
        return (
            isinstance(var, sym.Array)
            and var.name not in dummy_names
            and var.shape and var.shape[0] == horizontal.size
            and all(_is_constant(dim) for dim in var.shape[1:])
        )

    # All local horizontal temporary arrays used anywhere in the body
    candidates = [
        var for var in FindVariables(unique=False).visit(routine.body)
        if _is_candidate(var)
    ]

    # NOTE: a per-section usage count (demote only temporaries used in a
    # single vector-level section) is not performed here; every candidate
    # that is not passed down the call tree is selected.

    # Gather everything that is handed to a call, positionally or by keyword
    calls = FindNodes(ir.CallStatement).visit(routine.body)
    passed_on = flatten(call.arguments for call in calls)
    passed_on += flatten(list(dict(call.kwarguments).values()) for call in calls)

    return {var for var in candidates if var.name not in passed_on}
def kernel_annotate_sequential_loops_openacc(routine, horizontal):
    """
    Attach an ``!$acc loop seq`` pragma to every loop that is not a
    horizontal vector loop.

    Loops that already carry a pragma whose content contains ``nodep``
    are left untouched.

    Parameters
    ----------
    routine : :any:`Subroutine`
        The subroutine in which to annotate sequential loops
    horizontal: :any:`Dimension`
        The dimension object specifying the horizontal vector dimension
    """
    with pragmas_attached(routine, ir.Loop):
        for loop in FindNodes(ir.Loop).visit(routine.body):
            # Loops explicitly marked with `!$loki/claw nodep` are skipped
            marked_nodep = loop.pragma and any(
                'nodep' in pragma.content.lower() for pragma in as_tuple(loop.pragma)
            )

            if not marked_nodep and loop.variable != horizontal.index:
                # Update the node in place so that nested loops do not need
                # to be replaced via a transformer
                loop._update(pragma=ir.Pragma(keyword='acc', content='loop seq'))
def resolve_masked_stmts(routine, loop_variable):
    """
    Rewrite each :any:`MaskedStatement` (WHERE statement) as an explicit
    :any:`Loop` wrapping a :any:`Conditional`.

    The loop bounds are taken from the range expressions found in the
    mask condition, and every occurrence of that range is replaced by
    ``loop_variable``.

    Parameters
    ----------
    routine : :any:`Subroutine`
        The subroutine in which to resolve masked statements
    loop_variable : :any:`Scalar`
        The induction variable for the created loops.
    """
    replacements = {}
    for where_stmt in FindNodes(ir.MaskedStatement).visit(routine.body):
        # TODO: Currently limited to simple, single-clause WHERE stmts
        assert len(where_stmt.conditions) == 1 and len(where_stmt.bodies) == 1
        condition = where_stmt.conditions[0]

        # All range expressions in the mask must exist and agree with each other
        ranges = [
            expr for expr in FindExpressions().visit(condition)
            if isinstance(expr, sym.RangeIndex)
        ]
        assert len(ranges) > 0
        reference = ranges[0]
        assert all(r == reference for r in ranges)

        branch = ir.Conditional(
            condition=condition, body=where_stmt.bodies[0], else_body=where_stmt.default
        )
        explicit_loop = ir.Loop(
            variable=loop_variable,
            bounds=sym.LoopRange((reference.start, reference.stop, reference.step)),
            body=branch
        )

        # Swap the ranges for the loop index throughout the new loop
        index_map = dict.fromkeys(ranges, loop_variable)
        replacements[where_stmt] = SubstituteExpressions(index_map).visit(explicit_loop)

    routine.body = Transformer(replacements).visit(routine.body)
def resolve_vector_dimension(routine, loop_variable, bounds):
    """
    Resolve vector notation for one dimension only, identified by the
    given loop variable and range bounds.

    Every assignment containing a range that compares equal to
    ``"<bounds[0]>:<bounds[1]>"`` is wrapped in a loop over
    ``loop_variable``, with each such range replaced by the loop variable.

    TODO: Consolidate this with the internal
    `loki.transform.transform_array_indexing.resolve_vector_notation`.

    Parameters
    ----------
    routine : :any:`Subroutine`
        The subroutine in which to resolve vector notation usage.
    loop_variable : :any:`Scalar`
        The induction variable for the created loops.
    bounds : tuple of :any:`Scalar`
        Tuple defining the iteration space of the inserted loops.
    """
    range_str = f'{bounds[0]}:{bounds[1]}'
    bound_symbols = (sym.Variable(name=bounds[0]), sym.Variable(name=bounds[1]))

    replacements = {}
    for assignment in FindNodes(ir.Assignment).visit(routine.body):
        matches = [
            expr for expr in FindExpressions().visit(assignment)
            if isinstance(expr, sym.RangeIndex) and expr == range_str
        ]
        if not matches:
            continue

        index_map = dict.fromkeys(matches, loop_variable)
        replacements[assignment] = ir.Loop(
            variable=loop_variable, bounds=sym.LoopRange(bound_symbols),
            body=(SubstituteExpressions(index_map).visit(assignment),)
        )

    routine.body = Transformer(replacements).visit(routine.body)
class SingleColumnCoalescedTransformationSeq(Transformation):
    """
    Single Column Coalesced (sequential kernels): Direct CPU-to-GPU
    transformation for block-indexed gridpoint routines.

    This transformation removes the individual CPU-style vectorization
    loops from "kernel" routines and adds the horizontal loop index as
    an ``intent(in)`` argument to each kernel instead. In "driver"
    routines every call to a target kernel is wrapped in a loop over the
    horizontal index, and that index is passed to the kernel as a
    keyword argument.

    Unlike the CLAW-targeting SCA extraction, this will leave the
    block-based array passing structure in place, but pass a
    thread-local array index into any "kernel" routines. The
    block-based argument passing should map well to coalesced memory
    accesses on GPUs.

    Note, this requires preprocessing with the
    :class:`DerivedTypeArgumentsTransformation`.

    Parameters
    ----------
    horizontal : :any:`Dimension`
        :any:`Dimension` object describing the variable conventions used in code
        to define the horizontal data dimension and iteration space.
    vertical : :any:`Dimension`
        :any:`Dimension` object describing the variable conventions used in code
        to define the vertical dimension. Stored on the instance; not read by
        the methods of this class.
    block_dim : :any:`Dimension`
        ``Dimension`` object defining the blocking dimension. Must be set
        for driver processing: :meth:`add_horizontal_loop_to_kernel_call`
        raises if it is missing.
    directive : string or None
        Directives flavour to use for parallelism annotations; either
        ``'openacc'`` or ``None``.
    demote_local_arrays : bool
        Default for whether local temporary arrays in kernels have their
        horizontal dimension removed; can be overridden per item via the
        ``demote_locals`` config entry.
    hoist_column_arrays : bool
        Flag for the "column array hoisting" optimization. Stored on the
        instance; not read by the methods of this class.
    """

    def __init__(self, horizontal, vertical=None, block_dim=None, directive=None,
                 demote_local_arrays=True, hoist_column_arrays=True):
        self.horizontal = horizontal
        self.vertical = vertical
        self.block_dim = block_dim

        assert directive in [None, 'openacc']
        self.directive = directive

        self.demote_local_arrays = demote_local_arrays
        self.hoist_column_arrays = hoist_column_arrays

    def transform_subroutine(self, routine, **kwargs):
        """
        Apply transformation to convert a :any:`Subroutine` to SCC format.

        Parameters
        ----------
        routine : :any:`Subroutine`
            Subroutine to apply this transformation to.
        role : string
            Role of the subroutine in the call tree; either
            ``"driver"`` or ``"kernel"``. Required keyword argument.
        item : optional
            If given, its ``config`` may provide ``demote_locals`` to
            override ``demote_local_arrays`` for this kernel.
        targets : list of strings
            Names of all kernel routines that are to be considered "active"
            in this call tree and should thus be processed accordingly.
        """
        role = kwargs['role']
        item = kwargs.get('item', None)
        targets = kwargs.get('targets', None)

        if role == 'driver':
            self.process_driver(routine, targets=targets)

        if role == 'kernel':
            demote_locals = self.demote_local_arrays
            if item:
                demote_locals = item.config.get('demote_locals', self.demote_local_arrays)
            self.process_kernel(routine, demote_locals=demote_locals)

    def process_kernel(self, routine, demote_locals=True):
        """
        Applies the SCC loop layout transformation to a "kernel"
        subroutine.

        This strips the horizontal vector loops, adds the horizontal
        loop index as an ``intent(in)`` argument, forwards that index as
        a keyword argument to every call in the routine, annotates the
        remaining loops with ``!$acc loop seq`` and appends an
        ``!$acc routine seq`` pragma to the routine spec.

        Argument arrays are left fully dimensioned.

        Routines whose body contains a ``routine seq`` pragma are left
        unchanged (the pragma is rewritten to ``!$acc routine seq`` for
        the ``'openacc'`` directive). Routines whose body contains a
        ``routine vector`` pragma are skipped for the ``'openacc'``
        directive.

        Parameters
        ----------
        routine : :any:`Subroutine`
            Subroutine to apply this transformation to.
        demote_locals : bool
            Whether to remove the horizontal dimension from eligible
            local arrays.

        Raises
        ------
        RuntimeError
            If either of the horizontal bounds variables is not found in
            the routine.
        """
        pragmas = FindNodes(ir.Pragma).visit(routine.body)
        routine_pragmas = [p for p in pragmas if p.keyword.lower() in ['loki', 'acc']]
        routine_pragmas = [p for p in routine_pragmas if 'routine' in p.content.lower()]

        seq_pragmas = [r for r in routine_pragmas if 'seq' in r.content.lower()]
        if seq_pragmas:
            if self.directive == 'openacc':
                # Mark routine as acc seq
                mapper = {seq_pragmas[0]: ir.Pragma(keyword='acc', content='routine seq')}
                routine.body = Transformer(mapper).visit(routine.body)

            # Bail and leave sequential routines unchanged
            return

        vec_pragmas = [r for r in routine_pragmas if 'vector' in r.content.lower()]
        if vec_pragmas:
            if self.directive == 'openacc':
                # Bail on routines that have already been marked and thus processed
                # TODO: This is a hack until we can avoid redundant re-application
                return

        if self.horizontal.bounds[0] not in routine.variable_map:
            raise RuntimeError(f'No horizontal start variable found in {routine.name}')
        if self.horizontal.bounds[1] not in routine.variable_map:
            raise RuntimeError(f'No horizontal end variable found in {routine.name}')

        # Find the iteration index variable for the specified horizontal
        v_index = get_integer_variable(routine, name=self.horizontal.index)

        # Associates at the highest level, so they don't interfere
        # with the sections we need to do for detecting subroutine calls
        resolve_associates(routine)

        # Resolve WHERE clauses
        resolve_masked_stmts(routine, loop_variable=v_index)

        # Resolve vector notation, eg. VARIABLE(KIDIA:KFDIA)
        resolve_vector_dimension(routine, loop_variable=v_index, bounds=self.horizontal.bounds)

        # Remove all vector loops over the specified dimension
        kernel_remove_vector_loops(routine, self.horizontal)

        # Demote all private local variables having only horizontal dimension
        if demote_locals:
            to_demote = kernel_get_locals_to_demote(routine, self.horizontal)
            variables = tuple(v.name for v in to_demote)
            if variables:
                demote_variables(routine, variable_names=variables, dimensions=self.horizontal.size)

        # Add loop index variable
        if v_index not in routine.arguments:
            new_v = v_index.clone(type=v_index.type.clone(intent='in'))
            # Remove original variable first, since we need to update declaration
            routine.variables = as_tuple(v for v in routine.variables if v != v_index)
            routine.arguments += as_tuple(new_v)

        call_map = {}
        for call in FindNodes(ir.CallStatement).visit(routine.body):
            # Append new loop variable to call signature as a keyword argument
            new_call = call.clone(arguments=call.arguments)
            new_call._update(kwarguments=new_call.kwarguments + ((self.horizontal.index, v_index),))
            call_map[call] = new_call
        routine.body = Transformer(call_map).visit(routine.body)

        # The two annotations below are applied irrespective of ``self.directive``

        # Mark all non-parallel loops as `!$acc loop seq`
        kernel_annotate_sequential_loops_openacc(routine, self.horizontal)

        # Mark routine as `!$acc routine seq` to make it device-callable
        routine.spec.append(ir.Pragma(keyword='acc', content='routine seq'))

    def process_driver(self, routine, targets=None):
        """
        Process the "driver" routine by annotating the outer level
        parallel loops and wrapping each target kernel call in a
        horizontal loop.

        Parameters
        ----------
        routine : :any:`Subroutine`
            Subroutine to apply this transformation to.
        targets : list or string
            List of subroutines that are to be considered as part of
            the transformation call tree. ``None`` selects no calls.
        """
        # Normalise so that ``None`` and a single name behave like an
        # empty and a one-element collection, respectively
        targets = as_tuple(targets)

        # Resolve associates, since the PGI compiler cannot deal with
        # implicit derived type component offload by calling device
        # routines.
        resolve_associates(routine)

        with pragmas_attached(routine, ir.Loop, attach_pragma_post=True):

            # add_horizontal_loop_to_kernel_call replaces the routine body,
            # which interferes with the traversal below when there are
            # several kernel calls. The calls are therefore collected first
            # and wrapped only after the traversal has finished.
            calls_to_hoist = []

            for call in FindNodes(ir.CallStatement).visit(routine.body):
                if call.name not in targets:
                    continue

                # Find the driver loop by checking the call's heritage
                ancestors = flatten(FindScopes(call).visit(routine.body))
                loops = [a for a in ancestors if isinstance(a, ir.Loop)]
                if not loops:
                    # Skip if there are no driver loops
                    continue
                loop = loops[0]

                # Mark driver loop as "gang parallel".
                if self.directive == 'openacc':
                    if loop.pragma is None:
                        loop._update(pragma=(ir.Pragma(
                            keyword='acc', content='parallel loop gang vector_length(32)'), ))
                        loop._update(pragma_post=(ir.Pragma(
                            keyword='acc', content='end parallel loop'), ))

                calls_to_hoist.append(call)

            # Wrap each collected kernel call in a horizontal loop
            for call in calls_to_hoist:
                self.add_horizontal_loop_to_kernel_call(routine, call)

    def add_horizontal_loop_to_kernel_call(self, routine, call):
        """
        Add a horizontal loop around a kernel call at the driver level,
        annotated with ``!$acc loop vector`` for the ``'openacc'``
        directive.

        Also passes the loop variable as an additional keyword argument
        in the kernel call.

        Parameters
        ----------
        routine : :any:`Subroutine`
            Subroutine to apply this transformation to.
        call : :any:`CallStatement`
            Call to the kernel subroutine that is to be wrapped.

        Raises
        ------
        RuntimeError
            If the target kernel is not attached to the call, or if no
            blocking dimension has been configured.
        """
        if call.not_active or call.routine is BasicType.DEFERRED:
            raise RuntimeError(
                '[Loki] SingleColumnCoalescedTransform: Target kernel is not attached '
                'to call in driver routine.'
            )

        if not self.block_dim:
            raise RuntimeError(
                '[Loki] SingleColumnCoalescedTransform: No blocking dimension found '
                'for column hoisting.'
            )

        kernel = call.routine

        # Find the iteration index variable for the specified horizontal
        # and declare it in the driver if it is not known there yet
        v_index = get_integer_variable(routine, name=self.horizontal.index)
        if v_index.name not in routine.variable_map:
            routine.variables += as_tuple(v_index)

        # Append new loop variable to call signature as a keyword argument
        new_call = call.clone(arguments=call.arguments)
        new_call._update(kwarguments=new_call.kwarguments + ((self.horizontal.index, v_index),))

        # Only attach a pragma when there is one, so that the loop never
        # carries a ``(None,)`` pragma tuple
        pragma = None
        if self.directive == 'openacc':
            pragma = (ir.Pragma(keyword='acc', content='loop vector'),)

        # Create a vector loop around the kernel invocation
        # NOTE(review): the bounds are the kernel's own symbols; this presumably
        # relies on the driver using the same variable names -- confirm.
        v_start = kernel.variable_map[self.horizontal.bounds[0]]
        v_end = kernel.variable_map[self.horizontal.bounds[1]]
        bounds = sym.LoopRange((v_start, v_end))
        vector_loop = ir.Loop(variable=v_index, bounds=bounds, body=[new_call], pragma=pragma)

        routine.body = Transformer({call: vector_loop}).visit(routine.body)