From 6e3e42feefee092b698e730983435b140444d3df Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Mon, 20 Jun 2022 21:11:03 +0300 Subject: [PATCH 01/28] deprecate DeviceDataRecord --- boxtree/tools.py | 88 +++++++++++++++++++++++++++++++----------------- 1 file changed, 57 insertions(+), 31 deletions(-) diff --git a/boxtree/tools.py b/boxtree/tools.py index 244c71eb..0af213d6 100644 --- a/boxtree/tools.py +++ b/boxtree/tools.py @@ -29,7 +29,6 @@ import pyopencl as cl import pyopencl.array -import pyopencl.cltypes as cltypes from pyopencl.tools import ScalarArg, VectorArg as _VectorArg, dtype_to_c_struct from pytools import Record, memoize_method from pytools.obj_array import make_obj_array @@ -68,7 +67,7 @@ def realloc_array(queue, allocator, new_shape, ary, zero_fill=False, wait_for=No def reverse_index_array(indices, target_size=None, result_fill_value=None, queue=None): """For an array of *indices*, return a new array *result* that satisfies - ``result[indices] == arange(len(indices)) + ``result[indices] == arange(len(indices))`` :arg target_n: The length of the result, or *None* if the result is to have the same length as *indices*. @@ -280,18 +279,17 @@ def particle_array_to_host(parray): # {{{ host/device data storage class DeviceDataRecord(Record): - """A record of array-type data. Some of this data may live in - :class:`pyopencl.array.Array` objects. :meth:`get` can then be - called to convert all these device arrays into :mod:`numpy.ndarray` - instances on the host. + """A record of array-type data. + + Some of this data may live in :class:`pyopencl.array.Array` objects. + :meth:`get` can then be called to convert all these device arrays into + :mod:`numpy.ndarray` instances on the host. 
""" def _transform_arrays(self, f, exclude_fields=frozenset()): - result = {} - def transform_val(val): from pyopencl.algorithm import BuiltList - if isinstance(val, np.ndarray) and val.dtype == object: + if isinstance(val, np.ndarray) and val.dtype.char == "O": from pytools.obj_array import obj_array_vectorize return obj_array_vectorize(f, val) elif isinstance(val, list): @@ -305,7 +303,17 @@ def transform_val(val): else: return f(val) - for field_name in self.__class__.fields: + from dataclasses import fields, is_dataclass + + if is_dataclass(self): + fields = [f.name for f in fields(self)] + elif isinstance(self, Record): + fields = self.__class__.fields + else: + raise TypeError(f"unknown record type: '{type(self).__name__}'") + + result = {} + for field_name in fields: if field_name in exclude_fields: continue @@ -319,50 +327,61 @@ def transform_val(val): return self.copy(**result) def get(self, queue, **kwargs): - """Return a copy of `self` in which all data lives on the host, i.e. - all :class:`pyopencl.array.Array` and `ImmutableHostDeviceArray` objects are - replaced by corresponding :class:`numpy.ndarray` instances on the host. """ + :returns: a copy of *self* in which all data lives on the host, i.e. + all :class:`pyopencl.array.Array` and `ImmutableHostDeviceArray` + objects are replaced by corresponding :class:`numpy.ndarray` + instances on the host. + """ + from warnings import warn + warn(f"{type(self).__name__}.get is deprecated and will be removed " + "in 2025. 
Switch to using arraycontext.to_numpy instead.", + DeprecationWarning, stacklevel=2) + def try_get(attr): if isinstance(attr, ImmutableHostDeviceArray): return attr.host try: - get_meth = attr.get + return attr.get(queue=queue, **kwargs) except AttributeError: return attr - return get_meth(queue=queue, **kwargs) - return self._transform_arrays(try_get) def with_queue(self, queue): - """Return a copy of `self` in - all :class:`pyopencl.array.Array` objects are assigned to - :class:`pyopencl.CommandQueue` *queue*. """ + :returns: a copy of *self* in which all :class:`pyopencl.array.Array` objects + are assigned to the :class:`pyopencl.CommandQueue` *queue*. + """ + from warnings import warn + warn(f"{type(self).__name__}.with_queue is deprecated and will be removed " + "in 2025. Switch to using arraycontext.with_array_context instead.", + DeprecationWarning, stacklevel=2) def try_with_queue(attr): if isinstance(attr, cl.array.Array): attr.finish() try: - wq_meth = attr.with_queue + return attr.with_queue(queue) except AttributeError: return attr - ary = wq_meth(queue) - return ary - return self._transform_arrays(try_with_queue) def to_device(self, queue, exclude_fields=frozenset()): - """Return a copy of `self` in all :class:`numpy.ndarray` arrays are - transferred to device memory as :class:`pyopencl.array.Array` objects. + """ + :arg exclude_fields: a :class:`frozenset` containing fields excluded + from transferring to the device memory. - :arg exclude_fields: a :class:`frozenset` containing fields excluding from - transferring to the device memory. + :returns: a copy of *self* in which all :class:`numpy.ndarray` arrays are + transferred to device memory as :class:`pyopencl.array.Array` objects. """ + from warnings import warn + warn(f"{type(self).__name__}.to_device is deprecated and will be removed " + "in 2025. 
Switch to using arraycontext.from_numpy instead.", + DeprecationWarning, stacklevel=2) def _to_device(attr): if isinstance(attr, np.ndarray): @@ -377,12 +396,18 @@ def _to_device(attr): return self._transform_arrays(_to_device, exclude_fields=exclude_fields) def to_host_device_array(self, queue, exclude_fields=frozenset()): - """Return a copy of `self` where all device and host arrays are transformed - to `ImmutableHostDeviceArray` objects. + """ + :arg exclude_fields: a :class:`frozenset` containing fields excluded + from transformed to `ImmutableHostDeviceArray`. - :arg exclude_fields: a :class:`frozenset` containing fields excluding from - transformed to `ImmutableHostDeviceArray`. + :returns: a copy of *self* where all device and host arrays are + transformed to `ImmutableHostDeviceArray` objects. """ + from warnings import warn + warn(f"{type(self).__name__}.to_host_device_array is deprecated and will " + "be removed in 2025. Switch from ImmutableHostDeviceArray.", + DeprecationWarning, stacklevel=2) + def _to_host_device_array(attr): if isinstance(attr, np.ndarray | cl.array.Array): return ImmutableHostDeviceArray(queue, attr) @@ -930,6 +955,7 @@ def device(self): def get_coord_vec_dtype( coord_dtype: np.dtype, dimensions: int) -> np.dtype: + import pyopencl.cltypes as cltypes if dimensions == 1: return coord_dtype else: From be48b2219dc54e0ef7c0d49966dbf97939131dc7 Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Mon, 20 Jun 2022 21:42:34 +0300 Subject: [PATCH 02/28] port translation_classes to arraycontext --- boxtree/translation_classes.py | 107 ++++++++++++++++++--------------- 1 file changed, 58 insertions(+), 49 deletions(-) diff --git a/boxtree/translation_classes.py b/boxtree/translation_classes.py index 21035f04..073a4a9c 100644 --- a/boxtree/translation_classes.py +++ b/boxtree/translation_classes.py @@ -33,24 +33,23 @@ """ import logging +from dataclasses import dataclass from functools import partial import numpy as np +from arraycontext import 
Array from mako.template import Template -import pyopencl as cl -import pyopencl.array -import pyopencl.cltypes -from pyopencl.elementwise import ElementwiseTemplate -from pytools import Record, memoize_method +from pyopencl.elementwise import ElementwiseKernel, ElementwiseTemplate +from pytools import memoize_method +from boxtree.array_context import PyOpenCLArrayContext, dataclass_array_container from boxtree.tools import ( - DeviceDataRecord, InlineBinarySearch, coord_vec_subscript_code, get_coord_vec_dtype, ) -from boxtree.traversal import TRAVERSAL_PREAMBLE_MAKO_DEFS +from boxtree.traversal import TRAVERSAL_PREAMBLE_MAKO_DEFS, FMMTraversalInfo logger = logging.getLogger(__name__) @@ -184,11 +183,14 @@ """) -class _KernelInfo(Record): - pass +@dataclass(frozen=True) +class _KernelInfo: + translation_class_finder: ElementwiseKernel -class TranslationClassesInfo(DeviceDataRecord): +@dataclass_array_container +@dataclass(frozen=True) +class TranslationClassesInfo: r"""Interaction lists to help with for translations that benefit from precomputing distance related values @@ -225,13 +227,10 @@ class id for that level. Translation classes are numbered contiguously traversal that these translation classes refer to. """ - def __init__(self, traversal, **kwargs): - super().__init__(**kwargs) - self.traversal = traversal - - def copy(self, **kwargs): - traversal = kwargs.pop("traversal", self.traversal) - return self.__class__(traversal=traversal, **self.get_copy_kwargs(**kwargs)) + traversal: FMMTraversalInfo + from_sep_siblings_translation_classes: Array + from_sep_siblings_translation_class_to_distance_vector: Array + from_sep_siblings_translation_classes_level_starts: Array @property def nfrom_sep_siblings_translation_classes(self): @@ -245,12 +244,21 @@ class TranslationClassesBuilder: .. 
automethod:: __call__ """ - def __init__(self, context): - self.context = context + def __init__(self, array_context: PyOpenCLArrayContext) -> None: + self._setup_actx = array_context + + @property + def context(self): + return self._setup_actx.queue.context @memoize_method - def get_kernel_info(self, dimensions, well_sep_is_n_away, - box_id_dtype, box_level_dtype, coord_dtype, translation_class_per_level): + def get_kernel_info(self, + dimensions: int, + well_sep_is_n_away: int, + box_id_dtype: np.dtype, + box_level_dtype: np.dtype, + coord_dtype: np.dtype, + translation_class_per_level) -> None: coord_vec_dtype = get_coord_vec_dtype(coord_dtype, dimensions) int_coord_vec_dtype = get_coord_vec_dtype(np.dtype(np.int32), dimensions) @@ -287,11 +295,13 @@ def get_kernel_info(self, dimensions, well_sep_is_n_away, return _KernelInfo(translation_class_finder=translation_class_finder) @staticmethod - def ntranslation_classes_per_level(well_sep_is_n_away, dimensions): + def ntranslation_classes_per_level( + well_sep_is_n_away: int, dimensions: int) -> int: return (4 * well_sep_is_n_away + 3) ** dimensions - def translation_class_to_normalized_vector(self, well_sep_is_n_away, - dimensions, cls): + def translation_class_to_normalized_vector( + self, well_sep_is_n_away: int, dimensions: int, cls: type + ) -> np.ndarray: # This computes the vector for the translation class, using the inverse # of the formula found in get_translation_class() defined in # TRANSLATION_CLASS_FINDER_PREAMBLE_TEMPLATE. @@ -303,13 +313,15 @@ def translation_class_to_normalized_vector(self, well_sep_is_n_away, for i in range(dimensions): result[i] = cls % base - shift cls //= base + return result - def compute_translation_classes(self, queue, trav, tree, wait_for, + def compute_translation_classes(self, + actx: PyOpenCLArrayContext, trav, tree, wait_for, is_translation_per_level): """ - Returns a tuple *evt*, *translation_class_is_used* and - *translation_classes_lists*. 
+ :returns: a :class:`tuple` containing *evt*, *translation_class_is_used* + and *translation_classes_lists*. """ # {{{ compute translation classes for list 2 @@ -328,14 +340,11 @@ def compute_translation_classes(self, queue, trav, tree, wait_for, if is_translation_per_level: ntranslation_classes = ntranslation_classes * tree.nlevels - translation_classes_lists = cl.array.empty( - queue, len(trav.from_sep_siblings_lists), dtype=np.int32) - - translation_class_is_used = cl.array.zeros( - queue, ntranslation_classes, dtype=np.int32) - - error_flag = cl.array.zeros(queue, 1, dtype=np.int32) + translation_classes_lists = actx.np.zeros( + len(trav.from_sep_siblings_lists), dtype=np.int32) + translation_class_is_used = actx.zeros(ntranslation_classes, dtype=np.int32) + error_flag = actx.zeros(1, dtype=np.int32) evt = knl_info.translation_class_finder( trav.from_sep_siblings_lists, trav.from_sep_siblings_starts, @@ -349,9 +358,10 @@ def compute_translation_classes(self, queue, trav, tree, wait_for, translation_classes_lists, translation_class_is_used, error_flag, - queue=queue, wait_for=wait_for) + queue=actx.queue, + wait_for=wait_for) - if (error_flag.get()): + if actx.to_numpy(error_flag)[0]: raise ValueError("could not compute translation classes") return (evt, translation_class_is_used, translation_classes_lists) @@ -359,13 +369,13 @@ def compute_translation_classes(self, queue, trav, tree, wait_for, # }}} @log_process(logger, "build m2l translation classes") - def __call__(self, queue, trav, tree, wait_for=None, - is_translation_per_level=True): + def __call__(self, actx: PyOpenCLArrayContext, + trav, tree, wait_for=None, is_translation_per_level=True): """Returns a pair *info*, *evt* where info is a :class:`TranslationClassesInfo`. 
""" evt, translation_class_is_used, translation_classes_lists = \ - self.compute_translation_classes(queue, trav, tree, wait_for, + self.compute_translation_classes(actx, trav, tree, wait_for, is_translation_per_level) well_sep_is_n_away = trav.well_sep_is_n_away @@ -385,7 +395,7 @@ def __call__(self, queue, trav, tree, wait_for=None, prev_level = -1 from_sep_siblings_translation_classes_level_starts = \ np.empty(nlevels+1, dtype=np.int32) - for i, used in enumerate(translation_class_is_used.get()): + for i, used in enumerate(actx.to_numpy(translation_class_is_used)): cls_without_level = i % num_translation_classes level = i // num_translation_classes if (prev_level != level): @@ -403,14 +413,13 @@ def __call__(self, queue, trav, tree, wait_for=None, from_sep_siblings_translation_classes_level_starts[nlevels] = count - translation_classes_lists = ( - cl.array.take( - cl.array.to_device(queue, used_translation_classes_map), - translation_classes_lists)) + translation_classes_lists = actx.from_numpy( + used_translation_classes_map + )[translation_classes_lists] - distances = cl.array.to_device(queue, distances) - from_sep_siblings_translation_classes_level_starts = cl.array.to_device( - queue, from_sep_siblings_translation_classes_level_starts) + distances = actx.from_numpy(distances) + from_sep_siblings_translation_classes_level_starts = actx.from_numpy( + from_sep_siblings_translation_classes_level_starts) info = TranslationClassesInfo( traversal=trav, @@ -418,9 +427,9 @@ def __call__(self, queue, trav, tree, wait_for=None, from_sep_siblings_translation_class_to_distance_vector=distances, from_sep_siblings_translation_classes_level_starts=( from_sep_siblings_translation_classes_level_starts), - ).with_queue(None) + ) - return info, evt + return actx.freeze(info), evt # }}} From 959d3a7706cd1f42faf5c1e9090c95db36e70df1 Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Mon, 20 Jun 2022 21:42:49 +0300 Subject: [PATCH 03/28] port rotation_classes to arraycontext --- 
boxtree/rotation_classes.py | 57 +++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 27 deletions(-) diff --git a/boxtree/rotation_classes.py b/boxtree/rotation_classes.py index 22c2b6b5..43e1b759 100644 --- a/boxtree/rotation_classes.py +++ b/boxtree/rotation_classes.py @@ -33,24 +33,25 @@ """ import logging +from dataclasses import dataclass import numpy as np +from arraycontext import Array -import pyopencl as cl -import pyopencl.array +from pytools import log_process -from boxtree.tools import DeviceDataRecord +from boxtree.array_context import PyOpenCLArrayContext, dataclass_array_container from boxtree.translation_classes import TranslationClassesBuilder logger = logging.getLogger(__name__) -from pytools import log_process - # {{{ rotation classes builder -class RotationClassesInfo(DeviceDataRecord): +@dataclass_array_container +@dataclass(frozen=True) +class RotationClassesInfo: r"""Interaction lists to help with matrix precomputations for rotation-based translations ("point and shoot"). @@ -75,6 +76,9 @@ class RotationClassesInfo(DeviceDataRecord): """ + from_sep_siblings_rotation_classes: Array + from_sep_siblings_rotation_class_to_angle: Array + @property def nfrom_sep_siblings_rotation_classes(self): return len(self.from_sep_siblings_rotation_class_to_angle) @@ -87,25 +91,24 @@ class RotationClassesBuilder: .. 
automethod:: __call__ """ - def __init__(self, context): - self.context = context - self.tcb = TranslationClassesBuilder(context) + def __init__(self, array_context: PyOpenCLArrayContext): + self._setup_actx = array_context + self.tcb = TranslationClassesBuilder(array_context) @staticmethod - def vec_gcd(vec): + def vec_gcd(vec) -> int: """Return the GCD of a list of integers.""" - def gcd(a, b): - while b: - a, b = b, a % b - return a + import math + # TODO: math.gcd supports a list of integers from >= 3.9 result = abs(vec[0]) for elem in vec[1:]: - result = gcd(result, abs(elem)) + result = math.gcd(result, abs(elem)) + return result def compute_rotation_classes(self, - well_sep_is_n_away, dimensions, used_translation_classes): + well_sep_is_n_away: int, dimensions: int, used_translation_classes): """Convert translation classes to a list of rotation classes and angles.""" angle_to_rot_class = {} angles = [] @@ -154,11 +157,11 @@ def compute_rotation_classes(self, return translation_class_to_rot_class, angles @log_process(logger, "build m2l rotation classes") - def __call__(self, queue, trav, tree, wait_for=None): + def __call__(self, actx, trav, tree, wait_for=None): """Returns a pair *info*, *evt* where info is a :class:`RotationClassesInfo`. 
""" evt, translation_class_is_used, translation_classes_lists = \ - self.tcb.compute_translation_classes(queue, trav, tree, wait_for, False) + self.tcb.compute_translation_classes(actx, trav, tree, wait_for, False) d = tree.dimensions n = trav.well_sep_is_n_away @@ -166,7 +169,7 @@ def __call__(self, queue, trav, tree, wait_for=None): # convert translation classes to rotation classes used_translation_classes = ( - np.flatnonzero(translation_class_is_used.get())) + np.flatnonzero(actx.to_numpy(translation_class_is_used))) translation_class_to_rotation_class, rotation_angles = ( self.compute_rotation_classes(n, d, used_translation_classes)) @@ -176,17 +179,17 @@ def __call__(self, queue, trav, tree, wait_for=None): # positions for list 2 boxes. assert len(rotation_angles) <= 2**(d-1) * (2*n+1)**d - rotation_classes_lists = ( - cl.array.take( - cl.array.to_device(queue, translation_class_to_rotation_class), - translation_classes_lists)) - - rotation_angles = cl.array.to_device(queue, np.array(rotation_angles)) + rotation_classes_lists = actx.from_numpy( + translation_class_to_rotation_class + )[translation_classes_lists] + rotation_angles = actx.from_numpy(np.array(rotation_angles)) - return RotationClassesInfo( + info = RotationClassesInfo( from_sep_siblings_rotation_classes=rotation_classes_lists, from_sep_siblings_rotation_class_to_angle=rotation_angles, - ).with_queue(None), evt + ) + + return actx.freeze(info), evt # }}} From bac678dad9f83c0f9b94bb2508aed042e0b9d244 Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Mon, 20 Jun 2022 21:58:11 +0300 Subject: [PATCH 04/28] port area_query to arraycontext --- boxtree/area_query.py | 214 ++++++++++++++++++++++-------------------- 1 file changed, 111 insertions(+), 103 deletions(-) diff --git a/boxtree/area_query.py b/boxtree/area_query.py index 6804afd1..23f78ba2 100644 --- a/boxtree/area_query.py +++ b/boxtree/area_query.py @@ -24,22 +24,23 @@ import logging +from dataclasses import dataclass from functools import 
partial import numpy as np +from arraycontext import Array from mako.template import Template -import pyopencl as cl -import pyopencl.array -import pyopencl.cltypes +from pyopencl.elementwise import ElementwiseTemplate from pytools import ProcessLogger, memoize_method +from boxtree.array_context import PyOpenCLArrayContext, dataclass_array_container from boxtree.tools import ( - AXIS_NAMES, - DeviceDataRecord, + InlineBinarySearch, coord_vec_subscript_code, get_coord_vec_dtype, ) +from boxtree.tree import Tree logger = logging.getLogger(__name__) @@ -82,7 +83,9 @@ # {{{ output -class PeerListLookup(DeviceDataRecord): +@dataclass_array_container +@dataclass(frozen=True) +class PeerListLookup: """ .. attribute:: tree @@ -96,13 +99,17 @@ class PeerListLookup(DeviceDataRecord): .. attribute:: peer_lists - .. automethod:: get - .. versionadded:: 2016.1 """ + tree: Tree + peer_list_starts: Array + peer_lists: Array + -class AreaQueryResult(DeviceDataRecord): +@dataclass_array_container +@dataclass(frozen=True) +class AreaQueryResult: """ .. attribute:: tree @@ -117,13 +124,17 @@ class AreaQueryResult(DeviceDataRecord): .. attribute:: leaves_near_ball_lists - .. automethod:: get - .. versionadded:: 2016.1 """ + tree: Tree + leaves_near_ball_starts: Array + leaves_near_ball_lists: Array + -class LeavesToBallsLookup(DeviceDataRecord): +@dataclass_array_container +@dataclass(frozen=True) +class LeavesToBallsLookup: """ .. attribute:: tree @@ -140,10 +151,12 @@ class LeavesToBallsLookup(DeviceDataRecord): this list is indexed by the global box index. .. attribute:: balls_near_box_lists - - .. 
automethod:: get """ + tree: Tree + balls_near_box_starts: Array + balls_near_box_lists: Array + # }}} @@ -454,12 +467,6 @@ class LeavesToBallsLookup(DeviceDataRecord): """ - -from pyopencl.elementwise import ElementwiseTemplate - -from boxtree.tools import InlineBinarySearch - - STARTS_EXPANDER_TEMPLATE = ElementwiseTemplate( arguments=r""" idx_t *dst, @@ -546,6 +553,7 @@ def generate(self, context, from pyopencl.tools import dtype_to_ctype from boxtree import box_flags_enum + from boxtree.tools import AXIS_NAMES from boxtree.traversal import TRAVERSAL_PREAMBLE_TYPEDEFS_AND_DEFINES from boxtree.tree_build import TreeBuilder render_vars = ( @@ -648,9 +656,13 @@ class AreaQueryBuilder: .. automethod:: __init__ .. automethod:: __call__ """ - def __init__(self, context): - self.context = context - self.peer_list_finder = PeerListFinder(self.context) + def __init__(self, array_context: PyOpenCLArrayContext): + self._setup_actx = array_context + self.peer_list_finder = PeerListFinder(array_context) + + @property + def context(self): + return self._setup_actx.queue.context # {{{ Kernel generation @@ -660,12 +672,12 @@ def get_area_query_kernel(self, dimensions, coord_dtype, box_id_dtype, from pyopencl.tools import dtype_to_ctype from boxtree import box_flags_enum - - logger.debug("start building area query kernel") - + from boxtree.tools import AXIS_NAMES from boxtree.traversal import TRAVERSAL_PREAMBLE_TEMPLATE from boxtree.tree_build import TreeBuilder + logger.debug("start building area query kernel") + template = Template( TRAVERSAL_PREAMBLE_TEMPLATE + AREA_QUERY_TEMPLATE, @@ -722,20 +734,14 @@ def get_area_query_kernel(self, dimensions, coord_dtype, box_id_dtype, # }}} - def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None, + def __call__(self, actx: PyOpenCLArrayContext, tree: Tree, + ball_centers, ball_radii, peer_lists=None, wait_for=None): """ - :arg queue: a :class:`pyopencl.CommandQueue` - :arg tree: a :class:`boxtree.Tree`. 
- :arg ball_centers: an object array of coordinate - :class:`pyopencl.array.Array` instances. - Their *dtype* must match *tree*'s - :attr:`boxtree.Tree.coord_dtype`. - :arg ball_radii: a - :class:`pyopencl.array.Array` - of positive numbers. - Its *dtype* must match *tree*'s - :attr:`boxtree.Tree.coord_dtype`. + :arg ball_centers: an object array of coordinates. Their *dtype* must + match *tree*'s :attr:`boxtree.Tree.coord_dtype`. + :arg ball_radii: an array of positive numbers. Its *dtype* must match + *tree*'s :attr:`boxtree.Tree.coord_dtype`. :arg peer_lists: may either be *None* or an instance of :class:`PeerListLookup` associated with `tree`. :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event` @@ -760,7 +766,7 @@ def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None, max_levels = div_ceil(tree.nlevels, 10) * 10 if peer_lists is None: - peer_lists, evt = self.peer_list_finder(queue, tree, wait_for=wait_for) + peer_lists, evt = self.peer_list_finder(actx, tree, wait_for=wait_for) wait_for = [evt] if len(peer_lists.peer_list_starts) != tree.nboxes + 1: @@ -773,7 +779,7 @@ def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None, aq_plog = ProcessLogger(logger, "area query") result, evt = area_query_kernel( - queue, len(ball_radii), + actx.queue, len(ball_radii), tree.box_centers.data, tree.root_extent, tree.box_levels, tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags, @@ -785,10 +791,12 @@ def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None, aq_plog.done() - return AreaQueryResult( + result = AreaQueryResult( tree=tree, leaves_near_ball_starts=result["leaves"].starts, - leaves_near_ball_lists=result["leaves"].lists).with_queue(None), evt + leaves_near_ball_lists=result["leaves"].lists) + + return actx.freeze(result), evt # }}} @@ -803,12 +811,16 @@ class LeavesToBallsLookupBuilder: .. 
automethod:: __call__ """ - def __init__(self, context): - self.context = context - + def __init__(self, array_context: PyOpenCLArrayContext): from pyopencl.algorithm import KeyValueSorter - self.key_value_sorter = KeyValueSorter(context) - self.area_query_builder = AreaQueryBuilder(context) + + self._setup_actx = array_context + self.key_value_sorter = KeyValueSorter(self.context) + self.area_query_builder = AreaQueryBuilder(array_context) + + @property + def context(self): + return self._setup_actx.queue.context @memoize_method def get_starts_expander_kernel(self, idx_dtype): @@ -823,20 +835,14 @@ def get_starts_expander_kernel(self, idx_dtype): self.context, type_aliases=(("idx_t", idx_dtype),)) - def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None, + def __call__(self, actx: PyOpenCLArrayContext, tree: Tree, + ball_centers, ball_radii, peer_lists=None, wait_for=None): """ - :arg queue: a :class:`pyopencl.CommandQueue` - :arg tree: a :class:`boxtree.Tree`. - :arg ball_centers: an object array of coordinate - :class:`pyopencl.array.Array` instances. - Their *dtype* must match *tree*'s - :attr:`boxtree.Tree.coord_dtype`. - :arg ball_radii: a - :class:`pyopencl.array.Array` - of positive numbers. - Its *dtype* must match *tree*'s - :attr:`boxtree.Tree.coord_dtype`. + :arg ball_centers: an object array of coordinates. Their *dtype* must + match *tree*'s :attr:`boxtree.Tree.coord_dtype`. + :arg ball_radii: an array of positive numbers. Its *dtype* must match + *tree*'s :attr:`boxtree.Tree.coord_dtype`. :arg peer_lists: may either be *None* or an instance of :class:`PeerListLookup` associated with `tree`. 
:arg wait_for: may either be *None* or a list of :class:`pyopencl.Event` @@ -856,7 +862,7 @@ def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None, ltb_plog = ProcessLogger(logger, "leaves-to-balls lookup: run area query") area_query, evt = self.area_query_builder( - queue, tree, ball_centers, ball_radii, peer_lists, wait_for) + actx, tree, ball_centers, ball_radii, peer_lists, wait_for) wait_for = [evt] logger.debug("leaves-to-balls lookup: expand starts") @@ -873,11 +879,11 @@ def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None, # 2. Key-value sort the (ball number, box number) pairs by box number. starts_expander_knl = self.get_starts_expander_kernel(tree.box_id_dtype) - expanded_starts = cl.array.empty( - queue, len(area_query.leaves_near_ball_lists), tree.box_id_dtype) + expanded_starts = actx.np.zeros( + len(area_query.leaves_near_ball_lists), tree.box_id_dtype) evt = starts_expander_knl( expanded_starts, - area_query.leaves_near_ball_starts.with_queue(queue), + area_query.leaves_near_ball_starts, nballs_p_1) wait_for = [evt] @@ -885,20 +891,21 @@ def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None, balls_near_box_starts, balls_near_box_lists, evt \ = self.key_value_sorter( - queue, + actx.queue, # keys - area_query.leaves_near_ball_lists.with_queue(queue), + area_query.leaves_near_ball_lists, # values expanded_starts, nkeys, starts_dtype=tree.box_id_dtype, wait_for=wait_for) - ltb_plog.done() - return LeavesToBallsLookup( + lookup = LeavesToBallsLookup( tree=tree, balls_near_box_starts=balls_near_box_starts, - balls_near_box_lists=balls_near_box_lists).with_queue(None), evt + balls_near_box_lists=balls_near_box_lists) + + return actx.freeze(lookup), evt # }}} @@ -927,9 +934,13 @@ class SpaceInvaderQueryBuilder: .. 
automethod:: __call__ """ - def __init__(self, context): - self.context = context - self.peer_list_finder = PeerListFinder(self.context) + def __init__(self, array_context: PyOpenCLArrayContext) -> None: + self._setup_actx = array_context + self.peer_list_finder = PeerListFinder(array_context) + + @property + def context(self): + return self._setup_actx.queue.context # {{{ Kernel generation @@ -946,30 +957,23 @@ def get_space_invader_query_kernel(self, dimensions, coord_dtype, # }}} - def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None, + def __call__(self, actx: PyOpenCLArrayContext, tree: Tree, + ball_centers, ball_radii, peer_lists=None, wait_for=None): """ - :arg queue: a :class:`pyopencl.CommandQueue` - :arg tree: a :class:`boxtree.Tree`. - :arg ball_centers: an object array of coordinate - :class:`pyopencl.array.Array` instances. - Their *dtype* must match *tree*'s - :attr:`boxtree.Tree.coord_dtype`. - :arg ball_radii: a - :class:`pyopencl.array.Array` - of positive numbers. - Its *dtype* must match *tree*'s - :attr:`boxtree.Tree.coord_dtype`. + :arg ball_centers: an object array of coordinates. Their *dtype* must + match *tree*'s :attr:`boxtree.Tree.coord_dtype`. + :arg ball_radii: an array of positive numbers. Its *dtype* must match + *tree*'s :attr:`boxtree.Tree.coord_dtype`. :arg peer_lists: may either be *None* or an instance of - :class:`PeerListLookup` associated with `tree`. + :class:`PeerListLookup` associated with *tree*. :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event` instances for whose completion this command waits before starting execution. - :returns: a tuple *(sqi, event)*, where *sqi* is an instance of - :class:`pyopencl.array.Array`, and *event* is a :class:`pyopencl.Event` - for dependency management. The *dtype* of *sqi* is - *tree*'s :attr:`boxtree.Tree.coord_dtype` and its shape is - *(tree.nboxes,)* (see :attr:`boxtree.Tree.nboxes`). 
+ :returns: a tuple *(sqi, event)*, where *sqi* is an array and *event* + is a :class:`pyopencl.Event` for dependency management. The *dtype* + of *sqi* is *tree*'s :attr:`boxtree.Tree.coord_dtype` and its shape + is *(tree.nboxes,)* (see :attr:`boxtree.Tree.nboxes`). The entries of *sqi* are indexed by the global box index and are as follows: @@ -990,7 +994,7 @@ def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None, max_levels = div_ceil(tree.nlevels, 10) * 10 if peer_lists is None: - peer_lists, evt = self.peer_list_finder(queue, tree, wait_for=wait_for) + peer_lists, evt = self.peer_list_finder(actx, tree, wait_for=wait_for) wait_for = [evt] if len(peer_lists.peer_list_starts) != tree.nboxes + 1: @@ -1002,7 +1006,7 @@ def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None, si_plog = ProcessLogger(logger, "space invader query") - outer_space_invader_dists = cl.array.zeros(queue, tree.nboxes, np.float32) + outer_space_invader_dists = actx.zeros(tree.nboxes, np.float32) if not wait_for: wait_for = [] wait_for = (wait_for @@ -1017,7 +1021,7 @@ def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None, outer_space_invader_dists, *tuple(bc for bc in ball_centers)), wait_for=wait_for, - queue=queue, + queue=actx.queue, range=slice(len(ball_radii))) if tree.coord_dtype != np.dtype(np.float32): @@ -1062,8 +1066,12 @@ class PeerListFinder: .. 
automethod:: __call__ """ - def __init__(self, context): - self.context = context + def __init__(self, array_context: PyOpenCLArrayContext): + self._setup_actx = array_context + + @property + def context(self): + return self._setup_actx.queue.context # {{{ Kernel generation @@ -1073,14 +1081,14 @@ def get_peer_list_finder_kernel(self, dimensions, coord_dtype, from pyopencl.tools import dtype_to_ctype from boxtree import box_flags_enum - - logger.debug("start building peer list finder kernel") - + from boxtree.tools import AXIS_NAMES from boxtree.traversal import ( HELPER_FUNCTION_TEMPLATE, TRAVERSAL_PREAMBLE_TEMPLATE, ) + logger.debug("start building peer list finder kernel") + template = Template( TRAVERSAL_PREAMBLE_TEMPLATE + HELPER_FUNCTION_TEMPLATE @@ -1130,10 +1138,8 @@ def get_peer_list_finder_kernel(self, dimensions, coord_dtype, # }}} - def __call__(self, queue, tree, wait_for=None): + def __call__(self, actx: PyOpenCLArrayContext, tree: Tree, wait_for=None): """ - :arg queue: a :class:`pyopencl.CommandQueue` - :arg tree: a :class:`boxtree.Tree`. :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event` instances for whose completion this command waits before starting execution. 
@@ -1153,7 +1159,7 @@ def __call__(self, queue, tree, wait_for=None): pl_plog = ProcessLogger(logger, "find peer lists") result, evt = peer_list_finder_kernel( - queue, tree.nboxes, + actx.queue, tree.nboxes, tree.box_centers.data, tree.root_extent, tree.box_levels, tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags, @@ -1161,10 +1167,12 @@ def __call__(self, queue, tree, wait_for=None): pl_plog.done() - return PeerListLookup( + lookup = PeerListLookup( tree=tree, peer_list_starts=result["peers"].starts, - peer_lists=result["peers"].lists).with_queue(None), evt + peer_lists=result["peers"].lists) + + return actx.freeze(lookup), evt # }}} From ca41e31afd5d5055e4ea18ee5c324c04ec4f1883 Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Tue, 21 Jun 2022 20:06:48 +0300 Subject: [PATCH 05/28] port bounding_box to array_context --- boxtree/bounding_box.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/boxtree/bounding_box.py b/boxtree/bounding_box.py index 77cc51ce..804820a6 100644 --- a/boxtree/bounding_box.py +++ b/boxtree/bounding_box.py @@ -20,13 +20,12 @@ THE SOFTWARE. 
""" - import numpy as np -import pyopencl as cl # noqa from pyopencl.reduction import ReductionTemplate from pytools import memoize, memoize_method +from boxtree.array_context import PyOpenCLArrayContext from boxtree.tools import get_type_moniker @@ -121,17 +120,22 @@ def make_bounding_box_dtype(device, dimensions, coord_dtype): class BoundingBoxFinder: - def __init__(self, context): - self.context = context + def __init__(self, array_context: PyOpenCLArrayContext): + self._setup_actx = array_context - for dev in context.devices: + for dev in self.context.devices: if (dev.vendor == "Intel(R) Corporation" and dev.version == "OpenCL 1.2 (Build 56860)"): raise RuntimeError("bounding box finder does not work " "properly with this CL runtime.") + @property + def context(self): + return self._setup_actx.queue.context + @memoize_method def get_kernel(self, dimensions, coord_dtype, have_radii): + # FIXME: Why does this just use `devices[0]`? bbox_dtype, _bbox_cdecl = make_bounding_box_dtype( self.context.devices[0], dimensions, coord_dtype) @@ -152,18 +156,18 @@ def get_kernel(self, dimensions, coord_dtype, have_radii): ) ) - def __call__(self, particles, radii, wait_for=None): + def __call__(self, actx, particles, radii, wait_for=None): dimensions = len(particles) from pytools import single_valued coord_dtype = single_valued(coord.dtype for coord in particles) - radii_tuple = () if radii is None else (radii,) - knl = self.get_kernel(dimensions, coord_dtype, - # have_radii: - radii is not None) - return knl(*(tuple(particles) + radii_tuple), - wait_for=wait_for, return_event=True) + + knl = self.get_kernel(dimensions, coord_dtype, have_radii=radii is not None) + return knl( + *(tuple(particles) + radii_tuple), + queue=actx.queue, + wait_for=wait_for, return_event=True) # }}} From 4355c48d6082495e69cebe4729be9dc5732d7b4c Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Tue, 21 Jun 2022 20:13:42 +0300 Subject: [PATCH 06/28] port fmm to array_context --- boxtree/fmm.py | 
14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/boxtree/fmm.py b/boxtree/fmm.py index d1eacd9a..3c4da9a1 100644 --- a/boxtree/fmm.py +++ b/boxtree/fmm.py @@ -30,14 +30,15 @@ import logging from abc import ABC, abstractmethod - -logger = logging.getLogger(__name__) from pytools import ProcessLogger from boxtree.traversal import FMMTraversalInfo from boxtree.tree import Tree +logger = logging.getLogger(__name__) + + # {{{ expansion wrangler interface class TreeIndependentDataForWrangler: @@ -113,8 +114,9 @@ class ExpansionWranglerInterface(ABC): .. automethod:: finalize_potentials """ - def __init__(self, tree_indep: TreeIndependentDataForWrangler, - traversal: FMMTraversalInfo): + def __init__(self, + tree_indep: TreeIndependentDataForWrangler, + traversal: FMMTraversalInfo) -> None: self.tree_indep = tree_indep self.traversal = traversal @@ -264,7 +266,7 @@ def finalize_potentials(self, potentials, template_ary): :class:`boxtree.pyfmmlib_integration.FMMLibExpansionWrangler` uses :class:`numpy.ndarray` internally, this array can be used to help convert the output back to the user's array - type (typically :class:`pyopencl.array.Array`). + type. """ def distribute_source_weights(self, src_weight_vecs, src_idx_all_ranks): @@ -368,8 +370,8 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs, # Interface guidelines: Attributes of the tree are assumed to be known # to the expansion wrangler and should not be passed. 
- fmm_proc = ProcessLogger(logger, "fmm") from boxtree.timing import TimingRecorder + fmm_proc = ProcessLogger(logger, "fmm") recorder = TimingRecorder() src_weight_vecs = [wrangler.reorder_sources(weight) for From 779a377920df6f9e8783aa2d22b5e5597bedf39d Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Tue, 21 Jun 2022 20:47:47 +0300 Subject: [PATCH 07/28] port traversal to arraycontext --- boxtree/traversal.py | 298 ++++++++++++++++++++++++------------------- 1 file changed, 166 insertions(+), 132 deletions(-) diff --git a/boxtree/traversal.py b/boxtree/traversal.py index f8f3d3b6..f1fd5ecf 100644 --- a/boxtree/traversal.py +++ b/boxtree/traversal.py @@ -34,30 +34,27 @@ THE SOFTWARE. """ +import enum import logging +from dataclasses import dataclass from functools import partial import numpy as np +from arraycontext import Array from mako.template import Template -import pyopencl as cl -import pyopencl.array -import pyopencl.cltypes -from pyopencl.elementwise import ElementwiseTemplate -from pytools import Record, memoize_method +from pyopencl.algorithm import ListOfListsBuilder +from pyopencl.elementwise import ElementwiseKernel, ElementwiseTemplate +from pytools import ProcessLogger, log_process, memoize_method +from pytools.obj_array import make_obj_array -from boxtree.tools import ( - AXIS_NAMES, - DeviceDataRecord, - coord_vec_subscript_code, - get_coord_vec_dtype, -) +from boxtree.array_context import PyOpenCLArrayContext, dataclass_array_container +from boxtree.tools import AXIS_NAMES, coord_vec_subscript_code, get_coord_vec_dtype +from boxtree.tree import Tree logger = logging.getLogger(__name__) -from pytools import ProcessLogger, log_process - # {{{ preamble @@ -1182,7 +1179,7 @@ name="merge_lists") -class _IndexStyle: +class _IndexStyle(enum.IntEnum): TARGET_BOXES = 0 TARGET_OR_TARGET_PARENT_BOXES = 1 @@ -1190,10 +1187,14 @@ class _IndexStyle: class _ListMerger: """Utility class for combining box lists optionally changing indexing style.""" - def 
__init__(self, context, box_id_dtype): - self.context = context + def __init__(self, array_context: PyOpenCLArrayContext, box_id_dtype): + self._setup_actx = array_context self.box_id_dtype = box_id_dtype + @property + def context(self): + return self._setup_actx.queue.context + @memoize_method def get_list_merger_kernel(self, nlists, write_counts): """ @@ -1213,7 +1214,7 @@ def get_list_merger_kernel(self, nlists, write_counts): ("write_counts", write_counts), )) - def __call__(self, queue, input_starts, input_lists, input_index_style, + def __call__(self, actx, input_starts, input_lists, input_index_style, output_index_style, target_boxes, target_or_target_parent_boxes, nboxes, debug=False, wait_for=None): """ @@ -1246,18 +1247,18 @@ def __call__(self, queue, input_starts, input_lists, input_index_style, and output_index_style == _IndexStyle.TARGET_BOXES): from boxtree.tools import reverse_index_array target_or_target_parent_boxes_from_all_boxes = reverse_index_array( - target_or_target_parent_boxes, target_size=nboxes, - queue=queue) - target_or_target_parent_boxes_from_target_boxes = cl.array.take( - target_or_target_parent_boxes_from_all_boxes, - target_boxes, queue=queue) + actx, target_or_target_parent_boxes, target_size=nboxes) + target_or_target_parent_boxes_from_target_boxes = ( + target_or_target_parent_boxes_from_all_boxes[target_boxes] + ) output_to_input_box = target_or_target_parent_boxes_from_target_boxes else: - output_to_input_box = cl.array.arange( - queue, noutput_boxes, dtype=self.box_id_dtype) + output_to_input_box = actx.from_numpy( + np.arange(noutput_boxes, dtype=self.box_id_dtype) + ) - new_counts = cl.array.empty(queue, noutput_boxes+1, self.box_id_dtype) + new_counts = actx.np.zeros(noutput_boxes + 1, self.box_id_dtype) assert len(input_starts) == len(input_lists) nlists = len(input_starts) @@ -1269,17 +1270,14 @@ def __call__(self, queue, input_starts, input_lists, input_index_style, # output: new_counts, range=slice(noutput_boxes), - 
queue=queue, + queue=actx.queue, wait_for=wait_for) - new_starts = cl.array.cumsum(new_counts) + import pyopencl.array as cl_array + new_starts = cl_array.cumsum(new_counts) del new_counts - new_lists = cl.array.empty( - queue, - int(new_starts[-1].get()), - self.box_id_dtype) - + new_lists = actx.np.zeros(int(actx.to_numpy(new_starts[-1])), self.box_id_dtype) new_lists.fill(999999999) evt = self.get_list_merger_kernel(nlists, False)( @@ -1291,7 +1289,7 @@ def __call__(self, queue, input_starts, input_lists, input_index_style, # output: new_lists, range=slice(noutput_boxes), - queue=queue, + queue=actx.queue, wait_for=[evt]) return {"starts": new_starts, "lists": new_lists}, evt @@ -1301,7 +1299,9 @@ def __call__(self, queue, input_starts, input_lists, input_index_style, # {{{ traversal info (output) -class FMMTraversalInfo(DeviceDataRecord): +@dataclass_array_container +@dataclass(frozen=True) +class FMMTraversalInfo: r"""Interaction lists needed for a fast-multipole-like linear-time gather of particle interactions. @@ -1312,9 +1312,6 @@ class FMMTraversalInfo(DeviceDataRecord): Scientific and Statistical Computing 9, no. 4 (July 1988): 669-686. `DOI: 10.1137/0909044 `__. - Unless otherwise indicated, all bulk data in this data structure is stored - in a :class:`pyopencl.array.Array`. See also :meth:`get`. - .. attribute:: tree An instance of :class:`boxtree.Tree`. @@ -1421,16 +1418,6 @@ class FMMTraversalInfo(DeviceDataRecord): ``box_id_t [*]`` - Following attributes are deprecated. - - .. attribute:: colleagues_starts - - ``box_id_t [nboxes+1]`` - - .. attribute:: colleagues_lists - - ``box_id_t [*]`` - .. ------------------------------------------------------------------------ .. rubric:: Neighbor Sources ("List 1") .. 
------------------------------------------------------------------------ @@ -1552,15 +1539,65 @@ class FMMTraversalInfo(DeviceDataRecord): Changed index style of *from_sep_close_bigger_starts* from :attr:`target_or_target_parent_boxes` to :attr:`target_boxes`. - - .. automethod:: get - .. automethod:: merge_close_lists """ + tree: Tree + well_sep_is_n_away: int + + # basic box lists for iteration + source_boxes: Array + target_boxes: Array + level_start_source_box_nrs: Array + level_start_target_box_nrs: Array + source_parent_boxes: Array + level_start_source_parent_box_nrs: Array + target_or_target_parent_boxes: Array + level_start_target_or_target_parent_box_nrs: Array + + # same-level non-well-separated boxes + same_level_non_well_sep_boxes_starts: Array + same_level_non_well_sep_boxes_lists: Array + + # neighbor sources ("List 1") + neighbor_source_boxes_starts: Array + neighbor_source_boxes_lists: Array + + # separated siblings ("List 2") + from_sep_siblings_starts: Array + from_sep_siblings_lists: Array + + # separated smaller boxes ("List 3") + from_sep_smaller_by_level: Array + target_boxes_sep_smaller_by_source_level: Array + from_sep_close_smaller_starts: Array + from_sep_close_smaller_lists: Array + + # separated bigger boxes ("List 4") + from_sep_bigger_starts: Array + from_sep_bigger_lists: Array + from_sep_close_bigger_starts: Array + from_sep_close_bigger_lists: Array + + @property + def nboxes(self): + return self.tree.nboxes + + @property + def nlevels(self): + return self.tree.nlevels + + @property + def ntarget_boxes(self): + return len(self.target_boxes) + + @property + def ntarget_or_target_parent_boxes(self): + return len(self.target_or_target_parent_boxes) + # {{{ "close" list merging -> "unified list 1" - def merge_close_lists(self, queue, debug=False): + def merge_close_lists(self, actx, debug=False): """Return a new :class:`FMMTraversalInfo` instance with the contents of :attr:`from_sep_close_smaller_starts` and 
:attr:`from_sep_close_bigger_starts` merged into @@ -1568,11 +1605,11 @@ def merge_close_lists(self, queue, debug=False): *None*. """ - list_merger = _ListMerger(queue.context, self.tree.box_id_dtype) + list_merger = _ListMerger(actx, self.tree.box_id_dtype) result, evt = ( list_merger( - queue, + actx, # starts (self.neighbor_source_boxes_starts, self.from_sep_close_smaller_starts, @@ -1591,11 +1628,13 @@ def merge_close_lists(self, queue, debug=False): self.tree.nboxes, debug)) + import pyopencl as cl cl.wait_for_events([evt]) - return self.copy( - neighbor_source_boxes_starts=result["starts"].with_queue(None), - neighbor_source_boxes_lists=result["lists"].with_queue(None), + from dataclasses import replace + return replace(self, + neighbor_source_boxes_starts=actx.freeze(result["starts"]), + neighbor_source_boxes_lists=actx.freeze(result["lists"]), from_sep_close_smaller_starts=None, from_sep_close_smaller_lists=None, from_sep_close_bigger_starts=None, @@ -1606,34 +1645,25 @@ def merge_close_lists(self, queue, debug=False): # {{{ debugging aids def get_box_list(self, what, index): - starts = getattr(self, what+"_starts") - lists = getattr(self, what+"_lists") + starts = getattr(self, f"{what}_starts") + lists = getattr(self, f"{what}_lists") start, stop = starts[index:index+2] return lists[start:stop] # }}} - @property - def nboxes(self): - return self.tree.nboxes - - @property - def nlevels(self): - return self.tree.nlevels - - @property - def ntarget_boxes(self): - return len(self.target_boxes) - - @property - def ntarget_or_target_parent_boxes(self): - return len(self.target_or_target_parent_boxes) - # }}} -class _KernelInfo(Record): - pass +@dataclass(frozen=True) +class _KernelInfo: + sources_parents_and_targets_builder: ListOfListsBuilder + level_start_box_nrs_extractor: ElementwiseKernel + same_level_non_well_sep_boxes_builder: ListOfListsBuilder + neighbor_source_boxes_builder: ListOfListsBuilder + from_sep_siblings_builder: ListOfListsBuilder + 
from_sep_smaller_builder: ListOfListsBuilder + from_sep_bigger_builder: ListOfListsBuilder class FMMTraversalBuilder: @@ -1641,7 +1671,9 @@ class FMMTraversalBuilder: .. automethod:: __init__ """ - def __init__(self, context, well_sep_is_n_away=1, from_sep_smaller_crit=None): + def __init__(self, array_context: PyOpenCLArrayContext, *, + well_sep_is_n_away=1, + from_sep_smaller_crit=None) -> None: """ :arg well_sep_is_n_away: Either An integer 1 or greater. (Only 1 and 2 are tested.) @@ -1657,10 +1689,14 @@ def __init__(self, context, well_sep_is_n_away=1, from_sep_smaller_crit=None): including their radii), or ``"static_l2"`` (use the circumcircle of the box, possibly enlarged by :attr:`boxtree.Tree.stick_out_factor`). """ - self.context = context + self._setup_actx = array_context self.well_sep_is_n_away = well_sep_is_n_away self.from_sep_smaller_crit = from_sep_smaller_crit + @property + def context(self): + return self._setup_actx.queue.context + # {{{ kernel builder @memoize_method @@ -1738,6 +1774,7 @@ def get_kernel_info(self, *, dimensions, particle_id_dtype, box_id_dtype, "source_boxes_has_mask": source_boxes_has_mask, "source_parent_boxes_has_mask": source_parent_boxes_has_mask, } + from pyopencl.algorithm import ListOfListsBuilder from boxtree.tools import ScalarArg, VectorArg @@ -1873,13 +1910,12 @@ def get_kernel_info(self, *, dimensions, particle_id_dtype, box_id_dtype, # {{{ driver - def __call__(self, queue, tree, wait_for=None, debug=False, + def __call__(self, actx: PyOpenCLArrayContext, tree: Tree, + wait_for=None, debug=False, _from_sep_smaller_min_nsources_cumul=None, source_boxes_mask=None, source_parent_boxes_mask=None): """ - :arg queue: A :class:`pyopencl.CommandQueue` instance. - :arg tree: A :class:`boxtree.Tree` instance. :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event` instances for whose completion this command waits before starting execution. 
@@ -1888,7 +1924,7 @@ def __call__(self, queue, tree, wait_for=None, debug=False, :arg source_parent_boxes_mask: Only boxes passing this mask will be considered for `source_parent_boxes`. Used by the distributed implementation. - :return: A tuple *(trav, event)*, where *trav* is a new instance of + :return: A :class:`tuple` *(trav, event)*, where *trav* is a new instance of :class:`FMMTraversalInfo` and *event* is a :class:`pyopencl.Event` for dependency management. """ @@ -1908,16 +1944,17 @@ def __call__(self, queue, tree, wait_for=None, debug=False, "traversal generation") # FIXME: missing on TreeOfBoxes + nlevels = actx.to_numpy(tree.nlevels) sources_are_targets = getattr(tree, "sources_are_targets", True) # Generated code shouldn't depend on the *exact* number of tree levels. # So round up to the next multiple of 5. from pytools import div_ceil - max_levels = div_ceil(tree.nlevels, 5) * 5 + max_levels = div_ceil(nlevels, 5) * 5 level_start_box_nrs = ( None if tree.level_start_box_nrs is None else - cl.array.to_device(queue, tree.level_start_box_nrs)) + tree.level_start_box_nrs) knl_info = self.get_kernel_info( dimensions=tree.dimensions, @@ -1933,9 +1970,9 @@ def __call__(self, queue, tree, wait_for=None, debug=False, source_boxes_has_mask=source_boxes_mask is not None, source_parent_boxes_has_mask=source_parent_boxes_mask is not None) - def fin_debug(s): + def debug_with_finish(s): if debug: - queue.finish() + actx.queue.finish() logger.debug(s) @@ -1943,7 +1980,8 @@ def fin_debug(s): # {{{ source boxes, their parents, and target boxes - fin_debug("building list of source boxes, their parents, and target boxes") + debug_with_finish( + "building list of source boxes, their parents, and target boxes") extra_args = [] if source_boxes_mask is not None: @@ -1952,7 +1990,7 @@ def fin_debug(s): extra_args.append(source_parent_boxes_mask) result, evt = knl_info.sources_parents_and_targets_builder( - queue, tree.nboxes, tree.box_flags, *extra_args, wait_for=wait_for + 
actx.queue, tree.nboxes, tree.box_flags, *extra_args, wait_for=wait_for ) wait_for = [evt] @@ -1974,43 +2012,44 @@ def extract_level_start_box_nrs(box_list, wait_for): if level_start_box_nrs is None: return None, [] - result = cl.array.empty(queue, - tree.nlevels+1, tree.box_id_dtype) \ - .fill(len(box_list)) + result = actx.np.zeros( + nlevels + 1, tree.box_id_dtype).fill(len(box_list)) + evt = knl_info.level_start_box_nrs_extractor( level_start_box_nrs, tree.box_levels, box_list, result, range=slice(0, len(box_list)), - queue=queue, wait_for=wait_for) + queue=actx.queue, wait_for=wait_for) - result = result.get() + result = actx.to_numpy(result) # Postprocess result for unoccupied levels prev_start = len(box_list) - for ilev in range(tree.nlevels-1, -1, -1): + for ilev in range(nlevels - 1, -1, -1): result[ilev] = prev_start = \ min(result[ilev], prev_start) return result, [evt] - fin_debug("finding level starts in source boxes array") + debug_with_finish("finding level starts in source boxes array") level_start_source_box_nrs, evt_s = \ extract_level_start_box_nrs( source_boxes, wait_for=wait_for) - fin_debug("finding level starts in source parent boxes array") + debug_with_finish("finding level starts in source parent boxes array") level_start_source_parent_box_nrs, evt_sp = \ extract_level_start_box_nrs( source_parent_boxes, wait_for=wait_for) - fin_debug("finding level starts in target boxes array") + debug_with_finish("finding level starts in target boxes array") level_start_target_box_nrs, evt_t = \ extract_level_start_box_nrs( target_boxes, wait_for=wait_for) - fin_debug("finding level starts in target or target parent boxes array") + debug_with_finish( + "finding level starts in target or target parent boxes array") level_start_target_or_target_parent_box_nrs, evt_tp = \ extract_level_start_box_nrs( target_or_target_parent_boxes, wait_for=wait_for) @@ -2024,10 +2063,10 @@ def extract_level_start_box_nrs(box_list, wait_for): # If well_sep_is_n_away is 1, 
this agrees with the definition of # 'colleagues' from the classical FMM literature. - fin_debug("finding same-level near-field boxes") + debug_with_finish("finding same-level near-field boxes") result, evt = knl_info.same_level_non_well_sep_boxes_builder( - queue, tree.nboxes, + actx.queue, tree.nboxes, tree.box_centers.data, tree.root_extent, tree.box_levels, tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags, wait_for=wait_for) @@ -2038,10 +2077,10 @@ def extract_level_start_box_nrs(box_list, wait_for): # {{{ neighbor source boxes ("list 1") - fin_debug("finding neighbor source boxes ('list 1')") + debug_with_finish("finding neighbor source boxes ('list 1')") result, evt = knl_info.neighbor_source_boxes_builder( - queue, len(target_boxes), + actx.queue, len(target_boxes), tree.box_centers.data, tree.root_extent, tree.box_levels, tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags, target_boxes, wait_for=wait_for) @@ -2053,10 +2092,10 @@ def extract_level_start_box_nrs(box_list, wait_for): # {{{ well-separated siblings ("list 2") - fin_debug("finding well-separated siblings ('list 2')") + debug_with_finish("finding well-separated siblings ('list 2')") result, evt = knl_info.from_sep_siblings_builder( - queue, len(target_or_target_parent_boxes), + actx.queue, len(target_or_target_parent_boxes), tree.box_centers.data, tree.root_extent, tree.box_levels, tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags, target_or_target_parent_boxes, tree.box_parent_ids.data, @@ -2072,10 +2111,10 @@ def extract_level_start_box_nrs(box_list, wait_for): # {{{ separated smaller ("list 3") - fin_debug("finding separated smaller ('list 3')") + debug_with_finish("finding separated smaller ('list 3')") from_sep_smaller_base_args = ( - queue, len(target_boxes), + actx.queue, len(target_boxes), # base_args tree.box_centers.data, tree.root_extent, tree.box_levels, tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags, @@ -2094,8 +2133,8 @@ def 
extract_level_start_box_nrs(box_list, wait_for): from_sep_smaller_by_level = [] target_boxes_sep_smaller_by_source_level = [] - for ilevel in range(tree.nlevels): - fin_debug(f"finding separated smaller ('list 3 level {ilevel}')") + for ilevel in range(nlevels): + debug_with_finish(f"finding separated smaller ('list 3 level {ilevel}')") result, evt = knl_info.from_sep_smaller_builder( *from_sep_smaller_base_args, ilevel, @@ -2110,7 +2149,7 @@ def extract_level_start_box_nrs(box_list, wait_for): from_sep_smaller_wait_for.append(evt) if with_extent: - fin_debug("finding separated smaller close ('list 3 close')") + debug_with_finish("finding separated smaller close ('list 3 close')") result, evt = knl_info.from_sep_smaller_builder( *from_sep_smaller_base_args, -1, @@ -2131,10 +2170,10 @@ def extract_level_start_box_nrs(box_list, wait_for): # {{{ separated bigger ("list 4") - fin_debug("finding separated bigger ('list 4')") + debug_with_finish("finding separated bigger ('list 4')") result, evt = knl_info.from_sep_bigger_builder( - queue, len(target_or_target_parent_boxes), + actx.queue, len(target_or_target_parent_boxes), tree.box_centers.data, tree.root_extent, tree.box_levels, tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags, tree.stick_out_factor, target_or_target_parent_boxes, @@ -2152,9 +2191,9 @@ def extract_level_start_box_nrs(box_list, wait_for): from_sep_close_bigger_starts_raw = result["from_sep_close_bigger"].starts from_sep_close_bigger_lists_raw = result["from_sep_close_bigger"].lists - list_merger = _ListMerger(queue.context, tree.box_id_dtype) + list_merger = _ListMerger(actx, tree.box_id_dtype) result, evt = list_merger( - queue, + actx, # starts (from_sep_close_bigger_starts_raw,), # lists @@ -2183,43 +2222,35 @@ def extract_level_start_box_nrs(box_list, wait_for): # }}} - if self.well_sep_is_n_away == 1: - colleagues_starts = same_level_non_well_sep_boxes.starts - colleagues_lists = same_level_non_well_sep_boxes.lists - else: - 
colleagues_starts = None - colleagues_lists = None - evt, = wait_for - traversal_plog.done( "from_sep_smaller_crit: %s", self.from_sep_smaller_crit) - return FMMTraversalInfo( + info = FMMTraversalInfo( tree=tree, well_sep_is_n_away=self.well_sep_is_n_away, source_boxes=source_boxes, target_boxes=target_boxes, - level_start_source_box_nrs=level_start_source_box_nrs, - level_start_target_box_nrs=level_start_target_box_nrs, + level_start_source_box_nrs=actx.from_numpy( + level_start_source_box_nrs), + level_start_target_box_nrs=actx.from_numpy( + level_start_target_box_nrs), source_parent_boxes=source_parent_boxes, - level_start_source_parent_box_nrs=level_start_source_parent_box_nrs, + level_start_source_parent_box_nrs=actx.from_numpy( + level_start_source_parent_box_nrs), target_or_target_parent_boxes=target_or_target_parent_boxes, - level_start_target_or_target_parent_box_nrs=( + level_start_target_or_target_parent_box_nrs=actx.from_numpy( level_start_target_or_target_parent_box_nrs), same_level_non_well_sep_boxes_starts=( same_level_non_well_sep_boxes.starts), same_level_non_well_sep_boxes_lists=( same_level_non_well_sep_boxes.lists), - # Deprecated, but we'll keep these alive for the time being. 
- colleagues_starts=colleagues_starts, - colleagues_lists=colleagues_lists, neighbor_source_boxes_starts=neighbor_source_boxes.starts, neighbor_source_boxes_lists=neighbor_source_boxes.lists, @@ -2227,8 +2258,9 @@ def extract_level_start_box_nrs(box_list, wait_for): from_sep_siblings_starts=from_sep_siblings.starts, from_sep_siblings_lists=from_sep_siblings.lists, - from_sep_smaller_by_level=from_sep_smaller_by_level, - target_boxes_sep_smaller_by_source_level=( + from_sep_smaller_by_level=make_obj_array( + from_sep_smaller_by_level), + target_boxes_sep_smaller_by_source_level=make_obj_array( target_boxes_sep_smaller_by_source_level), from_sep_close_smaller_starts=from_sep_close_smaller_starts, @@ -2239,7 +2271,9 @@ def extract_level_start_box_nrs(box_list, wait_for): from_sep_close_bigger_starts=from_sep_close_bigger_starts, from_sep_close_bigger_lists=from_sep_close_bigger_lists, - ).with_queue(None), evt + ) + + return actx.freeze(info), evt # }}} From f2140cabba248d19ff13ba4cf6dabce0f9756ef6 Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Tue, 21 Jun 2022 21:13:31 +0300 Subject: [PATCH 08/28] port tree to arraycontext --- boxtree/tree.py | 324 ++++++++++++++++++------------------- test/test_tree_of_boxes.py | 5 +- 2 files changed, 163 insertions(+), 166 deletions(-) diff --git a/boxtree/tree.py b/boxtree/tree.py index ade9d965..857c04cc 100644 --- a/boxtree/tree.py +++ b/boxtree/tree.py @@ -81,16 +81,16 @@ """ import logging -from dataclasses import dataclass +from dataclasses import dataclass, field from functools import cached_property import numpy as np +from arraycontext import Array -import pyopencl as cl from cgen import Enum from pytools import memoize_method -from boxtree.tools import DeviceDataRecord +from boxtree.array_context import PyOpenCLArrayContext, dataclass_array_container logger = logging.getLogger(__name__) @@ -141,7 +141,8 @@ class box_flags_enum(Enum): # noqa # {{{ tree of boxes -@dataclass +@dataclass_array_container 
+@dataclass(frozen=True) class TreeOfBoxes: """A quad/octree tree of pure boxes, excluding their contents (e.g. particles). It is a lightweight tree handled with :mod:`numpy`, intended @@ -215,15 +216,15 @@ class TreeOfBoxes: .. automethod:: __init__ """ - root_extent: np.ndarray - box_centers: np.ndarray + root_extent: Array + box_centers: Array - box_parent_ids: np.ndarray - box_child_ids: np.ndarray - box_levels: np.ndarray + box_parent_ids: Array + box_child_ids: Array + box_levels: Array - box_flags: np.ndarray | None - level_start_box_nrs: np.ndarray | None + box_flags: Array | None + level_start_box_nrs: Array | None # FIXME: these should be properties and take values from box_parent_ids, etc box_id_dtype: np.dtype @@ -251,11 +252,7 @@ def aligned_nboxes(self): @property def nlevels(self): - # level starts from 0 - if isinstance(self.box_levels, cl.array.Array): - return int(max(self.box_levels).get()) + 1 - else: - return max(self.box_levels) + 1 + return max(self.box_levels) + 1 @property def leaf_boxes(self): @@ -288,7 +285,9 @@ def get_box_extent(self, ibox): # {{{ tree with particles -class Tree(DeviceDataRecord, TreeOfBoxes): +@dataclass_array_container +@dataclass(frozen=True) +class Tree(TreeOfBoxes): r"""A quad/octree consisting of particles sorted into a hierarchy of boxes. Optionally, particles may be designated 'sources' and 'targets'. They @@ -298,9 +297,6 @@ class Tree(DeviceDataRecord, TreeOfBoxes): Instances of this class are not constructed directly. They are returned by :meth:`TreeBuilder.__call__`. - Unless otherwise indicated, all bulk data in this data structure is stored - in a :class:`pyopencl.array.Array`. See also :meth:`get`. - Inherits from :class:`TreeOfBoxes`. .. rubric:: Flags @@ -379,13 +375,6 @@ class Tree(DeviceDataRecord, TreeOfBoxes): in each level, access the start of the next level. This array is built so that this works even for the last level. - .. 
attribute:: level_start_box_nrs_dev - - ``particle_id_t [nlevels+1]`` - - The same array as :attr:`level_start_box_nrs` - as a :class:`pyopencl.array.Array`. - .. ------------------------------------------------------------------------ .. rubric:: Per-particle arrays .. ------------------------------------------------------------------------ @@ -554,12 +543,43 @@ class Tree(DeviceDataRecord, TreeOfBoxes): .. attribute:: box_target_bounding_box_max ``coordt_t [dimensions, aligned_nboxes]`` - - .. rubric:: Methods - - .. automethod:: get """ + # flags + sources_are_targets: bool + + # data types + particle_id_dtype: np.dtype + + # per-particle arrays + sources: Array + source_radii: Array + targets: Array + target_radii: Array + + # FIXME: this needs to be init=True to overwrite the cached property in + # the base class. That fails because `x[:, 0] - c` tries to do arithmetic + # on a non-contiguous array and is not supported by pyopencl + bounding_box: tuple[Array, Array] = field(init=True) + + # tree / user order indices + user_source_ids: Array + sorted_target_ids: Array + + # box properties + box_source_starts: Array + box_source_counts_nonchild: Array + box_source_counts_cumul: Array + box_target_starts: Array + box_target_counts_nonchild: Array + box_target_counts_cumul: Array + + # particle-adaptive box extents + box_source_bounding_box_min: Array + box_source_bounding_box_max: Array + box_target_bounding_box_min: Array + box_target_bounding_box_max: Array + @property def dimensions(self): return len(self.sources) @@ -582,6 +602,8 @@ def ntargets(self): def nlevels(self): return len(self.level_start_box_nrs) - 1 + # {{{ dummy interface for TreePlotter + def plot(self, **kwargs): from boxtree.visualization import TreePlotter plotter = TreePlotter(self) @@ -595,9 +617,11 @@ def get_box_extent(self, ibox): extent_high = extent_low + box_size return extent_low, extent_high + # }}} + # {{{ debugging aids - # these assume numpy arrays (i.e. 
post-.get()), for now + # these assume numpy arrays for now def _reverse_index_lookup(self, ary, new_key_size): result = np.empty(new_key_size, ary.dtype) @@ -642,26 +666,13 @@ def find_box_nr_for_source(self, isource): # }}} - def to_device(self, queue, exclude_fields=frozenset()): - # level_start_box_nrs should remain in host memory - exclude_fields = set(exclude_fields) - exclude_fields.add("level_start_box_nrs") - - return super().to_device(queue, frozenset(exclude_fields)) - - def to_host_device_array(self, queue, exclude_fields=frozenset()): - # level_start_box_nrs should remain in host memory - exclude_fields = set(exclude_fields) - exclude_fields.add("level_start_box_nrs") - - return super().to_host_device_array( - queue, frozenset(exclude_fields)) - # }}} # {{{ tree with linked point sources +@dataclass_array_container +@dataclass(frozen=True) class TreeWithLinkedPointSources(Tree): """In this :class:`boxtree.Tree` subclass, the sources of the original tree are linked with extent are expanded into point sources which are linked to the @@ -724,20 +735,26 @@ class TreeWithLinkedPointSources(Tree): This constructor is not intended to be called by users directly. Call :func:`link_point_sources` instead. - - .. rubric:: Methods - - .. automethod:: get """ + npoint_sources: int + point_source_starts: Array + point_source_counts: Array + point_sources: Array + user_point_source_ids: Array + box_point_source_starts: Array + box_point_source_counts_nonchild: Array + box_point_source_counts_cumul: Array + -def link_point_sources(queue, tree, point_source_starts, point_sources, - debug=False): +def link_point_sources( + actx: PyOpenCLArrayContext, tree: Tree, + point_source_starts: Array, point_sources: Array, *, + debug: bool = False): r""" *Construction:* Requires that :attr:`boxtree.Tree.sources_have_extent` is *True* on *tree*. 
- :arg queue: a :class:`pyopencl.CommandQueue` instance :arg point_source_starts: ``point_source_starts[isrc]`` and ``point_source_starts[isrc+1]`` together indicate a ranges of point particle indices in *point_sources* which will be linked to the @@ -759,21 +776,21 @@ def link_point_sources(queue, tree, point_source_starts, point_sources, if not tree.sources_have_extent: raise ValueError("only allowed on trees whose sources have extent") - npoint_sources_dev = cl.array.empty(queue, (), tree.particle_id_dtype) + npoint_sources_dev = actx.np.zeros((), tree.particle_id_dtype) # {{{ compute tree_order_point_source_{starts, counts} # Scan over lengths of point source lists in tree order to determine # indices of point source starts for each source. - tree_order_point_source_starts = cl.array.empty( - queue, tree.nsources, tree.particle_id_dtype) - tree_order_point_source_counts = cl.array.empty( - queue, tree.nsources, tree.particle_id_dtype) + tree_order_point_source_starts = actx.np.zeros( + tree.nsources, tree.particle_id_dtype) + tree_order_point_source_counts = actx.np.zeros( + tree.nsources, tree.particle_id_dtype) from boxtree.tree_build_kernels import POINT_SOURCE_LINKING_SOURCE_SCAN_TPL knl = POINT_SOURCE_LINKING_SOURCE_SCAN_TPL.build( - queue.context, + actx.queue.context, type_aliases=( ("scan_t", tree.particle_id_dtype), ("index_t", tree.particle_id_dtype), @@ -785,39 +802,40 @@ def link_point_sources(queue, tree, point_source_starts, point_sources, knl(point_source_starts, tree.user_source_ids, tree_order_point_source_starts, tree_order_point_source_counts, - npoint_sources_dev, size=tree.nsources, queue=queue) + npoint_sources_dev, size=tree.nsources, queue=actx.queue) # }}} - npoint_sources = int(npoint_sources_dev.get()) + npoint_sources = int(actx.to_numpy(npoint_sources_dev)) # {{{ compute user_point_source_ids # A list of point source starts, indexed in tree order, # but giving point source indices in user order. 
- tree_order_index_user_point_source_starts = cl.array.take( - point_source_starts, tree.user_source_ids, - queue=queue) + tree_order_index_user_point_source_starts = ( + point_source_starts[tree.user_source_ids]) - user_point_source_ids = cl.array.empty( - queue, npoint_sources, tree.particle_id_dtype) + user_point_source_ids = actx.np.zeros(npoint_sources, tree.particle_id_dtype) user_point_source_ids.fill(1) - cl.array.multi_put([tree_order_index_user_point_source_starts], + + import pyopencl.array as cl_array + cl_array.multi_put( + [tree_order_index_user_point_source_starts], dest_indices=tree_order_point_source_starts, out=[user_point_source_ids]) if debug: - ups_host = user_point_source_ids.get() - assert (ups_host >= 0).all() - assert (ups_host < npoint_sources).all() + ups_host = actx.to_numpy(user_point_source_ids) + assert np.all(ups_host >= 0) + assert np.all(ups_host < npoint_sources) - source_boundaries = cl.array.zeros(queue, npoint_sources, np.int8) + source_boundaries = actx.zeros(npoint_sources, np.int8) # FIXME: Should be a scalar, in principle. 
- ones = cl.array.empty(queue, tree.nsources, np.int8) + ones = actx.np.zeros(tree.nsources, np.int8) ones.fill(1) - cl.array.multi_put( + cl_array.multi_put( [ones], dest_indices=tree_order_point_source_starts, out=[source_boundaries]) @@ -829,7 +847,7 @@ def link_point_sources(queue, tree, point_source_starts, point_sources, logger.debug("point source linking: point source id scan") knl = POINT_SOURCE_LINKING_USER_POINT_SOURCE_ID_SCAN_TPL.build( - queue.context, + actx.queue.context, type_aliases=( ("scan_t", tree.particle_id_dtype), ("index_t", tree.particle_id_dtype), @@ -837,19 +855,18 @@ def link_point_sources(queue, tree, point_source_starts, point_sources, ), ) knl(source_boundaries, user_point_source_ids, - size=npoint_sources, queue=queue) + size=npoint_sources, queue=actx.queue) if debug: - ups_host = user_point_source_ids.get() - assert (ups_host >= 0).all() - assert (ups_host < npoint_sources).all() + ups_host = actx.to_numpy(user_point_source_ids) + assert np.all(ups_host >= 0) + assert np.all(ups_host < npoint_sources) # }}} from pytools.obj_array import make_obj_array tree_order_point_sources = make_obj_array([ - cl.array.take(point_sources[i], user_point_source_ids, - queue=queue) + cl_array.take(point_sources[i], user_point_source_ids, queue=actx.queue) for i in range(tree.dimensions) ]) @@ -858,7 +875,7 @@ def link_point_sources(queue, tree, point_source_starts, point_sources, from boxtree.tree_build_kernels import POINT_SOURCE_LINKING_BOX_POINT_SOURCES knl = POINT_SOURCE_LINKING_BOX_POINT_SOURCES.build( - queue.context, + actx.queue.context, type_aliases=( ("particle_id_t", tree.particle_id_dtype), ("box_id_t", tree.box_id_dtype), @@ -867,12 +884,10 @@ def link_point_sources(queue, tree, point_source_starts, point_sources, logger.debug("point source linking: box point sources") - box_point_source_starts = cl.array.empty( - queue, tree.nboxes, tree.particle_id_dtype) - box_point_source_counts_nonchild = cl.array.empty( - queue, tree.nboxes, 
tree.particle_id_dtype) - box_point_source_counts_cumul = cl.array.empty( - queue, tree.nboxes, tree.particle_id_dtype) + box_point_source_starts = actx.np.zeros(tree.nboxes, tree.particle_id_dtype) + box_point_source_counts_cumul = actx.np.zeros(tree.nboxes, tree.particle_id_dtype) + box_point_source_counts_nonchild = actx.np.zeros( + tree.nboxes, tree.particle_id_dtype) knl( box_point_source_starts, box_point_source_counts_nonchild, @@ -883,20 +898,21 @@ def link_point_sources(queue, tree, point_source_starts, point_sources, tree_order_point_source_starts, tree_order_point_source_counts, - range=slice(tree.nboxes), queue=queue) + range=slice(tree.nboxes), queue=actx.queue) # }}} logger.info("point source linking: complete") + from dataclasses import fields tree_attrs = {} - for attr_name in tree.__class__.fields: + for f in fields(tree): try: # noqa: SIM105 - tree_attrs[attr_name] = getattr(tree, attr_name) + tree_attrs[f.name] = getattr(tree, f.name) except AttributeError: pass - return TreeWithLinkedPointSources( + tree_with_point_sources = TreeWithLinkedPointSources( npoint_sources=npoint_sources, point_source_starts=tree_order_point_source_starts, point_source_counts=tree_order_point_source_counts, @@ -906,7 +922,9 @@ def link_point_sources(queue, tree, point_source_starts, point_sources, box_point_source_counts_nonchild=box_point_source_counts_nonchild, box_point_source_counts_cumul=box_point_source_counts_cumul, - **tree_attrs).with_queue(None) + **tree_attrs) + + return actx.freeze(tree_with_point_sources) # }}} @@ -914,7 +932,9 @@ def link_point_sources(queue, tree, point_source_starts, point_sources, # {{{ particle list filter -class FilteredTargetListsInUserOrder(DeviceDataRecord): +@dataclass_array_container +@dataclass(frozen=True) +class FilteredTargetListsInUserOrder: """Use :meth:`ParticleListFilter.filter_target_lists_in_user_order` to create instances of this class. 
@@ -947,14 +967,16 @@ class FilteredTargetListsInUserOrder(DeviceDataRecord): child boxes). Use together with :attr:`target_starts`. Target numbers are stored in user order, as the class name suggests. - - .. rubric:: Methods - - .. automethod:: get """ + nfiltered_targets: int + target_starts: Array + target_lists: Array + -class FilteredTargetListsInTreeOrder(DeviceDataRecord): +@dataclass_array_container +@dataclass(frozen=True) +class FilteredTargetListsInTreeOrder: """Use :meth:`ParticleListFilter.filter_target_lists_in_tree_order` to create instances of this class. @@ -999,12 +1021,14 @@ class FilteredTargetListsInTreeOrder(DeviceDataRecord): Storing *to* these indices will reorder the targets from *filtered* tree target order into 'regular' :ref:`tree target order `. - - .. rubric:: Methods - - .. automethod:: get """ + nfiltered_targets: int + box_target_starts: Array + box_target_counts_nonchild: Array + targets: Array + unfiltered_from_filtered_target_indices: Array + class ParticleListFilter: """ @@ -1012,8 +1036,12 @@ class ParticleListFilter: .. 
automethod:: filter_target_lists_in_user_order """ - def __init__(self, context): - self.context = context + def __init__(self, array_context: PyOpenCLArrayContext): + self._setup_actx = array_context + + @property + def context(self): + return self._setup_actx.queue.context @memoize_method def get_filter_target_lists_in_user_order_kernel(self, particle_id_dtype, @@ -1055,7 +1083,7 @@ def get_filter_target_lists_in_user_order_kernel(self, particle_id_dtype, return builder - def filter_target_lists_in_user_order(self, queue, tree, flags): + def filter_target_lists_in_user_order(self, actx, tree, flags): """ :arg flags: an array of length :attr:`boxtree.Tree.ntargets` of :class:`numpy.int8` objects, which indicate by being zero that the @@ -1067,25 +1095,27 @@ def filter_target_lists_in_user_order(self, queue, tree, flags): user_order_flags = flags del flags - user_target_ids = cl.array.empty(queue, tree.ntargets, - tree.sorted_target_ids.dtype) - user_target_ids[tree.sorted_target_ids] = cl.array.arange( - queue, tree.ntargets, user_target_ids.dtype) + user_target_ids = actx.np.zeros(tree.ntargets, tree.sorted_target_ids.dtype) + user_target_ids[tree.sorted_target_ids] = actx.from_numpy( + np.arange(tree.ntargets, dtype=user_target_ids.dtype) + ) kernel = self.get_filter_target_lists_in_user_order_kernel( tree.particle_id_dtype, user_order_flags.dtype) - result, _evt = kernel(queue, tree.nboxes, + result, _evt = kernel(actx.queue, tree.nboxes, user_order_flags, user_target_ids, tree.box_target_starts, tree.box_target_counts_nonchild) - return FilteredTargetListsInUserOrder( + target_lists = FilteredTargetListsInUserOrder( nfiltered_targets=result["filt_tgt_list"].count, target_starts=result["filt_tgt_list"].starts, target_lists=result["filt_tgt_list"].lists, - ).with_queue(None) + ) + + return actx.freeze(target_lists) @memoize_method def get_filter_target_lists_in_tree_order_kernels(self, particle_id_dtype): @@ -1111,7 +1141,7 @@ def 
get_filter_target_lists_in_tree_order_kernels(self, particle_id_dtype): return scan_knl, index_knl - def filter_target_lists_in_tree_order(self, queue, tree, flags): + def filter_target_lists_in_tree_order(self, actx, tree, flags): """ :arg flags: an array of length :attr:`boxtree.Tree.ntargets` of :class:`numpy.int8` objects, which indicate by being zero that the @@ -1120,15 +1150,15 @@ def filter_target_lists_in_tree_order(self, queue, tree, flags): :returns: A :class:`FilteredTargetListsInTreeOrder` """ - tree_order_flags = cl.array.empty(queue, tree.ntargets, np.int8) + tree_order_flags = actx.np.zeros(tree.ntargets, np.int8) tree_order_flags[tree.sorted_target_ids] = flags - filtered_from_unfiltered_target_indices = cl.array.empty( - queue, tree.ntargets, tree.particle_id_dtype) - unfiltered_from_filtered_target_indices = cl.array.empty( - queue, tree.ntargets, tree.particle_id_dtype) + filtered_from_unfiltered_target_indices = actx.np.zeros( + tree.ntargets, tree.particle_id_dtype) + unfiltered_from_filtered_target_indices = actx.np.zeros( + tree.ntargets, tree.particle_id_dtype) - nfiltered_targets = cl.array.empty(queue, 1, tree.particle_id_dtype) + nfiltered_targets = actx.np.zeros(1, tree.particle_id_dtype) scan_knl, index_knl = self.get_filter_target_lists_in_tree_order_kernels( tree.particle_id_dtype) @@ -1137,23 +1167,21 @@ def filter_target_lists_in_tree_order(self, queue, tree, flags): filtered_from_unfiltered_target_indices, unfiltered_from_filtered_target_indices, nfiltered_targets, - queue=queue) - - nfiltered_targets = int(nfiltered_targets.get().item()) + queue=actx.queue) + nfiltered_targets = int(actx.to_numpy(nfiltered_targets).item()) unfiltered_from_filtered_target_indices = \ unfiltered_from_filtered_target_indices[:nfiltered_targets] from pytools.obj_array import make_obj_array filtered_targets = make_obj_array([ - targets_i.with_queue(queue)[unfiltered_from_filtered_target_indices] + 
actx.thaw(targets_i)[unfiltered_from_filtered_target_indices] for targets_i in tree.targets ]) - box_target_starts_filtered = \ - cl.array.empty_like(tree.box_target_starts) - box_target_counts_nonchild_filtered = \ - cl.array.empty_like(tree.box_target_counts_nonchild) + box_target_starts_filtered = actx.np.zeros_like(tree.box_target_starts) + box_target_counts_nonchild_filtered = ( + actx.np.zeros_like(tree.box_target_counts_nonchild)) index_knl( # input @@ -1167,51 +1195,19 @@ def filter_target_lists_in_tree_order(self, queue, tree, flags): box_target_starts_filtered, box_target_counts_nonchild_filtered, - queue=queue) + queue=actx.queue) - return FilteredTargetListsInTreeOrder( + target_lists = FilteredTargetListsInTreeOrder( nfiltered_targets=nfiltered_targets, box_target_starts=box_target_starts_filtered, box_target_counts_nonchild=box_target_counts_nonchild_filtered, unfiltered_from_filtered_target_indices=( unfiltered_from_filtered_target_indices), targets=filtered_targets, - ).with_queue(None) - -# }}} - - -# {{{ filter_target_lists_in_*_order - -def filter_target_lists_in_user_order(queue, tree, flags): - """ - Deprecated. See :meth:`ParticleListFilter.filter_target_lists_in_user_order`. - """ - - from warnings import warn - warn( - "filter_target_lists_in_user_order() is deprecated and will go " - "away in a future release. Use " - "ParticleListFilter.filter_target_lists_in_user_order() instead.", - DeprecationWarning, stacklevel=2) - - return (ParticleListFilter(queue.context) - .filter_target_lists_in_user_order(queue, tree, flags)) + ) + return actx.freeze(target_lists) -def filter_target_lists_in_tree_order(queue, tree, flags): - """ - Deprecated. See :meth:`ParticleListFilter.filter_target_lists_in_tree_order`. - """ - from warnings import warn - warn( - "filter_target_lists_in_tree_order() is deprecated and will go " - "away in a future release. 
Use " - "ParticleListFilter.filter_target_lists_in_tree_order() instead.", - DeprecationWarning, stacklevel=2) - - return (ParticleListFilter(queue.context) - .filter_target_lists_in_tree_order(queue, tree, flags)) # }}} # vim: filetype=pyopencl:fdm=marker diff --git a/test/test_tree_of_boxes.py b/test/test_tree_of_boxes.py index 33bdefd5..b7e798a3 100644 --- a/test/test_tree_of_boxes.py +++ b/test/test_tree_of_boxes.py @@ -228,11 +228,12 @@ def test_traversal_from_tob(actx_factory): box_child_ids=actx.from_numpy(tob.box_child_ids), box_levels=actx.from_numpy(tob.box_levels), box_flags=actx.from_numpy(tob.box_flags), + level_start_box_nrs=actx.from_numpy(tob.level_start_box_nrs), ) from boxtree.traversal import FMMTraversalBuilder - tg = FMMTraversalBuilder(actx.context) - _trav, _ = tg(actx.queue, tob) + tg = FMMTraversalBuilder(actx) + _trav, _ = tg(actx, tob) # }}} From e6b8c812a77f5822d08659ae0448b3d280033e71 Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Wed, 22 Jun 2022 14:28:51 +0300 Subject: [PATCH 09/28] port tools to arraycontext --- boxtree/tools.py | 112 +++++++++++++++++++++++++---------------------- 1 file changed, 60 insertions(+), 52 deletions(-) diff --git a/boxtree/tools.py b/boxtree/tools.py index 0af213d6..ab4240c6 100644 --- a/boxtree/tools.py +++ b/boxtree/tools.py @@ -33,6 +33,8 @@ from pytools import Record, memoize_method from pytools.obj_array import make_obj_array +from boxtree.array_context import PyOpenCLArrayContext + # Use offsets in VectorArg by default. 
VectorArg = partial(_VectorArg, with_offset=True) @@ -46,26 +48,25 @@ def padded_bin(i, nbits): # NOTE: Order of positional args should match GappyCopyAndMapKernel.__call__() -def realloc_array(queue, allocator, new_shape, ary, zero_fill=False, wait_for=None): +def realloc_array(actx, new_shape, ary, zero_fill=False, wait_for=None): if wait_for is None: wait_for = [] - if zero_fill: # noqa: SIM108 - array_maker = cl.array.zeros - else: - array_maker = cl.array.empty - - new_ary = array_maker(queue, shape=new_shape, dtype=ary.dtype, - allocator=allocator) + if not zero_fill: + from warnings import warn + warn("Setting 'zero_fill=False' has no effect and will become an error " + "in 2025. Always use 'zero_fill=True'", + DeprecationWarning, stacklevel=2) - evt = cl.enqueue_copy(queue, new_ary.data, ary.data, byte_count=ary.nbytes, - wait_for=wait_for + new_ary.events) + new_ary = actx.np.zeros(shape=new_shape, dtype=ary.dtype) + evt = cl.enqueue_copy(actx.queue, new_ary.data, ary.data, + byte_count=ary.nbytes, + wait_for=wait_for + new_ary.events) return new_ary, evt -def reverse_index_array(indices, target_size=None, result_fill_value=None, - queue=None): +def reverse_index_array(actx, indices, target_size=None, result_fill_value=None): """For an array of *indices*, return a new array *result* that satisfies ``result[indices] == arange(len(indices))`` @@ -75,38 +76,34 @@ def reverse_index_array(indices, target_size=None, result_fill_value=None, prior to storing reversed indices. 
""" - queue = queue or indices.queue - if target_size is None: target_size = len(indices) - result = cl.array.empty(queue, target_size, indices.dtype) + result = actx.np.zeros(target_size, indices.dtype) if result_fill_value is not None: result.fill(result_fill_value) cl.array.multi_put( - [cl.array.arange(queue, len(indices), dtype=indices.dtype, - allocator=indices.allocator)], + [actx.from_numpy(np.arange(len(indices), dtype=indices.dtype))], indices, out=[result], - queue=queue) + queue=actx.queue) return result # {{{ particle distribution generators -def make_normal_particle_array(queue, nparticles, dims, dtype, seed=15): - from pyopencl.clrandom import PhiloxGenerator - rng = PhiloxGenerator(queue.context, seed=seed) - +def make_normal_particle_array(actx, nparticles, dims, dtype, seed=15): + rng = np.random.default_rng(seed) return make_obj_array([ - rng.normal(queue, nparticles, dtype=dtype) - for i in range(dims)]) + actx.from_numpy(rng.standard_normal(nparticles, dtype=dtype)) + for i in range(dims) + ]) -def make_surface_particle_array(queue, nparticles, dims, dtype, seed=15): +def make_surface_particle_array(actx, nparticles, dims, dtype, seed=15): import loopy as lp if dims == 2: @@ -132,7 +129,7 @@ def get_2d_knl(dtype): return knl.executor(queue.context) - _evt, result = get_2d_knl(dtype)(queue, n=nparticles) + _evt, result = get_2d_knl(dtype)(actx.queue, n=nparticles) result = [x.ravel() for x in result] @@ -166,7 +163,7 @@ def get_3d_knl(dtype): return knl.executor(queue.context) - _evt, result = get_3d_knl(dtype)(queue, n=n) + _evt, result = get_3d_knl(dtype)(actx.queue, n=n) result = [x.ravel() for x in result] @@ -175,7 +172,7 @@ def get_3d_knl(dtype): raise NotImplementedError -def make_uniform_particle_array(queue, nparticles, dims, dtype, seed=15): +def make_uniform_particle_array(actx, nparticles, dims, dtype, seed=15): import loopy as lp if dims == 2: @@ -209,7 +206,7 @@ def get_2d_knl(dtype): return knl.executor(queue.context) - _evt, 
result = get_2d_knl(dtype)(queue, n=n) + _evt, result = get_2d_knl(dtype)(actx.queue, n=n) result = [x.ravel() for x in result] @@ -257,7 +254,7 @@ def get_3d_knl(dtype): return knl.executor(queue.context) - _evt, result = get_3d_knl(dtype)(queue, n=n) + _evt, result = get_3d_knl(dtype)(actx.queue, n=n) result = [x.ravel() for x in result] @@ -266,14 +263,14 @@ def get_3d_knl(dtype): raise NotImplementedError -def make_rotated_uniform_particle_array(queue, nparticles, dims, dtype, seed=15): +def make_rotated_uniform_particle_array(actx, nparticles, dims, dtype, seed=15): raise NotImplementedError # }}} -def particle_array_to_host(parray): - return np.array([x.get() for x in parray], order="F").T +def particle_array_to_host(actx, particles): + return np.array([actx.to_numpy(x) for x in particles], order="F").T # {{{ host/device data storage @@ -458,8 +455,12 @@ def get_type_moniker(dtype): class GappyCopyAndMapKernel: - def __init__(self, context): - self.context = context + def __init__(self, array_context: PyOpenCLArrayContext): + self._setup_actx = array_context + + @property + def context(self): + return self._setup_actx.queue.context @memoize_method def _get_kernel(self, dtype, src_index_dtype, dst_index_dtype, @@ -497,7 +498,7 @@ def _get_kernel(self, dtype, src_index_dtype, dst_index_dtype, name="gappy_copy_and_map") # NOTE: Order of positional args should match realloc_array() - def __call__(self, queue, allocator, new_shape, ary, src_indices=None, + def __call__(self, actx, new_shape, ary, src_indices=None, dst_indices=None, map_values=None, zero_fill=False, wait_for=None, range=None, debug=False): """Compresses box info arrays after empty leaf pruning and, optionally, @@ -519,19 +520,19 @@ def __call__(self, queue, allocator, new_shape, ary, src_indices=None, elif have_src_indices: range = slice(src_indices.shape[0]) if debug: - assert int(cl.array.max(src_indices).get()) < len(ary) + assert int(actx.to_numpy(actx.np.amax(src_indices))) < len(ary) elif 
have_dst_indices: range = slice(dst_indices.shape[0]) if debug: - assert int(cl.array.max(dst_indices).get()) < new_shape - - if zero_fill: # noqa: SIM108 - array_maker = cl.array.zeros - else: - array_maker = cl.array.empty + assert int(actx.to_numpy(actx.np.amax(dst_indices))) < new_shape - result = array_maker(queue, new_shape, ary.dtype, allocator=allocator) + if not zero_fill: + from warnings import warn + warn("Setting 'zero_fill=False' has no effect and will become an error " + "in 2025. Always use 'zero_fill=True'", + DeprecationWarning, stacklevel=2) + result = actx.np.zeros(new_shape, ary.dtype) kernel = self._get_kernel(ary.dtype, src_indices.dtype if have_src_indices else None, dst_indices.dtype if have_dst_indices else None, @@ -544,7 +545,7 @@ def __call__(self, queue, allocator, new_shape, ary, src_indices=None, args += (dst_indices,) if have_dst_indices else () args += (map_values,) if have_map_values else () - evt = kernel(*args, queue=queue, range=range, wait_for=wait_for) + evt = kernel(*args, queue=actx.queue, range=range, wait_for=wait_for) return result, evt @@ -569,9 +570,12 @@ def __call__(self, queue, allocator, new_shape, ary, src_indices=None, class MapValuesKernel: + def __init__(self, array_context: PyOpenCLArrayContext): + self._setup_actx = array_context - def __init__(self, context): - self.context = context + @property + def context(self): + return self._setup_actx.queue.context @memoize_method def _get_kernel(self, dst_dtype, src_dtype): @@ -685,8 +689,12 @@ class MaskCompressorKernel: """ .. 
automethod:: __call__ """ - def __init__(self, context): - self.context = context + def __init__(self, array_context: PyOpenCLArrayContext): + self._setup_actx = array_context + + @property + def context(self): + return self._setup_actx.context @memoize_method def get_list_compressor_kernel(self, mask_dtype, list_dtype): @@ -717,7 +725,7 @@ def get_matrix_compressor_kernel(self, mask_dtype, list_dtype): ], name_prefix="compress_matrix") - def __call__(self, queue, mask, list_dtype=None): + def __call__(self, actx, mask, list_dtype=None): """Convert a mask to a list in :ref:`csr` format. :arg mask: Either a 1D or 2D array. @@ -739,7 +747,7 @@ def __call__(self, queue, mask, list_dtype=None): if len(mask.shape) == 1: knl = self.get_list_compressor_kernel(mask.dtype, list_dtype) - result, evt = knl(queue, mask.shape[0], mask.data) + result, evt = knl(actx.queue, mask.shape[0], mask.data) return (result["output"].lists, evt) elif len(mask.shape) == 2: # FIXME: This is efficient for small column sizes but may not be @@ -747,7 +755,7 @@ def __call__(self, queue, mask, list_dtype=None): knl = self.get_matrix_compressor_kernel(mask.dtype, list_dtype) size = mask.dtype.itemsize assert size > 0 - result, evt = knl(queue, mask.shape[0], mask.shape[1], + result, evt = knl(actx.queue, mask.shape[0], mask.shape[1], mask.strides[0] // size, mask.strides[1] // size, mask.data) return (result["output"].starts, result["output"].lists, evt) From c0d02d403c796506c3d9238cecd5075431c7728c Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Wed, 22 Jun 2022 14:29:05 +0300 Subject: [PATCH 10/28] port tree_build to arraycontext --- boxtree/tree_build.py | 392 ++++++++++++++++++++++-------------------- 1 file changed, 207 insertions(+), 185 deletions(-) diff --git a/boxtree/tree_build.py b/boxtree/tree_build.py index f16c1221..d91f1e89 100644 --- a/boxtree/tree_build.py +++ b/boxtree/tree_build.py @@ -42,17 +42,15 @@ THE SOFTWARE. 
""" - import logging from functools import partial from itertools import pairwise import numpy as np -import pyopencl as cl -import pyopencl.array from pytools import DebugProcessLogger, ProcessLogger, memoize_method +from boxtree.array_context import PyOpenCLArrayContext from boxtree.tree import Tree @@ -71,26 +69,26 @@ class TreeBuilder: .. automethod:: __call__ """ - def __init__(self, context): - """ - :arg context: A :class:`pyopencl.Context`. - """ + morton_nr_dtype = np.dtype(np.int8) + box_level_dtype = np.dtype(np.uint8) + ROOT_EXTENT_STRETCH_FACTOR = 1e-4 - self.context = context + def __init__(self, array_context: PyOpenCLArrayContext) -> None: + self._setup_actx = array_context from boxtree.bounding_box import BoundingBoxFinder - self.bbox_finder = BoundingBoxFinder(self.context) + self.bbox_finder = BoundingBoxFinder(array_context) # This is used to map box IDs and compress box lists in empty leaf # pruning. from boxtree.tools import GappyCopyAndMapKernel, MapValuesKernel - self.gappy_copy_and_map = GappyCopyAndMapKernel(self.context) - self.map_values_kernel = MapValuesKernel(self.context) + self.gappy_copy_and_map = GappyCopyAndMapKernel(array_context) + self.map_values_kernel = MapValuesKernel(array_context) - morton_nr_dtype = np.dtype(np.int8) - box_level_dtype = np.dtype(np.uint8) - ROOT_EXTENT_STRETCH_FACTOR = 1e-4 + @property + def context(self): + return self._setup_actx.queue.context @memoize_method def get_kernel_info(self, dimensions, coord_dtype, @@ -107,7 +105,7 @@ def get_kernel_info(self, dimensions, coord_dtype, # {{{ run control - def __call__(self, queue, particles, kind="adaptive", + def __call__(self, actx: PyOpenCLArrayContext, particles, kind="adaptive", max_particles_in_box=None, allocator=None, debug=False, targets=None, source_radii=None, target_radii=None, stick_out_factor=None, refine_weights=None, @@ -115,7 +113,6 @@ def __call__(self, queue, particles, kind="adaptive", extent_norm=None, bbox=None, **kwargs): """ - :arg 
queue: a :class:`pyopencl.CommandQueue` instance :arg particles: an object array of (XYZ) point coordinate arrays. :arg kind: One of the following strings: @@ -129,15 +126,14 @@ def __call__(self, queue, particles, kind="adaptive", :arg targets: an object array of (XYZ) point coordinate arrays or ``None``. If ``None``, *particles* act as targets, too. Must have the same (inner) dtype as *particles*. - :arg source_radii: If not *None*, a :class:`pyopencl.array.Array` of the - same dtype as *particles*. + :arg source_radii: If not *None*, an array of the same dtype as *particles*. If this is given, *targets* must also be given, i.e. sources and targets must be separate. See :ref:`extent`. :arg target_radii: Like *source_radii*, but for targets. :arg stick_out_factor: See :attr:`Tree.stick_out_factor` and :ref:`extent`. - :arg refine_weights: If not *None*, a :class:`pyopencl.array.Array` of the + :arg refine_weights: If not *None*, an array of the type :class:`numpy.int32`. A box will be split if it has a cumulative refine_weight greater than *max_leaf_refine_weight*. If this is given, *max_leaf_refine_weight* must also be given and *max_particles_in_box* @@ -170,6 +166,12 @@ def __call__(self, queue, particles, kind="adaptive", management. """ + if allocator is not None: + from warnings import warn + warn("Passing in 'allocator' is deprecated.
The allocator of the " + "array context 'actx' is used throughout.", + DeprecationWarning, stacklevel=2) + # {{{ input processing if kind not in ["adaptive", "adaptive-level-restricted", "non-adaptive"]: @@ -241,19 +243,21 @@ def __call__(self, queue, particles, kind="adaptive", # }}} - empty = partial(cl.array.empty, queue, allocator=allocator) - def zeros(shape, dtype): - result = cl.array.zeros(queue, shape, dtype, allocator=allocator) + result = actx.zeros(shape, dtype) + if result.events: event, = result.events else: from numbers import Number if isinstance(shape, Number): shape = (shape,) + from pytools import product assert product(shape) == 0 - event = cl.enqueue_marker(queue) + + from pyopencl import enqueue_marker + event = enqueue_marker(actx.queue) return result, event @@ -277,7 +281,7 @@ def zeros(shape, dtype): else: from pytools.obj_array import make_obj_array srcntgts = make_obj_array([ - p.with_queue(queue).copy() for p in particles + actx.np.copy(actx.thaw(p)) for p in particles ]) assert source_radii is None @@ -301,7 +305,7 @@ def zeros(shape, dtype): def combine_srcntgt_arrays(ary1, ary2=None): dtype = ary1.dtype if ary2 is None else ary2.dtype - result = empty(nsrcntgts, dtype) + result = actx.np.zeros(nsrcntgts, dtype) if (ary1 is None) or (ary2 is None): result.fill(0) @@ -329,8 +333,9 @@ def combine_srcntgt_arrays(ary1, ary2=None): del particles - user_srcntgt_ids = cl.array.arange(queue, nsrcntgts, dtype=particle_id_dtype, - allocator=allocator) + user_srcntgt_ids = actx.from_numpy( + np.arange(nsrcntgts, dtype=particle_id_dtype) + ) evt, = user_srcntgt_ids.events wait_for.append(evt) @@ -353,28 +358,31 @@ def combine_srcntgt_arrays(ary1, ary2=None): raise ValueError("must specify either max_particles_in_box or " "refine_weights/max_leaf_refine_weight") elif specified_max_particles_in_box: - refine_weights = ( - cl.array.empty( - queue, nsrcntgts, refine_weight_dtype, allocator=allocator) - .fill(1)) - event, = refine_weights.events - 
prep_events.append(event) + refine_weights = actx.np.zeros(nsrcntgts, refine_weight_dtype) + refine_weights.fill(1) + + prep_events.extend(refine_weights.events) max_leaf_refine_weight = max_particles_in_box elif specified_refine_weights: # noqa: SIM102 if refine_weights.dtype != refine_weight_dtype: raise TypeError( f"refine_weights must have dtype '{refine_weight_dtype}'") - if max_leaf_refine_weight < cl.array.max(refine_weights).get(): + if max_leaf_refine_weight <= 0: + raise ValueError("max_leaf_refine_weight must be positive") + + max_refine_weights = actx.to_numpy(actx.np.amax(refine_weights)) + if max_leaf_refine_weight < max_refine_weights: raise ValueError( "entries of refine_weights cannot exceed max_leaf_refine_weight") - if cl.array.min(refine_weights).get() < 0: + + min_refine_weights = actx.to_numpy(actx.np.amin(refine_weights)) + if min_refine_weights < 0: raise ValueError("all entries of refine_weights must be nonnegative") - if max_leaf_refine_weight <= 0: - raise ValueError("max_leaf_refine_weight must be positive") - total_refine_weight = cl.array.sum( - refine_weights, dtype=np.dtype(np.int64)).get() + total_refine_weight = actx.to_numpy( + actx.np.sum(refine_weights, dtype=np.dtype(np.int64)) + ) del max_particles_in_box del specified_max_particles_in_box @@ -384,10 +392,12 @@ def combine_srcntgt_arrays(ary1, ary2=None): # {{{ find and process bounding box - if bbox is None: - bbox, _ = self.bbox_finder(srcntgts, srcntgt_radii, wait_for=wait_for) - bbox = bbox.get() + bbox_auto, _ = self.bbox_finder( + actx, srcntgts, srcntgt_radii, wait_for=wait_for) + bbox_auto = actx.to_numpy(bbox_auto) + if bbox is None: + bbox = bbox_auto root_extent = max( bbox["max_"+ax] - bbox["min_"+ax] for ax in axis_names) * (1+TreeBuilder.ROOT_EXTENT_STRETCH_FACTOR) @@ -403,10 +413,6 @@ def combine_srcntgt_arrays(ary1, ary2=None): bbox["max_"+ax] = bbox_max[i] else: # Validate that bbox is a superset of particle-derived bbox - bbox_auto, _ = self.bbox_finder( - 
srcntgts, srcntgt_radii, wait_for=wait_for) - bbox_auto = bbox_auto.get() - # Convert unstructured numpy array to bbox_type if isinstance(bbox, np.ndarray): if len(bbox) == dimensions: @@ -447,11 +453,12 @@ def combine_srcntgt_arrays(ary1, ary2=None): # box-local morton bin counts for each particle at the current level # only valid from scan -> split'n'sort - morton_bin_counts = empty(nsrcntgts, dtype=knl_info.morton_bin_count_dtype) + morton_bin_counts = actx.np.zeros( + nsrcntgts, dtype=knl_info.morton_bin_count_dtype) # (local) morton nrs for each particle at the current level # only valid from scan -> split'n'sort - morton_nrs = empty(nsrcntgts, dtype=self.morton_nr_dtype) + morton_nrs = actx.np.zeros(nsrcntgts, dtype=self.morton_nr_dtype) # 0/1 segment flags # invariant to sorting once set @@ -528,8 +535,7 @@ def combine_srcntgt_arrays(ary1, ary2=None): prep_events.append(evt) # Initialize box 0 to contain all particles - box_srcntgt_counts_cumul[0].fill( - nsrcntgts, queue=queue, wait_for=[evt]) + box_srcntgt_counts_cumul[0].fill(nsrcntgts, queue=actx.queue, wait_for=[evt]) # box -> whether the box has a child. FIXME: use smaller integer type box_has_children, evt = zeros(nboxes_guess, dtype=np.dtype(np.int32)) @@ -543,8 +549,10 @@ def combine_srcntgt_arrays(ary1, ary2=None): prep_events.append(evt) # set parent of root box to itself - evt = cl.enqueue_copy( - queue, box_parent_ids.data, np.zeros((), dtype=box_parent_ids.dtype)) + from pyopencl import enqueue_copy + evt = enqueue_copy( + actx.queue, box_parent_ids.data, + np.zeros((), dtype=box_parent_ids.dtype)) prep_events.append(evt) # 2*(num bits in the significand) @@ -562,9 +570,9 @@ def combine_srcntgt_arrays(ary1, ary2=None): # }}} - def fin_debug(s): + def debug_with_finish(s): if debug: - queue.finish() + actx.queue.finish() logger.debug(s) @@ -625,6 +633,7 @@ def fin_debug(s): # regarding this). This flag is set to True when that happens. 
final_level_restrict_iteration = False + from pyopencl import wait_for_events while level: if debug: # More invariants: @@ -652,7 +661,7 @@ def fin_debug(s): + ((srcntgt_radii,) if srcntgts_have_extent else ()) ) - fin_debug("morton count scan") + debug_with_finish("morton count scan") morton_count_args = common_args if srcntgts_have_extent: @@ -660,11 +669,11 @@ def fin_debug(s): # writes: box_morton_bin_counts evt = knl_info.morton_count_scan( - *morton_count_args, queue=queue, size=nsrcntgts, + *morton_count_args, queue=actx.queue, size=nsrcntgts, wait_for=wait_for) wait_for = [evt] - fin_debug("split box id scan") + debug_with_finish("split box id scan") # writes: box_has_children, split_box_ids evt = knl_info.split_box_id_scan( @@ -684,7 +693,7 @@ def fin_debug(s): split_box_ids, have_oversize_split_box, - queue=queue, + queue=actx.queue, size=level_start_box_nrs[level], wait_for=wait_for) wait_for = [evt] @@ -698,7 +707,7 @@ def fin_debug(s): last_box_on_prev_level = level_start_box_id - 1 new_level_used_box_counts.append( # FIXME: Get this all at once. - int(split_box_ids[last_box_on_prev_level].get()) + int(actx.to_numpy(split_box_ids[last_box_on_prev_level])) - level_start_box_id) # New leaf count = @@ -743,7 +752,7 @@ def fin_debug(s): # have_oversize_split_box = 0), then we do not need to allocate any # extra space, since no new leaves can be created at the bottom # level. - if knl_info.level_restrict and have_oversize_split_box.get(): + if knl_info.level_restrict and actx.to_numpy(have_oversize_split_box): # Currently undocumented. lr_lookbehind_levels = kwargs.get("lr_lookbehind", 1) minimal_new_level_length += sum( @@ -792,18 +801,17 @@ def fin_debug(s): old_box_count = level_start_box_nrs[-1] # Where should I put this box? 
- dst_box_id = cl.array.empty(queue, - shape=old_box_count, dtype=box_id_dtype) + dst_box_id = actx.np.zeros(shape=old_box_count, dtype=box_id_dtype) for level_start, new_level_start, level_len in zip( level_start_box_nrs[:-1], new_level_start_box_nrs[:-1], curr_upper_level_lengths, strict=True): - dst_box_id[level_start:level_start + level_len] = \ - cl.array.arange(queue, - new_level_start, - new_level_start + level_len, - dtype=box_id_dtype) + dst_box_id[level_start:level_start+level_len] = actx.from_numpy( + np.arange(new_level_start, + new_level_start + level_len, + dtype=box_id_dtype) + ) wait_for.extend(dst_box_id.events) @@ -843,28 +851,27 @@ def fin_debug(s): # {{{ reallocate and/or renumber boxes if necessary if level_start_box_nrs_updated or nboxes_new > nboxes_guess: - fin_debug("starting nboxes_guess increase") + debug_with_finish("starting nboxes_guess increase") while nboxes_guess < nboxes_new: nboxes_guess *= 2 def my_realloc_nocopy(ary, shape=nboxes_guess): - return cl.array.empty(queue, allocator=allocator, - shape=shape, dtype=ary.dtype) + return actx.zeros(shape=shape, dtype=ary.dtype) def my_realloc_zeros_nocopy(ary, shape=nboxes_guess): - result = cl.array.zeros(queue, allocator=allocator, - shape=shape, dtype=ary.dtype) + result = actx.zeros(shape=shape, dtype=ary.dtype) return result, result.events[0] - my_realloc = partial(realloc_array, - queue, allocator, nboxes_guess, wait_for=wait_for) - my_realloc_zeros = partial(realloc_array, - queue, allocator, nboxes_guess, zero_fill=True, - wait_for=wait_for) - my_realloc_zeros_and_renumber = partial(realloc_and_renumber_array, - queue, allocator, nboxes_guess, zero_fill=True, - wait_for=wait_for) + my_realloc = partial( + realloc_array, + actx, nboxes_guess, wait_for=wait_for) + my_realloc_zeros = partial( + realloc_array, + actx, nboxes_guess, zero_fill=True, wait_for=wait_for) + my_realloc_zeros_and_renumber = partial( + realloc_and_renumber_array, + actx, nboxes_guess, zero_fill=True, 
wait_for=wait_for) resize_events = [] @@ -875,8 +882,7 @@ def my_realloc_zeros_nocopy(ary, shape=nboxes_guess): # only the box morton bin counts of boxes on the level # currently being processed are written-but we need to # retain the box morton bin counts from the higher levels. - box_morton_bin_counts, evt = my_realloc_zeros( - box_morton_bin_counts) + box_morton_bin_counts, evt = my_realloc_zeros(box_morton_bin_counts) resize_events.append(evt) # force_split_box is unused unless level restriction is enabled. @@ -911,7 +917,7 @@ def my_realloc_zeros_nocopy(ary, shape=nboxes_guess): resize_events.append(evt) else: box_levels, evt = my_realloc_zeros_nocopy(box_levels) - cl.wait_for_events([evt]) + wait_for_events([evt]) for box_level, (level_start, level_end) in enumerate( pairwise(level_start_box_nrs)): box_levels[level_start:level_end].fill(box_level) @@ -977,9 +983,11 @@ def my_realloc_zeros_nocopy(ary, shape=nboxes_guess): if level_nboxes == 0: assert leaf_count == 0 continue - nleaves_actual = level_nboxes - int( - cl.array.sum(box_has_children[ - level_start:level_start + level_nboxes]).get()) + nleaves_actual = level_nboxes - int(actx.to_numpy( + actx.np.sum( + box_has_children[level_start:level_start + level_nboxes] + ) + )) assert leaf_count == nleaves_actual # Can't del in Py2.7 - see note below @@ -1006,7 +1014,7 @@ def my_realloc_zeros_nocopy(ary, shape=nboxes_guess): wait_for = [evt] - fin_debug("box splitter") + debug_with_finish("box splitter") # Mark the levels of boxes added for padding (these were not updated # by the box splitter kernel). 
@@ -1017,20 +1025,20 @@ def my_realloc_zeros_nocopy(ary, shape=nboxes_guess): if debug: box_levels.finish() - level_bl_chunk = box_levels.get()[ + level_bl_chunk = actx.to_numpy(box_levels)[ level_start_box_nrs[-2]:level_start_box_nrs[-1]] - assert (level_bl_chunk == level).all() + assert np.all(level_bl_chunk == level) del level_bl_chunk if debug: - assert (box_srcntgt_starts.get() < nsrcntgts).all() + assert np.all(actx.to_numpy(box_srcntgt_starts) < nsrcntgts) # }}} # {{{ renumber particles within split boxes - new_user_srcntgt_ids = cl.array.empty_like(user_srcntgt_ids) - new_srcntgt_box_ids = cl.array.empty_like(srcntgt_box_ids) + new_user_srcntgt_ids = actx.np.zeros_like(user_srcntgt_ids) + new_srcntgt_box_ids = actx.np.zeros_like(srcntgt_box_ids) particle_renumberer_args = ( *common_args, @@ -1044,7 +1052,7 @@ def my_realloc_zeros_nocopy(ary, shape=nboxes_guess): wait_for = [evt] - fin_debug("particle renumbering") + debug_with_finish("particle renumbering") user_srcntgt_ids = new_user_srcntgt_ids del new_user_srcntgt_ids @@ -1066,7 +1074,7 @@ def my_realloc_zeros_nocopy(ary, shape=nboxes_guess): # reallocation code. In order to fix this issue, the box # numbering and reallocation code needs to be accessible after # the final level restriction is done. 
- assert int(have_oversize_split_box.get()) == 0 + assert int(actx.to_numpy(have_oversize_split_box)) == 0 assert level_used_box_counts[-1] == 0 del level_used_box_counts[-1] del level_start_box_nrs[-1] @@ -1123,10 +1131,11 @@ def my_realloc_zeros_nocopy(ary, shape=nboxes_guess): if debug: force_split_box.finish() - boxes_split.append(int(cl.array.sum( - force_split_box[upper_level_slice]).get())) + boxes_split.append(int(actx.to_numpy( + actx.np.sum(force_split_box[upper_level_slice]) + ))) - if int(have_upper_level_split_box.get()) == 0: + if int(actx.to_numpy(have_upper_level_split_box)) == 0: break did_upper_level_split = True @@ -1141,7 +1150,8 @@ def my_realloc_zeros_nocopy(ary, shape=nboxes_guess): logger.debug("level %d: %d boxes split", level_, nboxes_split) del boxes_split - if int(have_oversize_split_box.get()) == 0 and did_upper_level_split: + if (int(actx.to_numpy(have_oversize_split_box)) == 0 + and did_upper_level_split): # We are in the situation where there are boxes left to # split on upper levels, and the level loop is done creating # lower levels. 
@@ -1154,7 +1164,7 @@ def my_realloc_zeros_nocopy(ary, shape=nboxes_guess): # }}} - if not int(have_oversize_split_box.get()): + if not int(actx.to_numpy(have_oversize_split_box)): logger.debug("no boxes left to split") break @@ -1164,9 +1174,11 @@ def my_realloc_zeros_nocopy(ary, shape=nboxes_guess): # {{{ check that nonchild part of box_morton_bin_counts is consistent if debug and 0: - h_box_morton_bin_counts = box_morton_bin_counts.get() - h_box_srcntgt_counts_cumul = box_srcntgt_counts_cumul.get() - h_box_child_ids = tuple(bci.get() for bci in box_child_ids) + h_box_morton_bin_counts = actx.to_numpy(box_morton_bin_counts) + h_box_srcntgt_counts_cumul = actx.to_numpy(box_srcntgt_counts_cumul) + h_box_child_ids = tuple( + actx.to_numpy(bci) for bci in box_child_ids + ) has_mismatch = False for ibox in range(level_start_box_nrs[-1]): @@ -1213,8 +1225,8 @@ def my_realloc_zeros_nocopy(ary, shape=nboxes_guess): # {{{ extract number of non-child srcntgts from box morton counts if srcntgts_have_extent: - box_srcntgt_counts_nonchild = empty(nboxes, particle_id_dtype) - fin_debug("extract non-child srcntgt count") + box_srcntgt_counts_nonchild = actx.np.zeros(nboxes, particle_id_dtype) + debug_with_finish("extract non-child srcntgt count") assert len(level_start_box_nrs) >= 2 highest_possibly_split_box_nr = level_start_box_nrs[-2] @@ -1234,11 +1246,13 @@ def my_realloc_zeros_nocopy(ary, shape=nboxes_guess): del highest_possibly_split_box_nr if debug: - h_box_srcntgt_counts_nonchild = box_srcntgt_counts_nonchild.get() - h_box_srcntgt_counts_cumul = box_srcntgt_counts_cumul.get() + h_box_srcntgt_counts_nonchild = ( + actx.to_numpy(box_srcntgt_counts_nonchild)) + h_box_srcntgt_counts_cumul = actx.to_numpy(box_srcntgt_counts_cumul) - assert (h_box_srcntgt_counts_nonchild - <= h_box_srcntgt_counts_cumul[:nboxes]).all() + assert np.all( + h_box_srcntgt_counts_nonchild + <= h_box_srcntgt_counts_cumul[:nboxes]) del h_box_srcntgt_counts_nonchild @@ -1256,7 +1270,7 @@ def 
my_realloc_zeros_nocopy(ary, shape=nboxes_guess): if prune_empty_leaves: # What is the original index of this box? - src_box_id = empty(nboxes, box_id_dtype) + src_box_id = actx.np.zeros(nboxes, box_id_dtype) # Where should I put this box? # @@ -1265,37 +1279,39 @@ def my_realloc_zeros_nocopy(ary, shape=nboxes_guess): dst_box_id, evt = zeros(nboxes, box_id_dtype) wait_for.append(evt) - fin_debug("find prune indices") + debug_with_finish("find prune indices") - nboxes_post_prune_dev = empty((), dtype=box_id_dtype) + nboxes_post_prune_dev = actx.np.zeros((), dtype=box_id_dtype) evt = knl_info.find_prune_indices_kernel( box_srcntgt_counts_cumul, src_box_id, dst_box_id, nboxes_post_prune_dev, size=nboxes, wait_for=wait_for) wait_for = [evt] - nboxes_post_prune = int(nboxes_post_prune_dev.get()) + nboxes_post_prune = int(actx.to_numpy(nboxes_post_prune_dev)) logger.debug("%d boxes after pruning " - "(%d empty leaves and/or unused boxes removed)", - nboxes_post_prune, nboxes - nboxes_post_prune) + "(%d empty leaves and/or unused boxes removed)", + nboxes_post_prune, nboxes - nboxes_post_prune) should_prune = True elif knl_info.level_restrict: # Remove unused boxes from the tree. 
- src_box_id = empty(nboxes, box_id_dtype) - dst_box_id = empty(nboxes, box_id_dtype) + src_box_id = actx.np.zeros(nboxes, box_id_dtype) + dst_box_id = actx.np.zeros(nboxes, box_id_dtype) - new_level_start_box_nrs = np.empty_like(level_start_box_nrs) + new_level_start_box_nrs = np.zeros_like(level_start_box_nrs) new_level_start_box_nrs[0] = 0 new_level_start_box_nrs[1:] = np.cumsum(level_used_box_counts) for level_start, new_level_start, level_used_box_count in zip( level_start_box_nrs[:-1], new_level_start_box_nrs[:-1], level_used_box_counts, strict=True): + def make_slice(start, offset=level_used_box_count): return slice(start, start + offset) def make_arange(start, offset=level_used_box_count): - return cl.array.arange( - queue, start, start + offset, dtype=box_id_dtype) + return actx.from_numpy( + np.arange(start, start + offset, dtype=box_id_dtype) + ) src_box_id[make_slice(new_level_start)] = make_arange(level_start) dst_box_id[make_slice(level_start)] = make_arange(new_level_start) @@ -1313,7 +1329,7 @@ def make_arange(start, offset=level_used_box_count): prune_events = [] prune_empty = partial(self.gappy_copy_and_map, - queue, allocator, nboxes_post_prune, + actx, nboxes_post_prune, src_indices=src_box_id, range=slice(nboxes_post_prune), debug=debug) @@ -1324,7 +1340,7 @@ def make_arange(start, offset=level_used_box_count): prune_events.append(evt) if debug and prune_empty_leaves: - assert (box_srcntgt_counts_cumul.get() > 0).all() + assert np.all(actx.to_numpy(box_srcntgt_counts_cumul) > 0) srcntgt_box_ids, evt = self.map_values_kernel( dst_box_id, srcntgt_box_ids) @@ -1358,10 +1374,11 @@ def make_arange(start, offset=level_used_box_count): evt = knl_info.find_level_box_counts_kernel( box_levels, level_used_box_counts_dev) - cl.wait_for_events([evt]) + wait_for_events([evt]) nlevels = len(level_used_box_counts) - level_used_box_counts = level_used_box_counts_dev[:nlevels].get() + level_used_box_counts = ( + 
actx.to_numpy(level_used_box_counts_dev[:nlevels])) level_start_box_nrs = [0] level_start_box_nrs.extend(np.cumsum(level_used_box_counts)) @@ -1386,7 +1403,7 @@ def make_arange(start, offset=level_used_box_count): if targets is None: from boxtree.tools import reverse_index_array user_source_ids = user_srcntgt_ids - sorted_target_ids = reverse_index_array(user_srcntgt_ids) + sorted_target_ids = reverse_index_array(actx, user_srcntgt_ids) box_source_starts = box_target_starts = box_srcntgt_starts box_source_counts_cumul = box_target_counts_cumul = \ @@ -1395,18 +1412,18 @@ def make_arange(start, offset=level_used_box_count): box_source_counts_nonchild = box_target_counts_nonchild = \ box_srcntgt_counts_nonchild else: - source_numbers = empty(nsrcntgts, particle_id_dtype) + source_numbers = actx.np.zeros(nsrcntgts, particle_id_dtype) - fin_debug("source counter") + debug_with_finish("source counter") evt = knl_info.source_counter(user_srcntgt_ids, nsources, - source_numbers, queue=queue, allocator=allocator, + source_numbers, queue=actx.queue, allocator=actx.allocator, wait_for=wait_for) wait_for = [evt] - user_source_ids = empty(nsources, particle_id_dtype) + user_source_ids = actx.np.zeros(nsources, particle_id_dtype) # srcntgt_target_ids is temporary until particle permutation is done - srcntgt_target_ids = empty(ntargets, particle_id_dtype) - sorted_target_ids = empty(ntargets, particle_id_dtype) + srcntgt_target_ids = actx.np.zeros(ntargets, particle_id_dtype) + sorted_target_ids = actx.np.zeros(ntargets, particle_id_dtype) # need to use zeros because parent boxes won't be initialized box_source_starts, evt = zeros(nboxes_post_prune, particle_id_dtype) @@ -1429,7 +1446,7 @@ def make_arange(start, offset=level_used_box_count): nboxes_post_prune, particle_id_dtype) wait_for.append(evt) - fin_debug("source and target index finder") + debug_with_finish("source and target index finder") evt = knl_info.source_and_target_index_finder(*( # input: ( @@ -1453,31 +1470,32 
@@ def make_arange(start, offset=level_used_box_count): box_target_counts_nonchild, # pylint: disable=possibly-used-before-assignment ) if srcntgts_have_extent else ()) ), - queue=queue, range=slice(nsrcntgts), + queue=actx.queue, range=slice(nsrcntgts), wait_for=wait_for) wait_for = [evt] if srcntgts_have_extent: # noqa: SIM102 if debug: - assert ( - box_srcntgt_counts_nonchild.get() - == (box_source_counts_nonchild - + box_target_counts_nonchild).get()).all() + assert np.all(actx.to_numpy( + box_srcntgt_counts_nonchild + == (box_source_counts_nonchild + box_target_counts_nonchild) + )) if debug: - usi_host = user_source_ids.get() - assert (usi_host < nsources).all() - assert (usi_host >= 0).all() + usi_host = actx.to_numpy(user_source_ids) + assert np.all(usi_host < nsources) + assert np.all(usi_host >= 0) del usi_host - sti_host = srcntgt_target_ids.get() - assert (sti_host < nsources+ntargets).all() - assert (nsources <= sti_host).all() + sti_host = actx.to_numpy(srcntgt_target_ids) + assert np.all(sti_host < nsources+ntargets) + assert np.all(nsources <= sti_host) del sti_host - assert (box_source_counts_cumul.get() - + box_target_counts_cumul.get() - == box_srcntgt_counts_cumul.get()).all() + assert np.all(actx.to_numpy( + box_source_counts_cumul + box_target_counts_cumul + == box_srcntgt_counts_cumul + )) del source_numbers @@ -1490,10 +1508,9 @@ def make_arange(start, offset=level_used_box_count): # {{{ permute and source/target-split (if necessary) particle array if targets is None: - sources = targets = make_obj_array([ - cl.array.empty_like(pt) for pt in srcntgts]) + sources = targets = actx.np.zeros_like(srcntgts) - fin_debug("srcntgt permuter (particles)") + debug_with_finish("srcntgt permuter (particles)") evt = knl_info.srcntgt_permuter( user_srcntgt_ids, *(tuple(srcntgts) + tuple(sources)), @@ -1504,34 +1521,37 @@ def make_arange(start, offset=level_used_box_count): else: sources = make_obj_array([ - empty(nsources, coord_dtype) for i in 
range(dimensions)]) - fin_debug("srcntgt permuter (sources)") + actx.np.zeros(nsources, coord_dtype) for i in range(dimensions) + ]) + debug_with_finish("srcntgt permuter (sources)") evt = knl_info.srcntgt_permuter( user_source_ids, *(tuple(srcntgts) + tuple(sources)), - queue=queue, range=slice(nsources), + queue=actx.queue, range=slice(nsources), wait_for=wait_for) wait_for = [evt] targets = make_obj_array([ - empty(ntargets, coord_dtype) for i in range(dimensions)]) - fin_debug("srcntgt permuter (targets)") + actx.np.zeros(ntargets, coord_dtype) for i in range(dimensions) + ]) + debug_with_finish("srcntgt permuter (targets)") evt = knl_info.srcntgt_permuter( srcntgt_target_ids, *(tuple(srcntgts) + tuple(targets)), - queue=queue, range=slice(ntargets), + queue=actx.queue, range=slice(ntargets), wait_for=wait_for) wait_for = [evt] if srcntgt_radii is not None: - fin_debug("srcntgt permuter (source radii)") - source_radii = cl.array.take( - srcntgt_radii, user_source_ids, queue=queue, + import pyopencl.array as cl_array + debug_with_finish("srcntgt permuter (source radii)") + source_radii = cl_array.take( + srcntgt_radii, user_source_ids, queue=actx.queue, wait_for=wait_for) - fin_debug("srcntgt permuter (target radii)") - target_radii = cl.array.take( - srcntgt_radii, srcntgt_target_ids, queue=queue, + debug_with_finish("srcntgt permuter (target radii)") + target_radii = cl_array.take( + srcntgt_radii, srcntgt_target_ids, queue=actx.queue, wait_for=wait_for) wait_for = source_radii.events + target_radii.events @@ -1549,7 +1569,7 @@ def make_arange(start, offset=level_used_box_count): assert nlevels == len(level_used_box_counts) assert level + 1 == nlevels, (level+1, nlevels) if debug: - max_level = np.max(box_levels.get()) + max_level = np.max(actx.to_numpy(box_levels)) assert max_level + 1 == nlevels # {{{ gather box child ids, box centers @@ -1561,7 +1581,7 @@ def make_arange(start, offset=level_used_box_count): box_child_ids_new, evt = zeros((2**dimensions, 
aligned_nboxes), box_id_dtype) wait_for.append(evt) - box_centers_new = empty((dimensions, aligned_nboxes), coord_dtype) + box_centers_new = actx.np.zeros((dimensions, aligned_nboxes), coord_dtype) for mnr, child_row in enumerate(box_child_ids): box_child_ids_new[mnr, :nboxes_post_prune] = \ @@ -1572,7 +1592,7 @@ def make_arange(start, offset=level_used_box_count): box_centers_new[dim, :nboxes_post_prune] = center_row[:nboxes_post_prune] wait_for.extend(box_centers_new.events) - cl.wait_for_events(wait_for) + wait_for_events(wait_for) box_centers = box_centers_new box_child_ids = box_child_ids_new @@ -1585,7 +1605,7 @@ def make_arange(start, offset=level_used_box_count): # {{{ compute box flags from boxtree.tree import box_flags_enum - box_flags = empty(nboxes_post_prune, box_flags_enum.dtype) + box_flags = actx.np.zeros(nboxes_post_prune, box_flags_enum.dtype) if not srcntgts_have_extent: # If srcntgts_have_extent, then non-child counts have already been @@ -1624,7 +1644,7 @@ def make_arange(start, offset=level_used_box_count): nboxes_post_prune, particle_id_dtype) wait_for.append(evt) - fin_debug("compute box info") + debug_with_finish("compute box info") evt = knl_info.box_info_kernel( *( # input: @@ -1648,27 +1668,23 @@ def make_arange(start, offset=level_used_box_count): # {{{ compute box bounding box - fin_debug("finding box extents") + debug_with_finish("finding box extents") - box_source_bounding_box_min = cl.array.empty( - queue, (dimensions, aligned_nboxes), - dtype=coord_dtype) - box_source_bounding_box_max = cl.array.empty( - queue, (dimensions, aligned_nboxes), - dtype=coord_dtype) + box_source_bounding_box_min = actx.np.zeros( + (dimensions, aligned_nboxes), dtype=coord_dtype) + box_source_bounding_box_max = actx.np.zeros( + (dimensions, aligned_nboxes), dtype=coord_dtype) if sources_are_targets: box_target_bounding_box_min = box_source_bounding_box_min box_target_bounding_box_max = box_source_bounding_box_max else: - box_target_bounding_box_min = 
cl.array.empty( - queue, (dimensions, aligned_nboxes), - dtype=coord_dtype) - box_target_bounding_box_max = cl.array.empty( - queue, (dimensions, aligned_nboxes), - dtype=coord_dtype) + box_target_bounding_box_min = actx.np.zeros( + (dimensions, aligned_nboxes), dtype=coord_dtype) + box_target_bounding_box_max = actx.np.zeros( + (dimensions, aligned_nboxes), dtype=coord_dtype) - bogus_radii_array = cl.array.empty(queue, 1, dtype=coord_dtype) + bogus_radii_array = actx.np.zeros(1, dtype=coord_dtype) # nlevels-1 is the highest valid level index for level in range(nlevels-1, -1, -1): @@ -1720,7 +1736,7 @@ def make_arange(start, offset=level_used_box_count): *args, range=slice(start, stop), - queue=queue, wait_for=wait_for) + queue=actx.queue, wait_for=wait_for) wait_for = [evt] @@ -1734,8 +1750,13 @@ def make_arange(start, offset=level_used_box_count): if sources_have_extent: extra_tree_attrs.update(source_radii=source_radii) + else: + extra_tree_attrs.update(source_radii=None) + if targets_have_extent: extra_tree_attrs.update(target_radii=target_radii) + else: + extra_tree_attrs.update(target_radii=None) tree_build_proc.done( "%d levels, %d boxes, %d particles, box extent norm: %s, " @@ -1743,7 +1764,7 @@ def make_arange(start, offset=level_used_box_count): nlevels, len(box_parent_ids), nsrcntgts, srcntgts_extent_norm, max_leaf_refine_weight) - return Tree( + tree = Tree( # If you change this, also change the documentation # of what's in the tree, above. 
sources_are_targets=sources_are_targets, @@ -1755,13 +1776,12 @@ def make_arange(start, offset=level_used_box_count): coord_dtype=coord_dtype, box_level_dtype=self.box_level_dtype, + bounding_box=(bbox_min, bbox_max), root_extent=root_extent, stick_out_factor=stick_out_factor, extent_norm=srcntgts_extent_norm, - bounding_box=(bbox_min, bbox_max), - level_start_box_nrs=level_start_box_nrs, - level_start_box_nrs_dev=level_start_box_nrs_dev, + level_start_box_nrs=actx.from_numpy(level_start_box_nrs), sources=sources, targets=targets, @@ -1790,7 +1810,9 @@ def make_arange(start, offset=level_used_box_count): _is_pruned=prune_empty_leaves, **extra_tree_attrs - ).with_queue(None), evt + ) + + return actx.freeze(tree), evt # }}} From 5d7027d337b0ef7b4b818efd0879957a9ac1b142 Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Wed, 22 Jun 2022 15:20:55 +0300 Subject: [PATCH 11/28] port tree_build_kernels to arraycontext --- boxtree/tree_build_kernels.py | 47 +++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/boxtree/tree_build_kernels.py b/boxtree/tree_build_kernels.py index 3477fa32..5624c690 100644 --- a/boxtree/tree_build_kernels.py +++ b/boxtree/tree_build_kernels.py @@ -21,20 +21,24 @@ """ import logging +from dataclasses import dataclass from functools import partial import numpy as np from mako.template import Template -from pyopencl.elementwise import ElementwiseTemplate -from pyopencl.scan import ScanTemplate -from pytools import Record, log_process, memoize +from pyopencl.elementwise import ElementwiseKernel, ElementwiseTemplate +from pyopencl.scan import GenericScanKernel, ScanTemplate +from pytools import log_process, memoize from boxtree.tools import ( + ScalarArg, + VectorArg, coord_vec_subscript_code, get_coord_vec_dtype, get_type_moniker, ) +from boxtree.traversal import HELPER_FUNCTION_TEMPLATE, TRAVERSAL_PREAMBLE_MAKO_DEFS logger = logging.getLogger(__name__) @@ -121,8 +125,27 @@ # 
----------------------------------------------------------------------------- -class _KernelInfo(Record): - pass +@dataclass(frozen=True) +class _KernelInfo: + particle_id_dtype: np.dtype + box_id_dtype: np.dtype + morton_bin_count_dtype: np.dtype + + morton_count_scan: GenericScanKernel + split_box_id_scan: GenericScanKernel + box_splitter_kernel: ElementwiseKernel + particle_renumberer_kernel: ElementwiseKernel + level_restrict: bool + level_restrict_kernel_builder: ElementwiseKernel | None + + extract_nonchild_srcntgt_count_kernel: ElementwiseKernel | None + find_prune_indices_kernel: GenericScanKernel + find_level_box_counts_kernel: GenericScanKernel + srcntgt_permuter: ElementwiseKernel + source_counter: GenericScanKernel + source_and_target_index_finder: ElementwiseKernel | None + box_info_kernel: ElementwiseKernel + box_extents_finder_kernel: ElementwiseKernel # {{{ data types @@ -797,9 +820,6 @@ def get_count_for_branch(known_bits): # {{{ level restrict kernel -from boxtree.traversal import TRAVERSAL_PREAMBLE_MAKO_DEFS - - LEVEL_RESTRICT_TPL = Template( TRAVERSAL_PREAMBLE_MAKO_DEFS + r"""//CL:mako// <%def name="my_load_center(name, box_id)"> @@ -933,8 +953,6 @@ def build_level_restrict_kernel(context, preamble_with_dtype_decls, from pyopencl.elementwise import ElementwiseKernel - from boxtree.traversal import HELPER_FUNCTION_TEMPLATE - return ElementwiseKernel( context, arguments=arguments, @@ -1400,7 +1418,7 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype, if np.iinfo(box_id_dtype).min == 0: from warnings import warn warn("Careful with unsigned types for box_id_dtype. 
Some CL implementations " - "(notably Intel 2012) mis-implemnet unsigned operations, leading to " + "(notably Intel 2012) mis-implement unsigned operations, leading to " "incorrect results.", stacklevel=4) from pyopencl.tools import dtype_to_c_struct, dtype_to_ctype @@ -1471,7 +1489,6 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype, + str(MORTON_NR_SCAN_PREAMBLE_TPL.render(**codegen_args)) ) - from boxtree.tools import ScalarArg, VectorArg common_arguments = ( [ # box-local morton bin counts for each particle at the current level @@ -1533,7 +1550,6 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype, (ScalarArg(coord_dtype, "stick_out_factor")) ] - from pyopencl.scan import GenericScanKernel morton_count_scan = GenericScanKernel( context, morton_bin_count_dtype, arguments=morton_count_scan_arguments, @@ -1557,7 +1573,6 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype, # {{{ split_box_id scan - from pyopencl.scan import GenericScanKernel split_box_id_scan = SPLIT_BOX_ID_SCAN_TPL.build( context, type_aliases=( @@ -1592,7 +1607,6 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype, box_splitter_kernel_source = BOX_SPLITTER_KERNEL_TPL.render(**box_s_codegen_args) - from pyopencl.elementwise import ElementwiseKernel box_splitter_kernel = ElementwiseKernel( context, common_arguments @@ -1627,7 +1641,6 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype, particle_renumberer_kernel_source = \ PARTICLE_RENUMBERER_KERNEL_TPL.render(**codegen_args) - from pyopencl.elementwise import ElementwiseKernel particle_renumberer_kernel = ElementwiseKernel( context, [*common_arguments, @@ -1679,7 +1692,6 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype, # FIXME: Turn me into a scan template - from boxtree.tools import VectorArg find_prune_indices_kernel = GenericScanKernel( context, box_id_dtype, arguments=[ @@ -1753,7 +1765,6 @@ def get_tree_build_kernel_info(context, dimensions, 
coord_dtype, # really a loss. # FIXME: make me a scan template - from pyopencl.scan import GenericScanKernel source_counter = GenericScanKernel( context, box_id_dtype, arguments=[ From c8bd1a2c7543ea3471c7e41373cc8043d72b4bf0 Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Thu, 23 Jun 2022 17:18:30 +0300 Subject: [PATCH 12/28] port pyfmmlib_integration to arraycontext --- boxtree/pyfmmlib_integration.py | 38 ++++++++++++++++----------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/boxtree/pyfmmlib_integration.py b/boxtree/pyfmmlib_integration.py index 07a54b74..869805b6 100644 --- a/boxtree/pyfmmlib_integration.py +++ b/boxtree/pyfmmlib_integration.py @@ -35,21 +35,21 @@ THE SOFTWARE. """ - -import logging - - -logger = logging.getLogger(__name__) import enum +import logging import numpy as np from pytools import log_process, memoize_method +from boxtree.array_context import PyOpenCLArrayContext from boxtree.fmm import ExpansionWranglerInterface, TreeIndependentDataForWrangler from boxtree.timing import return_timing_data +logger = logging.getLogger(__name__) + + # {{{ rotation data interface class FMMLibRotationDataInterface: @@ -80,8 +80,8 @@ class FMMLibRotationData(FMMLibRotationDataInterface): .. 
automethod:: __init__ """ - def __init__(self, queue, trav): - self.queue = queue + def __init__(self, array_context: PyOpenCLArrayContext, trav): + self._setup_actx = array_context self.trav = trav self.tree = trav.tree @@ -89,27 +89,27 @@ def __init__(self, queue, trav): @memoize_method def rotation_classes_builder(self): from boxtree.rotation_classes import RotationClassesBuilder - return RotationClassesBuilder(self.queue.context) + return RotationClassesBuilder(self._setup_actx) @memoize_method def build_rotation_classes_lists(self): - trav = self.trav.to_device(self.queue) - tree = self.tree.to_device(self.queue) - return self.rotation_classes_builder(self.queue, trav, tree)[0] + trav = self._setup_actx.from_numpy(self.trav) + tree = self._setup_actx.from_numpy(self.tree) + return self.rotation_classes_builder(self._setup_actx, trav, tree)[0] @memoize_method def m2l_rotation_lists(self): - return (self - .build_rotation_classes_lists() - .from_sep_siblings_rotation_classes - .get(self.queue)) + return self._setup_actx.to_numpy( + self.build_rotation_classes_lists() + .from_sep_siblings_rotation_classes, + ) @memoize_method def m2l_rotation_angles(self): - return (self - .build_rotation_classes_lists() - .from_sep_siblings_rotation_class_to_angle - .get(self.queue)) + return self._setup_actx.to_numpy( + self.build_rotation_classes_lists() + .from_sep_siblings_rotation_class_to_angle, + ) class FMMLibRotationDataNotSuppliedWarning(UserWarning): From 34cd81c2c81b110540cf425af6ecf0f27bd880fc Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Thu, 23 Jun 2022 21:29:17 +0300 Subject: [PATCH 13/28] port cost to arraycontext --- boxtree/cost.py | 473 +++++++++++++++++++++++------------------------- 1 file changed, 228 insertions(+), 245 deletions(-) diff --git a/boxtree/cost.py b/boxtree/cost.py index 7d2cc952..3a991a12 100644 --- a/boxtree/cost.py +++ b/boxtree/cost.py @@ -58,6 +58,7 @@ .. 
autoclass:: FMMCostModel """ +from abc import ABC, abstractmethod from collections.abc import Mapping from functools import partial from typing import ClassVar @@ -65,17 +66,15 @@ import numpy as np from mako.template import Template -import pyopencl as cl -import pyopencl.array from pymbolic import evaluate, var from pyopencl.elementwise import ElementwiseKernel from pyopencl.tools import dtype_to_ctype -from pytools import memoize_method +from pytools import keyed_memoize_method +from boxtree.array_context import PyOpenCLArrayContext -Template = partial(Template, strict_undefined=True) -from abc import ABC, abstractmethod +Template = partial(Template, strict_undefined=True) # {{{ FMMTranslationCostModel @@ -218,6 +217,7 @@ class AbstractFMMCostModel(ABC): .. automethod:: get_ndirect_sources_per_target_box """ + def __init__( self, translation_cost_model_factory=make_pde_aware_translation_cost_model): @@ -229,28 +229,27 @@ def __init__( self.translation_cost_model_factory = translation_cost_model_factory @abstractmethod - def process_form_multipoles(self, queue, traversal, p2m_cost): + def process_form_multipoles(self, actx: PyOpenCLArrayContext, + traversal, p2m_cost): """Cost for forming multipole expansions of each box. - :arg queue: a :class:`pyopencl.CommandQueue` object. :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. - :arg p2m_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` - of shape (nlevels,) representing the cost of forming the multipole - expansion of one source at each level. - :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape - (nsource_boxes,), with each entry represents the cost of the box. + :arg p2m_cost: an array of shape (nlevels,) representing the cost of + forming the multipole expansion of one source at each level. + :return: an array of shape (nsource_boxes,), with each entry represents + the cost of the box. 
""" pass @abstractmethod - def process_coarsen_multipoles(self, queue, traversal, m2m_cost): + def process_coarsen_multipoles(self, actx: PyOpenCLArrayContext, + traversal, m2m_cost): """Cost for upward propagation. - :arg queue: a :class:`pyopencl.CommandQueue` object. :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. - :arg m2m_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` - of shape (nlevels-1,), where the ith entry represents the - multipole-to-multipole cost from source level i+1 to target level i. + :arg m2m_cost: an array of shape (nlevels-1,), where the ith entry + represents the multipole-to-multipole cost from source level i+1 + to target level i. :return: a :class:`float`, the overall cost of upward propagation. .. note:: This method returns a number instead of an array, because it is not @@ -260,118 +259,106 @@ def process_coarsen_multipoles(self, queue, traversal, m2m_cost): pass @abstractmethod - def get_ndirect_sources_per_target_box(self, queue, traversal): + def get_ndirect_sources_per_target_box(self, actx: PyOpenCLArrayContext, + traversal): """Collect the number of direct evaluation sources (list 1, list 3 close and list 4 close) for each target box. - :arg queue: a :class:`pyopencl.CommandQueue` object. :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. - :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape - (ntarget_boxes,), with each entry representing the number of direct - evaluation sources for that target box. + :return: an array of shape (ntarget_boxes,), with each entry representing + the number of direct evaluation sources for that target box. """ pass @abstractmethod - def process_direct(self, queue, traversal, ndirect_sources_by_itgt_box, p2p_cost, + def process_direct(self, actx: PyOpenCLArrayContext, + traversal, ndirect_sources_by_itgt_box, p2p_cost, box_target_counts_nonchild=None): """Direct evaluation cost of each target box of *traversal*. 
- :arg queue: a :class:`pyopencl.CommandQueue` object. :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. - :arg ndirect_sources_by_itgt_box: a :class:`numpy.ndarray` or - :class:`pyopencl.array.Array` of shape (ntarget_boxes,), with each entry - representing the number of direct evaluation sources for that target box. + :arg ndirect_sources_by_itgt_box: an array of shape (ntarget_boxes,), + with each entry representing the number of direct evaluation sources + for that target box. :arg p2p_cost: a constant representing the cost of one point-to-point evaluation. - :arg box_target_counts_nonchild: a :class:`numpy.ndarray` or - :class:`pyopencl.array.Array` of shape (nboxes,), the number of targets - using direct evaluation in this box. For example, this is useful in QBX - by specifying the number of non-QBX targets. If None, all targets in - boxes are considered. - :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape - (ntarget_boxes,), with each entry represents the cost of the box. + :arg box_target_counts_nonchild: an array of shape (nboxes,), the + number of targets using direct evaluation in this box. For example, + this is useful in QBX by specifying the number of non-QBX targets. + If None, all targets in boxes are considered. + :return: an array of shape (ntarget_boxes,), with each entry represents + the cost of the box. """ pass @abstractmethod - def process_list2(self, queue, traversal, m2l_cost): + def process_list2(self, actx: PyOpenCLArrayContext, traversal, m2l_cost): """ - :arg queue: a :class:`pyopencl.CommandQueue` object. :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. - :arg m2l_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` - of shape (nlevels,) representing the translation cost of each level. 
- :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape - (ntarget_or_target_parent_boxes,), with each entry representing the cost - of multipole-to-local translations to this box. + :arg m2l_cost: an array of shape (nlevels,) representing the + translation cost of each level. + :return: an array of shape (ntarget_or_target_parent_boxes,), with + each entry representing the cost of multipole-to-local + translations to this box. """ pass @abstractmethod - def process_list3(self, queue, traversal, m2p_cost, + def process_list3(self, actx: PyOpenCLArrayContext, traversal, m2p_cost, box_target_counts_nonchild=None): """ - :arg queue: a :class:`pyopencl.CommandQueue` object. :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. - :arg m2p_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` - of shape (nlevels,) where the ith entry represents the evaluation cost - from multipole expansion at level i to a point. - :arg box_target_counts_nonchild: a :class:`numpy.ndarray` or - :class:`pyopencl.array.Array` of shape (nboxes,), the number of targets - using multiple-to-point translations in this box. For example, this is - useful in QBX by specifying the number of non-QBX targets. If None, all - targets in boxes are considered. - :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape - (nboxes,), with each entry representing the cost of evaluating all - targets inside this box from multipole expansions of list-3 boxes. + :arg m2p_cost: an array of shape (nlevels,) where the ith entry + represents the evaluation cost from multipole expansion at level i + to a point. + :arg box_target_counts_nonchild: an array of shape (nboxes,), the + number of targets using multipole-to-point translations in this box. + For example, this is useful in QBX by specifying the number of + non-QBX targets. If None, all targets in boxes are considered.
+ :return: an array of shape (nboxes,), with each entry representing the + cost of evaluating all targets inside this box from multipole + expansions of list-3 boxes. """ pass @abstractmethod - def process_list4(self, queue, traversal, p2l_cost): + def process_list4(self, actx: PyOpenCLArrayContext, traversal, p2l_cost): """ - :arg queue: a :class:`pyopencl.CommandQueue` object. :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. - :arg p2l_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` - of shape (nlevels,) where the ith entry represents the translation cost - from a point to the local expansion at level i. - :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape - (ntarget_or_target_parent_boxes,), with each entry representing the cost - of point-to-local translations to this box. + :arg p2l_cost: an array of shape (nlevels,) where the ith entry + represents the translation cost from a point to the local expansion + at level i. + :return: an array of shape (ntarget_or_target_parent_boxes,), with + each entry representing the cost of point-to-local translations to + this box. """ pass @abstractmethod - def process_eval_locals(self, queue, traversal, l2p_cost, + def process_eval_locals(self, actx: PyOpenCLArrayContext, traversal, l2p_cost, box_target_counts_nonchild=None): """ - :arg queue: a :class:`pyopencl.CommandQueue` object. :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. - :arg l2p_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` - of shape (nlevels,) where the ith entry represents the cost of evaluating - the potential of a target in a box of level i using the box's local - expansion. - :arg box_target_counts_nonchild: a :class:`numpy.ndarray` or - :class:`pyopencl.array.Array` of shape (nboxes,), the number of targets - which need evaluation. For example, this is useful in QBX by specifying - the number of non-QBX targets. 
If None, use + :arg l2p_cost: an array of shape (nlevels,) where the ith entry + represents the cost of evaluating the potential of a target in a + box of level i using the box's local expansion. + :arg box_target_counts_nonchild: an array of shape (nboxes,), the number + of targets which need evaluation. For example, this is useful in + QBX by specifying the number of non-QBX targets. If None, use traversal.tree.box_target_counts_nonchild. - :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape - (ntarget_boxes,), the cost of evaluating the potentials of all targets - inside this box from its local expansion. + :return: an array of shape (ntarget_boxes,), the cost of evaluating the + potentials of all targets inside this box from its local expansion. """ pass @abstractmethod - def process_refine_locals(self, queue, traversal, l2l_cost): + def process_refine_locals(self, actx: PyOpenCLArrayContext, traversal, l2l_cost): """Cost of downward propagation. - :arg queue: a :class:`pyopencl.CommandQueue` object. :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. - :arg l2l_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` - of shape ``(nlevels-1,)``, where the :math:`i`th entry represents - the cost of translating local expansion from level :math:`i` to - level :math:`i+1`. + :arg l2l_cost: an array of shape ``(nlevels-1,)``, where the :math:`i`th + entry represents the cost of translating local expansion from level + :math:`i` to level :math:`i+1`. :return: a :class:`float`, the overall cost of downward propagation. .. note:: This method returns a number instead of an array, because it is not @@ -381,36 +368,34 @@ def process_refine_locals(self, queue, traversal, l2l_cost): pass @abstractmethod - def aggregate_over_boxes(self, per_box_result): + def aggregate_over_boxes(self, actx: PyOpenCLArrayContext, per_box_result): """Sum all entries of *per_box_result* into a number. 
- :arg per_box_result: an object of :class:`numpy.ndarray` or - :class:`pyopencl.array.Array`, the result to be sumed. + :arg per_box_result: an array to be summed. :return: a :class:`float`, the result of the sum. """ pass @staticmethod - def cost_factors_to_dev(cost_factors, queue): + def cost_factors_to_dev(cost_factors, actx: PyOpenCLArrayContext | None): cost_factors_dev = {} for name in cost_factors: if not isinstance(cost_factors[name], np.ndarray): cost_factors_dev[name] = cost_factors[name] continue - cost_factors_dev[name] = cl.array.to_device( - queue, cost_factors[name] - ).with_queue(None) + + cost_factors_dev[name] = actx.freeze(actx.from_numpy(cost_factors[name])) return cost_factors_dev def fmm_cost_factors_for_kernels_from_model( - self, queue, nlevels, xlat_cost, context): + self, actx: PyOpenCLArrayContext | None, nlevels, xlat_cost, context): """Evaluate translation cost factors from symbolic model. The result of this function can be used for process_* methods in this class. - :arg queue: If not None, the cost factor arrays will be transferred to device - using this queue. + :arg actx: If not None, the cost factor arrays will be converted to + the array context's array type. :arg nlevels: the number of tree levels. :arg xlat_cost: a :class:`FMMTranslationCostModel`. :arg context: a :class:`dict` of parameters passed as context when @@ -449,29 +434,26 @@ def fmm_cost_factors_for_kernels_from_model( ], dtype=np.float64) } - if queue: - cost_factors = self.cost_factors_to_dev(cost_factors, queue) + if actx: + cost_factors = self.cost_factors_to_dev(cost_factors, actx) return cost_factors @abstractmethod - def zero_cost_per_box(self, queue, nboxes): + def zero_cost_per_box(self, actx: PyOpenCLArrayContext, nboxes): """Helper function for returning the per-box cost filled with 0. - :arg queue: a :class:`pyopencl.CommandQueue` object.
:param nboxes: the number of boxes - :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape - (*nboxes*,), representing the zero per-box cost. + :return: an array of shape (*nboxes*,), representing the zero per-box cost. """ pass - def cost_per_box(self, queue, traversal, level_to_order, + def cost_per_box(self, actx: PyOpenCLArrayContext, traversal, level_to_order, calibration_params, ndirect_sources_per_target_box=None, box_target_counts_nonchild=None): """Predict the per-box costs of a new traversal object. - :arg queue: a :class:`pyopencl.CommandQueue` object. :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. :arg level_to_order: a :class:`numpy.ndarray` of shape (traversal.tree.nlevels,) representing the expansion orders @@ -479,24 +461,21 @@ def cost_per_box(self, queue, traversal, level_to_order, :arg calibration_params: a :class:`dict` of calibration parameters. These parameters can be obtained via :meth:`estimate_calibration_params` or :meth:`get_unit_calibration_params`. - :arg ndirect_sources_per_target_box: a :class:`numpy.ndarray` or - :class:`pyopencl.array.Array` of shape (ntarget_boxes,), the number of - direct evaluation sources (list 1, list 3 close, list 4 close) for each - target box. You may find :meth:`get_ndirect_sources_per_target_box` - helpful. This argument is useful because the same result can be reused - for p2p, p2qbxl and tsqbx. - :arg box_target_counts_nonchild: a :class:`numpy.ndarray` or - :class:`pyopencl.array.Array` of shape (nboxes,), the number of targets - which need evaluation. For example, this is useful in QBX by specifying - the number of non-QBX targets. If None, all targets are considered, - namely traversal.tree.box_target_counts_nonchild. - :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape - (nboxes,), where the ith entry represents the cost of all stages for box - i. 
+ :arg ndirect_sources_per_target_box: an array of shape (ntarget_boxes,), + the number of direct evaluation sources (list 1, list 3 close, list + 4 close) for each target box. You may find + :meth:`get_ndirect_sources_per_target_box` helpful. This argument is + useful because the same result can be reused for p2p, p2qbxl and tsqbx. + :arg box_target_counts_nonchild: an array of shape (nboxes,), the number + of targets which need evaluation. For example, this is useful in + QBX by specifying the number of non-QBX targets. If None, all + targets are considered, namely traversal.tree.box_target_counts_nonchild. + :return: an array of shape (nboxes,), where the ith entry represents + the cost of all stages for box i. """ if ndirect_sources_per_target_box is None: ndirect_sources_per_target_box = ( - self.get_ndirect_sources_per_target_box(queue, traversal) + self.get_ndirect_sources_per_target_box(actx, traversal) ) tree = traversal.tree @@ -505,7 +484,7 @@ def cost_per_box(self, queue, traversal, level_to_order, target_boxes = traversal.target_boxes target_or_target_parent_boxes = traversal.target_or_target_parent_boxes - result = self.zero_cost_per_box(queue, nboxes) + result = self.zero_cost_per_box(actx, nboxes) for ilevel in range(tree.nlevels): calibration_params[f"p_fmm_lev{ilevel}"] = level_to_order[ilevel] @@ -515,49 +494,48 @@ def cost_per_box(self, queue, traversal, level_to_order, ) translation_cost = self.fmm_cost_factors_for_kernels_from_model( - queue, tree.nlevels, xlat_cost, calibration_params + actx, tree.nlevels, xlat_cost, calibration_params ) if box_target_counts_nonchild is None: box_target_counts_nonchild = traversal.tree.box_target_counts_nonchild result[source_boxes] += self.process_form_multipoles( - queue, traversal, translation_cost["p2m_cost"] + actx, traversal, translation_cost["p2m_cost"] ) result[target_boxes] += self.process_direct( - queue, traversal, ndirect_sources_per_target_box, + actx, traversal, ndirect_sources_per_target_box, 
translation_cost["c_p2p"], box_target_counts_nonchild=box_target_counts_nonchild ) result[target_or_target_parent_boxes] += self.process_list2( - queue, traversal, translation_cost["m2l_cost"] + actx, traversal, translation_cost["m2l_cost"] ) result += self.process_list3( - queue, traversal, translation_cost["m2p_cost"], + actx, traversal, translation_cost["m2p_cost"], box_target_counts_nonchild=box_target_counts_nonchild ) result[target_or_target_parent_boxes] += self.process_list4( - queue, traversal, translation_cost["p2l_cost"] + actx, traversal, translation_cost["p2l_cost"] ) result[target_boxes] += self.process_eval_locals( - queue, traversal, translation_cost["l2p_cost"], + actx, traversal, translation_cost["l2p_cost"], box_target_counts_nonchild=box_target_counts_nonchild ) return result - def cost_per_stage(self, queue, traversal, level_to_order, + def cost_per_stage(self, actx: PyOpenCLArrayContext, traversal, level_to_order, calibration_params, ndirect_sources_per_target_box=None, box_target_counts_nonchild=None): """Predict the per-stage costs of a new traversal object. - :arg queue: a :class:`pyopencl.CommandQueue` object. :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object. :arg level_to_order: a :class:`numpy.ndarray` of shape (traversal.tree.nlevels,) representing the expansion orders @@ -565,22 +543,21 @@ def cost_per_stage(self, queue, traversal, level_to_order, :arg calibration_params: a :class:`dict` of calibration parameters. These parameters can be obtained via :meth:`estimate_calibration_params` or :meth:`get_unit_calibration_params`. - :arg ndirect_sources_per_target_box: a :class:`numpy.ndarray` or - :class:`pyopencl.array.Array` of shape (ntarget_boxes,), the number of - direct evaluation sources (list 1, list 3 close, list 4 close) for each - target box. You may find :func:`get_ndirect_sources_per_target_box` - helpful. This argument is useful because the same result can be reused - for p2p, p2qbxl and tsqbx. 
- :arg box_target_counts_nonchild: a :class:`numpy.ndarray` or - :class:`pyopencl.array.Array` of shape (nboxes,), the number of targets - which need evaluation. For example, this is useful in QBX by specifying - the number of non-QBX targets. If None, all targets are considered, - namely traversal.tree.box_target_counts_nonchild. + :arg ndirect_sources_per_target_box: an array of shape (ntarget_boxes,), + the number of direct evaluation sources (list 1, list 3 close, list + 4 close) for each target box. You may find + :func:`get_ndirect_sources_per_target_box` helpful. This argument + is useful because the same result can be reused for p2p, p2qbxl and + tsqbx. + :arg box_target_counts_nonchild: an array of shape (nboxes,), the + number of targets which need evaluation. For example, this is useful + in QBX by specifying the number of non-QBX targets. If None, all + targets are considered, namely traversal.tree.box_target_counts_nonchild. :return: a :class:`dict`, mapping FMM stage names to cost numbers. 
""" if ndirect_sources_per_target_box is None: ndirect_sources_per_target_box = ( - self.get_ndirect_sources_per_target_box(queue, traversal) + self.get_ndirect_sources_per_target_box(actx, traversal) ) tree = traversal.tree @@ -594,52 +571,58 @@ def cost_per_stage(self, queue, traversal, level_to_order, ) translation_cost = self.fmm_cost_factors_for_kernels_from_model( - queue, tree.nlevels, xlat_cost, calibration_params + actx, tree.nlevels, xlat_cost, calibration_params ) if box_target_counts_nonchild is None: box_target_counts_nonchild = traversal.tree.box_target_counts_nonchild result["form_multipoles"] = self.aggregate_over_boxes( + actx, self.process_form_multipoles( - queue, traversal, translation_cost["p2m_cost"] + actx, traversal, translation_cost["p2m_cost"] ) ) result["coarsen_multipoles"] = self.process_coarsen_multipoles( - queue, traversal, translation_cost["m2m_cost"] + actx, traversal, translation_cost["m2m_cost"] ) result["eval_direct"] = self.aggregate_over_boxes( + actx, self.process_direct( - queue, traversal, ndirect_sources_per_target_box, + actx, traversal, ndirect_sources_per_target_box, translation_cost["c_p2p"], box_target_counts_nonchild=box_target_counts_nonchild ) ) result["multipole_to_local"] = self.aggregate_over_boxes( - self.process_list2(queue, traversal, translation_cost["m2l_cost"]) + actx, + self.process_list2(actx, traversal, translation_cost["m2l_cost"]) ) result["eval_multipoles"] = self.aggregate_over_boxes( + actx, self.process_list3( - queue, traversal, translation_cost["m2p_cost"], + actx, traversal, translation_cost["m2p_cost"], box_target_counts_nonchild=box_target_counts_nonchild ) ) result["form_locals"] = self.aggregate_over_boxes( - self.process_list4(queue, traversal, translation_cost["p2l_cost"]) + actx, + self.process_list4(actx, traversal, translation_cost["p2l_cost"]) ) result["refine_locals"] = self.process_refine_locals( - queue, traversal, translation_cost["l2l_cost"] + actx, traversal, 
translation_cost["l2l_cost"] ) result["eval_locals"] = self.aggregate_over_boxes( + actx, self.process_eval_locals( - queue, traversal, translation_cost["l2p_cost"], + actx, traversal, translation_cost["l2p_cost"], box_target_counts_nonchild=box_target_counts_nonchild ) ) @@ -744,11 +727,12 @@ class FMMCostModel(AbstractFMMCostModel): # {{{ form multipoles - @memoize_method - def process_form_multipoles_knl(self, context, box_id_dtype, particle_id_dtype, + @keyed_memoize_method(key=lambda *args: (type(args[0]), args[1:])) + def process_form_multipoles_knl(self, actx: PyOpenCLArrayContext, + box_id_dtype, particle_id_dtype, box_level_dtype): return ElementwiseKernel( - context, + actx.context, Template(r""" double *np2m, ${box_id_t} *source_boxes, @@ -773,13 +757,12 @@ def process_form_multipoles_knl(self, context, box_id_dtype, particle_id_dtype, name="process_form_multipoles" ) - def process_form_multipoles(self, queue, traversal, p2m_cost): + def process_form_multipoles(self, actx, traversal, p2m_cost): tree = traversal.tree - np2m = cl.array.zeros(queue, len(traversal.source_boxes), dtype=np.float64) + np2m = actx.zeros(len(traversal.source_boxes), dtype=np.float64) process_form_multipoles_knl = self.process_form_multipoles_knl( - queue.context, - tree.box_id_dtype, tree.particle_id_dtype, tree.box_level_dtype + actx, tree.box_id_dtype, tree.particle_id_dtype, tree.box_level_dtype ) process_form_multipoles_knl( @@ -787,7 +770,8 @@ def process_form_multipoles(self, queue, traversal, p2m_cost): traversal.source_boxes, tree.box_source_counts_nonchild, tree.box_levels, - p2m_cost + p2m_cost, + queue=actx.queue, ) return np2m @@ -796,11 +780,12 @@ def process_form_multipoles(self, queue, traversal, p2m_cost): # {{{ propagate multipoles upward - @memoize_method - def process_coarsen_multipoles_knl(self, context, ndimensions, box_id_dtype, + @keyed_memoize_method(key=lambda *args: (type(args[0]), args[1:])) + def process_coarsen_multipoles_knl(self, actx: 
PyOpenCLArrayContext, + ndimensions, box_id_dtype, box_level_dtype, nlevels): return ElementwiseKernel( - context, + actx.context, Template(r""" ${box_id_t} *source_parent_boxes, ${box_level_t} *box_levels, @@ -840,14 +825,13 @@ def process_coarsen_multipoles_knl(self, context, ndimensions, box_id_dtype, name="process_coarsen_multipoles" ) - def process_coarsen_multipoles(self, queue, traversal, m2m_cost): + def process_coarsen_multipoles(self, actx: PyOpenCLArrayContext, + traversal, m2m_cost): tree = traversal.tree - nm2m = cl.array.zeros( - queue, len(traversal.source_parent_boxes), dtype=np.float64 - ) + nm2m = actx.zeros(len(traversal.source_parent_boxes), dtype=np.float64) process_coarsen_multipoles_knl = self.process_coarsen_multipoles_knl( - queue.context, + actx, tree.dimensions, tree.box_id_dtype, tree.box_level_dtype, tree.nlevels ) @@ -857,19 +841,20 @@ def process_coarsen_multipoles(self, queue, traversal, m2m_cost): m2m_cost, nm2m, *tree.box_child_ids, - queue=queue + queue=actx.queue ) - return self.aggregate_over_boxes(nm2m) + return self.aggregate_over_boxes(actx, nm2m) # }}} # {{{ direct evaluation to point targets (lists 1, 3 close, 4 close) - @memoize_method - def _get_ndirect_sources_knl(self, context, particle_id_dtype, box_id_dtype): + @keyed_memoize_method(key=lambda *args: (type(args[0]), args[1:])) + def _get_ndirect_sources_knl(self, actx: PyOpenCLArrayContext, + particle_id_dtype, box_id_dtype): return ElementwiseKernel( - context, + actx.context, Template(""" ${particle_id_t} *ndirect_sources_by_itgt_box, ${box_id_t} *source_boxes_starts, @@ -902,18 +887,19 @@ def _get_ndirect_sources_knl(self, context, particle_id_dtype, box_id_dtype): name="get_ndirect_sources" ) - def get_ndirect_sources_per_target_box(self, queue, traversal): + def get_ndirect_sources_per_target_box(self, actx: PyOpenCLArrayContext, + traversal): tree = traversal.tree ntarget_boxes = len(traversal.target_boxes) particle_id_dtype = tree.particle_id_dtype box_id_dtype 
= tree.box_id_dtype get_ndirect_sources_knl = self._get_ndirect_sources_knl( - queue.context, particle_id_dtype, box_id_dtype + actx, particle_id_dtype, box_id_dtype ) - ndirect_sources_by_itgt_box = cl.array.zeros( - queue, ntarget_boxes, dtype=particle_id_dtype + ndirect_sources_by_itgt_box = actx.zeros( + ntarget_boxes, dtype=particle_id_dtype ) # List 1 @@ -926,7 +912,7 @@ def get_ndirect_sources_per_target_box(self, queue, traversal): # List 3 close if traversal.from_sep_close_smaller_starts is not None: - queue.finish() + actx.queue.finish() get_ndirect_sources_knl( ndirect_sources_by_itgt_box, traversal.from_sep_close_smaller_starts, @@ -936,7 +922,7 @@ def get_ndirect_sources_per_target_box(self, queue, traversal): # List 4 close if traversal.from_sep_close_bigger_starts is not None: - queue.finish() + actx.queue.finish() get_ndirect_sources_knl( ndirect_sources_by_itgt_box, traversal.from_sep_close_bigger_starts, @@ -946,28 +932,26 @@ def get_ndirect_sources_per_target_box(self, queue, traversal): return ndirect_sources_by_itgt_box - def process_direct(self, queue, traversal, ndirect_sources_by_itgt_box, p2p_cost, + def process_direct(self, actx: PyOpenCLArrayContext, + traversal, ndirect_sources_by_itgt_box, p2p_cost, box_target_counts_nonchild=None): if box_target_counts_nonchild is None: box_target_counts_nonchild = traversal.tree.box_target_counts_nonchild - from pyopencl.array import take - ntargets_by_itgt_box = take( - box_target_counts_nonchild, - traversal.target_boxes, - queue=queue - ) - + ntargets_by_itgt_box = ( + actx.thaw(box_target_counts_nonchild)[traversal.target_boxes] + ) return ndirect_sources_by_itgt_box * ntargets_by_itgt_box * p2p_cost # }}} # {{{ translate separated siblings' ("list 2") mpoles to local - @memoize_method - def process_list2_knl(self, context, box_id_dtype, box_level_dtype): + @keyed_memoize_method(key=lambda *args: (type(args[0]), args[1:])) + def process_list2_knl(self, actx: PyOpenCLArrayContext, + box_id_dtype, 
box_level_dtype): return ElementwiseKernel( - context, + actx.context, Template(r""" double *nm2l, ${box_id_t} *target_or_target_parent_boxes, @@ -991,25 +975,24 @@ def process_list2_knl(self, context, box_id_dtype, box_level_dtype): name="process_list2" ) - def process_list2(self, queue, traversal, m2l_cost): + def process_list2(self, actx, traversal, m2l_cost): tree = traversal.tree box_id_dtype = tree.box_id_dtype box_level_dtype = tree.box_level_dtype ntarget_or_target_parent_boxes = len(traversal.target_or_target_parent_boxes) - nm2l = cl.array.zeros( - queue, (ntarget_or_target_parent_boxes,), dtype=np.float64 - ) + nm2l = actx.zeros((ntarget_or_target_parent_boxes,), dtype=np.float64) process_list2_knl = self.process_list2_knl( - queue.context, box_id_dtype, box_level_dtype + actx, box_id_dtype, box_level_dtype ) process_list2_knl( nm2l, traversal.target_or_target_parent_boxes, traversal.from_sep_siblings_starts, tree.box_levels, - m2l_cost + m2l_cost, + queue=actx.queue, ) return nm2l @@ -1018,10 +1001,11 @@ def process_list2(self, queue, traversal, m2l_cost): # {{{ evaluate sep. 
smaller mpoles ("list 3") at particles - @memoize_method - def process_list3_knl(self, context, box_id_dtype, particle_id_dtype): + @keyed_memoize_method(key=lambda *args: (type(args[0]), args[1:])) + def process_list3_knl(self, actx: PyOpenCLArrayContext, + box_id_dtype, particle_id_dtype): return ElementwiseKernel( - context, + actx.context, Template(r""" ${box_id_t} *target_boxes_sep_smaller, ${box_id_t} *sep_smaller_start, @@ -1047,16 +1031,16 @@ def process_list3_knl(self, context, box_id_dtype, particle_id_dtype): name="process_list3" ) - def process_list3(self, queue, traversal, m2p_cost, + def process_list3(self, actx: PyOpenCLArrayContext, traversal, m2p_cost, box_target_counts_nonchild=None): tree = traversal.tree - nm2p = cl.array.zeros(queue, tree.nboxes, dtype=np.float64) + nm2p = actx.zeros(tree.nboxes, dtype=np.float64) if box_target_counts_nonchild is None: box_target_counts_nonchild = tree.box_target_counts_nonchild process_list3_knl = self.process_list3_knl( - queue.context, tree.box_id_dtype, tree.particle_id_dtype + actx, tree.box_id_dtype, tree.particle_id_dtype ) for ilevel, sep_smaller_list in enumerate( @@ -1065,9 +1049,9 @@ def process_list3(self, queue, traversal, m2p_cost, traversal.target_boxes_sep_smaller_by_source_level[ilevel], sep_smaller_list.starts, box_target_counts_nonchild, - m2p_cost[ilevel].get(queue=queue).reshape(-1)[0], + actx.to_numpy(m2p_cost[ilevel]).reshape(-1)[0], nm2p, - queue=queue + queue=actx.queue ) return nm2p @@ -1076,11 +1060,11 @@ def process_list3(self, queue, traversal, m2p_cost, # {{{ form locals for separated bigger source boxes ("list 4") - @memoize_method - def process_list4_knl(self, context, + @keyed_memoize_method(key=lambda *args: (type(args[0]), args[1:])) + def process_list4_knl(self, actx: PyOpenCLArrayContext, box_id_dtype, particle_id_dtype, box_level_dtype): return ElementwiseKernel( - context, + actx.context, Template(r""" double *nm2p, ${box_id_t} *from_sep_bigger_starts, @@ -1110,15 +1094,13 
@@ def process_list4_knl(self, context, name="process_list4" ) - def process_list4(self, queue, traversal, p2l_cost): + def process_list4(self, actx, traversal, p2l_cost): tree = traversal.tree target_or_target_parent_boxes = traversal.target_or_target_parent_boxes - nm2p = cl.array.zeros( - queue, len(target_or_target_parent_boxes), dtype=np.float64 - ) + nm2p = actx.zeros(len(target_or_target_parent_boxes), dtype=np.float64) process_list4_knl = self.process_list4_knl( - queue.context, + actx, tree.box_id_dtype, tree.particle_id_dtype, tree.box_level_dtype ) @@ -1128,7 +1110,8 @@ def process_list4(self, queue, traversal, p2l_cost): traversal.from_sep_bigger_lists, tree.box_source_counts_nonchild, tree.box_levels, - p2l_cost + p2l_cost, + queue=actx.queue ) return nm2p @@ -1137,11 +1120,11 @@ def process_list4(self, queue, traversal, p2l_cost): # {{{ evaluate local expansions at targets - @memoize_method - def process_eval_locals_knl(self, context, box_id_dtype, particle_id_dtype, - box_level_dtype): + @keyed_memoize_method(key=lambda *args: (type(args[0]), args[1:])) + def process_eval_locals_knl(self, actx: PyOpenCLArrayContext, + box_id_dtype, particle_id_dtype, box_level_dtype): return ElementwiseKernel( - context, + actx.context, Template(r""" double *neval_locals, ${box_id_t} *target_boxes, @@ -1166,18 +1149,17 @@ def process_eval_locals_knl(self, context, box_id_dtype, particle_id_dtype, name="process_eval_locals" ) - def process_eval_locals(self, queue, traversal, l2p_cost, + def process_eval_locals(self, actx: PyOpenCLArrayContext, traversal, l2p_cost, box_target_counts_nonchild=None): tree = traversal.tree ntarget_boxes = len(traversal.target_boxes) - neval_locals = cl.array.zeros(queue, ntarget_boxes, dtype=np.float64) + neval_locals = actx.zeros(ntarget_boxes, dtype=np.float64) if box_target_counts_nonchild is None: box_target_counts_nonchild = traversal.tree.box_target_counts_nonchild process_eval_locals_knl = self.process_eval_locals_knl( - 
queue.context, - tree.box_id_dtype, tree.particle_id_dtype, tree.box_level_dtype + actx, tree.box_id_dtype, tree.particle_id_dtype, tree.box_level_dtype ) process_eval_locals_knl( @@ -1194,11 +1176,11 @@ def process_eval_locals(self, queue, traversal, l2p_cost, # {{{ propagate locals downward - @memoize_method - def process_refine_locals_knl(self, context, box_id_dtype): + @keyed_memoize_method(key=lambda *args: (type(args[0]), args[1:])) + def process_refine_locals_knl(self, actx: PyOpenCLArrayContext, box_id_dtype): from pyopencl.reduction import ReductionKernel return ReductionKernel( - context, + actx.context, np.float64, neutral="0.0", reduce_expr="a+b", @@ -1216,43 +1198,40 @@ def process_refine_locals_knl(self, context, box_id_dtype): name="process_refine_locals" ) - def process_refine_locals(self, queue, traversal, l2l_cost): + def process_refine_locals(self, actx: PyOpenCLArrayContext, + traversal, l2l_cost): tree = traversal.tree process_refine_locals_knl = self.process_refine_locals_knl( - queue.context, tree.box_id_dtype + actx, tree.box_id_dtype ) - level_start_target_or_target_parent_box_nrs = cl.array.to_device( - queue, traversal.level_start_target_or_target_parent_box_nrs + level_start_target_or_target_parent_box_nrs = actx.thaw( + traversal.level_start_target_or_target_parent_box_nrs ) cost = process_refine_locals_knl( level_start_target_or_target_parent_box_nrs, l2l_cost, range=slice(1, tree.nlevels) - ).get() + ) - return cost.reshape(-1)[0] + return actx.to_numpy(cost).reshape(-1)[0] # }}} - def zero_cost_per_box(self, queue, nboxes): - return cl.array.zeros(queue, (nboxes,), dtype=np.float64) + def zero_cost_per_box(self, actx: PyOpenCLArrayContext, nboxes): + return actx.zeros((nboxes,), dtype=np.float64) - def aggregate_over_boxes(self, per_box_result): + def aggregate_over_boxes(self, actx: PyOpenCLArrayContext, per_box_result): if isinstance(per_box_result, float): return per_box_result else: - return 
cl.array.sum(per_box_result).get().reshape(-1)[0] + return actx.to_numpy(actx.np.sum(per_box_result)).item() def fmm_cost_factors_for_kernels_from_model( - self, queue, nlevels, xlat_cost, context): - if not isinstance(queue, cl.CommandQueue): - raise TypeError( - "An OpenCL command queue must be supplied for cost model") - + self, actx: PyOpenCLArrayContext, nlevels, xlat_cost, context): return AbstractFMMCostModel.fmm_cost_factors_for_kernels_from_model( - self, queue, nlevels, xlat_cost, context + self, actx, nlevels, xlat_cost, context ) # }}} @@ -1261,7 +1240,8 @@ def fmm_cost_factors_for_kernels_from_model( # {{{ _PythonFMMCostModel (undocumented, only used for testing) class _PythonFMMCostModel(AbstractFMMCostModel): - def process_form_multipoles(self, queue, traversal, p2m_cost): + def process_form_multipoles(self, actx: PyOpenCLArrayContext, + traversal, p2m_cost): tree = traversal.tree np2m = np.zeros(len(traversal.source_boxes), dtype=np.float64) @@ -1274,7 +1254,8 @@ def process_form_multipoles(self, queue, traversal, p2m_cost): return np2m - def get_ndirect_sources_per_target_box(self, queue, traversal): + def get_ndirect_sources_per_target_box(self, actx: PyOpenCLArrayContext, + traversal): tree = traversal.tree ntarget_boxes = len(traversal.target_boxes) @@ -1308,7 +1289,8 @@ def get_ndirect_sources_per_target_box(self, queue, traversal): return ndirect_sources_by_itgt_box - def process_direct(self, queue, traversal, ndirect_sources_by_itgt_box, p2p_cost, + def process_direct(self, actx: PyOpenCLArrayContext, + traversal, ndirect_sources_by_itgt_box, p2p_cost, box_target_counts_nonchild=None): if box_target_counts_nonchild is None: box_target_counts_nonchild = traversal.tree.box_target_counts_nonchild @@ -1317,7 +1299,7 @@ def process_direct(self, queue, traversal, ndirect_sources_by_itgt_box, p2p_cost return ntargets_by_itgt_box * ndirect_sources_by_itgt_box * p2p_cost - def process_list2(self, queue, traversal, m2l_cost): + def process_list2(self, 
actx: PyOpenCLArrayContext, traversal, m2l_cost): tree = traversal.tree ntarget_or_target_parent_boxes = len(traversal.target_or_target_parent_boxes) nm2l = np.zeros(ntarget_or_target_parent_boxes, dtype=np.float64) @@ -1330,7 +1312,7 @@ def process_list2(self, queue, traversal, m2l_cost): return nm2l - def process_list3(self, queue, traversal, m2p_cost, + def process_list3(self, actx: PyOpenCLArrayContext, traversal, m2p_cost, box_target_counts_nonchild=None): tree = traversal.tree nm2p = np.zeros(tree.nboxes, dtype=np.float64) @@ -1348,7 +1330,7 @@ def process_list3(self, queue, traversal, m2p_cost, return nm2p - def process_list4(self, queue, traversal, p2l_cost): + def process_list4(self, actx: PyOpenCLArrayContext, traversal, p2l_cost): tree = traversal.tree target_or_target_parent_boxes = traversal.target_or_target_parent_boxes nm2p = np.zeros(len(target_or_target_parent_boxes), dtype=np.float64) @@ -1362,7 +1344,7 @@ def process_list4(self, queue, traversal, p2l_cost): return nm2p - def process_eval_locals(self, queue, traversal, l2p_cost, + def process_eval_locals(self, actx: PyOpenCLArrayContext, traversal, l2p_cost, box_target_counts_nonchild=None): tree = traversal.tree ntarget_boxes = len(traversal.target_boxes) @@ -1380,7 +1362,8 @@ def process_eval_locals(self, queue, traversal, l2p_cost, return neval_locals - def process_coarsen_multipoles(self, queue, traversal, m2m_cost): + def process_coarsen_multipoles(self, actx: PyOpenCLArrayContext, + traversal, m2m_cost): tree = traversal.tree result = 0.0 @@ -1406,7 +1389,7 @@ def process_coarsen_multipoles(self, queue, traversal, m2m_cost): return result - def process_refine_locals(self, queue, traversal, l2l_cost): + def process_refine_locals(self, actx: PyOpenCLArrayContext, traversal, l2l_cost): tree = traversal.tree result = 0.0 @@ -1418,17 +1401,17 @@ def process_refine_locals(self, queue, traversal, l2l_cost): return result - def zero_cost_per_box(self, queue, nboxes): + def zero_cost_per_box(self, 
actx: PyOpenCLArrayContext, nboxes): return np.zeros(nboxes, dtype=np.float64) - def aggregate_over_boxes(self, per_box_result): + def aggregate_over_boxes(self, actx, per_box_result): if isinstance(per_box_result, float): return per_box_result else: return np.sum(per_box_result) def fmm_cost_factors_for_kernels_from_model( - self, queue, nlevels, xlat_cost, context): + self, actx: PyOpenCLArrayContext, nlevels, xlat_cost, context): return AbstractFMMCostModel.fmm_cost_factors_for_kernels_from_model( self, None, nlevels, xlat_cost, context ) From 003d0e0e16c3dbf4af084dff178f0f2327a13c91 Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Sun, 26 Jun 2022 18:39:47 +0300 Subject: [PATCH 14/28] port distributed to arraycontext --- boxtree/distributed/__init__.py | 48 +++-- boxtree/distributed/calculation.py | 247 +++++++++++++------------ boxtree/distributed/local_traversal.py | 22 +-- boxtree/distributed/local_tree.py | 208 +++++++++++---------- boxtree/distributed/partition.py | 128 ++++++------- boxtree/fmm.py | 2 +- 6 files changed, 336 insertions(+), 319 deletions(-) diff --git a/boxtree/distributed/__init__.py b/boxtree/distributed/__init__.py index 495a47a6..73881352 100644 --- a/boxtree/distributed/__init__.py +++ b/boxtree/distributed/__init__.py @@ -88,7 +88,7 @@ Distributed Wrangler -------------------- -.. autoclass:: boxtree.distributed.calculation.DistributedExpansionWrangler +.. autoclass:: boxtree.distributed.calculation.DistributedExpansionWranglerMixin .. _distributed-fmm-evaluation: @@ -97,8 +97,8 @@ The distributed version of the FMM evaluation shares the same interface as the shared-memory version. To evaluate FMM in a distributed manner, use a subclass -of :class:`boxtree.distributed.calculation.DistributedExpansionWrangler` in -:func:`boxtree.fmm.drive_fmm`. +of :class:`boxtree.distributed.calculation.DistributedExpansionWranglerMixin` +in :func:`boxtree.fmm.drive_fmm`. 
""" @@ -108,9 +108,7 @@ import numpy as np from mpi4py import MPI -import pyopencl as cl -import pyopencl.array - +from boxtree.array_context import PyOpenCLArrayContext from boxtree.cost import FMMCostModel @@ -128,9 +126,10 @@ class MPITags(enum.IntEnum): def dtype_to_mpi(dtype): - """ This function translates a numpy datatype into the corresponding type used in + """This function translates a numpy datatype into the corresponding type used in mpi4py. """ + if hasattr(MPI, "_typedict"): typedict = MPI._typedict elif hasattr(MPI, "__TypeDict__"): @@ -151,7 +150,7 @@ def dtype_to_mpi(dtype): # {{{ DistributedFMMRunner def make_distributed_wrangler( - queue, global_tree, traversal_builder, wrangler_factory, + actx: PyOpenCLArrayContext, global_tree, traversal_builder, wrangler_factory, calibration_params, comm): """Helper function for constructing the distributed wrangler on each rank. @@ -163,7 +162,6 @@ def make_distributed_wrangler( where the wrangler is constructed according to *wrangler_factory* and the indices are passed to :func:`boxtree.fmm.drive_fmm`. """ - mpi_rank = comm.Get_rank() # `tree_in_device_memory` is True if the global tree is in the device memory @@ -174,7 +172,7 @@ def make_distributed_wrangler( # worker ranks. 
tree_in_device_memory = None if mpi_rank == 0: - tree_in_device_memory = isinstance(global_tree.targets[0], cl.array.Array) + tree_in_device_memory = isinstance(global_tree.targets[0], actx.array_types) tree_in_device_memory = comm.bcast(tree_in_device_memory, root=0) # {{{ Broadcast the global tree @@ -182,7 +180,7 @@ def make_distributed_wrangler( global_tree_host = None if mpi_rank == 0: if tree_in_device_memory: - global_tree_host = global_tree.get(queue) + global_tree_host = actx.to_numpy(global_tree) else: global_tree_host = global_tree @@ -192,11 +190,11 @@ def make_distributed_wrangler( if mpi_rank == 0 and tree_in_device_memory: global_tree_dev = global_tree else: - global_tree_dev = global_tree_host.to_device(queue) - global_tree_dev = global_tree_dev.with_queue(queue) + global_tree_dev = actx.from_numpy(global_tree_host) + global_tree_dev = actx.thaw(global_tree_dev) - global_trav_dev, _ = traversal_builder(queue, global_tree_dev) - global_trav_host = global_trav_dev.get(queue) + global_trav_dev, _ = traversal_builder(actx, global_tree_dev) + global_trav_host = actx.to_numpy(global_trav_dev) global_trav = global_trav_dev if tree_in_device_memory else global_trav_host # }}} @@ -215,16 +213,16 @@ def make_distributed_wrangler( warnings.warn("Calibration parameters for the cost model are not " "supplied. 
The default one will be used.", stacklevel=2) - calibration_params = \ - FMMCostModel.get_unit_calibration_params() + calibration_params = FMMCostModel.get_unit_calibration_params() # We need to construct a wrangler in order to access `level_orders` global_wrangler = wrangler_factory(global_trav, global_trav) cost_per_box = cost_model.cost_per_box( - queue, global_trav_dev, global_wrangler.level_orders, + actx, global_trav_dev, global_wrangler.level_orders, calibration_params - ).get() + ) + cost_per_box = actx.to_numpy(cost_per_box) from boxtree.distributed.partition import partition_work responsible_boxes_list = partition_work(cost_per_box, global_trav_host, comm) @@ -235,7 +233,7 @@ def make_distributed_wrangler( from boxtree.distributed.local_tree import generate_local_tree local_tree, src_idx, tgt_idx = generate_local_tree( - queue, global_trav_host, responsible_boxes_list, comm) + actx, global_trav_dev, actx.from_numpy(responsible_boxes_list), comm) # }}} @@ -249,12 +247,12 @@ def make_distributed_wrangler( # {{{ Compute traversal object on each rank from boxtree.distributed.local_traversal import generate_local_travs - local_trav_dev = generate_local_travs(queue, local_tree, traversal_builder) + local_trav_dev = generate_local_travs(actx, local_tree, traversal_builder) if not tree_in_device_memory: - local_trav = local_trav_dev.get(queue=queue) + local_trav = actx.to_numpy(local_trav_dev) else: - local_trav = local_trav_dev.with_queue(None) + local_trav = actx.freeze(local_trav_dev) # }}} @@ -269,7 +267,7 @@ class DistributedFMMRunner: .. automethod:: __init__ .. 
automethod:: drive_dfmm """ - def __init__(self, queue, global_tree, + def __init__(self, array_context: PyOpenCLArrayContext, global_tree, traversal_builder, wrangler_factory, calibration_params=None, comm=MPI.COMM_WORLD): @@ -292,7 +290,7 @@ def __init__(self, queue, global_tree, """ self.wrangler, self.src_idx_all_ranks, self.tgt_idx_all_ranks = \ make_distributed_wrangler( - queue, global_tree, traversal_builder, wrangler_factory, + array_context, global_tree, traversal_builder, wrangler_factory, calibration_params, comm) def drive_dfmm(self, source_weights, timing_data=None): diff --git a/boxtree/distributed/calculation.py b/boxtree/distributed/calculation.py index 3bda2d3b..22ad296d 100644 --- a/boxtree/distributed/calculation.py +++ b/boxtree/distributed/calculation.py @@ -29,13 +29,12 @@ from mako.template import Template from mpi4py import MPI -import pyopencl as cl from pyopencl.elementwise import ElementwiseKernel from pyopencl.tools import dtype_to_ctype from pytools import memoize_method +from boxtree.array_context import PyOpenCLArrayContext from boxtree.distributed import MPITags -from boxtree.fmm import ExpansionWranglerInterface from boxtree.pyfmmlib_integration import FMMLibExpansionWrangler @@ -44,35 +43,41 @@ # {{{ Distributed FMM wrangler -class DistributedExpansionWrangler(ExpansionWranglerInterface): - """Distributed expansion wrangler base class. +class DistributedExpansionWranglerMixin: + """Distributed expansion wrangler helper class. - This is an abstract class and should not be directly instantiated. Instead, it is - expected that all distributed wranglers should be subclasses of this class. + This class is meant to aid in adding distributed capabilities to wranglers. + All distributed wranglers should inherit from this class. + + .. attribute:: comm + .. attribute:: global_traversal + .. attribute:: communicate_mpoles_via_allreduce - .. automethod:: __init__ .. automethod:: distribute_source_weights .. 
automethod:: gather_potential_results .. automethod:: communicate_mpoles """ - def __init__(self, context, comm, global_traversal, - traversal_in_device_memory, - communicate_mpoles_via_allreduce=False): - self.context = context - self.comm = comm - self.global_traversal = global_traversal - self.traversal_in_device_memory = traversal_in_device_memory - self.communicate_mpoles_via_allreduce = communicate_mpoles_via_allreduce - def distribute_source_weights(self, src_weight_vecs, src_idx_all_ranks): - mpi_rank = self.comm.Get_rank() - mpi_size = self.comm.Get_size() + @property + @memoize_method + def mpi_rank(self): + return self.comm.Get_rank() + + @property + @memoize_method + def mpi_size(self): + return self.comm.Get_size() + + @property + def is_mpi_root(self): + return self.mpi_rank == 0 - if mpi_rank == 0: + def distribute_source_weights(self, src_weight_vecs, src_idx_all_ranks): + if self.is_mpi_root: distribute_weight_req = [] - local_src_weight_vecs = np.empty((mpi_size,), dtype=object) + local_src_weight_vecs = np.empty((self.mpi_size,), dtype=object) - for irank in range(mpi_size): + for irank in range(self.mpi_size): local_src_weight_vecs[irank] = [ source_weights[src_idx_all_ranks[irank]] for source_weights in src_weight_vecs] @@ -91,22 +96,18 @@ def distribute_source_weights(self, src_weight_vecs, src_idx_all_ranks): return local_src_weight_vecs def gather_potential_results(self, potentials, tgt_idx_all_ranks): - mpi_rank = self.comm.Get_rank() - mpi_size = self.comm.Get_size() - from boxtree.distributed import dtype_to_mpi potentials_mpi_type = dtype_to_mpi(potentials.dtype) - gathered_potentials = None - if mpi_rank == 0: + if self.is_mpi_root: # The root rank received calculated potentials from all worker ranks - potentials_all_ranks = np.empty((mpi_size,), dtype=object) + potentials_all_ranks = np.empty((self.mpi_size,), dtype=object) potentials_all_ranks[0] = potentials recv_reqs = [] - for irank in range(1, mpi_size): + for irank in range(1, 
self.mpi_size): potentials_all_ranks[irank] = np.empty( tgt_idx_all_ranks[irank].shape, dtype=potentials.dtype) @@ -121,7 +122,7 @@ def gather_potential_results(self, potentials, tgt_idx_all_ranks): gathered_potentials = np.empty( self.global_traversal.tree.ntargets, dtype=potentials.dtype) - for irank in range(mpi_size): + for irank in range(self.mpi_size): gathered_potentials[tgt_idx_all_ranks[irank]] = ( potentials_all_ranks[irank]) else: @@ -135,8 +136,13 @@ def _slice_mpoles(self, mpoles, slice_indices): if len(slice_indices) == 0: return np.empty((0,), dtype=mpoles.dtype) + level_start_box_nrs = self.traversal.tree.level_start_box_nrs + if not isinstance(level_start_box_nrs, np.ndarray): + actx = self.tree_indep._setup_actx + level_start_box_nrs = actx.to_numpy(level_start_box_nrs) + level_start_slice_indices = np.searchsorted( - slice_indices, self.traversal.tree.level_start_box_nrs) + slice_indices, level_start_box_nrs) mpoles_list = [] for ilevel in range(self.traversal.tree.nlevels): @@ -156,8 +162,13 @@ def _update_mpoles(self, mpoles, mpole_updates, slice_indices): if len(slice_indices) == 0: return + level_start_box_nrs = self.traversal.tree.level_start_box_nrs + if not isinstance(level_start_box_nrs, np.ndarray): + actx = self.tree_indep._setup_actx + level_start_box_nrs = actx.to_numpy(level_start_box_nrs) + level_start_slice_indices = np.searchsorted( - slice_indices, self.traversal.tree.level_start_box_nrs) + slice_indices, level_start_box_nrs) mpole_updates_start = 0 for ilevel in range(self.traversal.tree.nlevels): @@ -178,60 +189,61 @@ def _update_mpoles(self, mpoles, mpole_updates, slice_indices): mpole_updates_start = mpole_updates_end - @memoize_method - def find_boxes_used_by_subrange_kernel(self, box_id_dtype): - return ElementwiseKernel( - self.context, - Template(r""" - ${box_id_t} *contributing_boxes_list, - int subrange_start, - int subrange_end, - ${box_id_t} *box_to_user_rank_starts, - int *box_to_user_rank_lists, - char 
*box_in_subrange - """).render( - box_id_t=dtype_to_ctype(box_id_dtype), - ), - Template(r""" - ${box_id_t} ibox = contributing_boxes_list[i]; - ${box_id_t} iuser_start = box_to_user_rank_starts[ibox]; - ${box_id_t} iuser_end = box_to_user_rank_starts[ibox + 1]; - for(${box_id_t} iuser = iuser_start; iuser < iuser_end; iuser++) { - int useri = box_to_user_rank_lists[iuser]; - if(subrange_start <= useri && useri < subrange_end) { - box_in_subrange[i] = 1; + def find_boxes_used_by_subrange_kernel(self, actx, box_id_dtype): + from pytools import memoize_in + + @memoize_in(actx, (type(self), box_id_dtype)) + def get_kernel(): + return ElementwiseKernel( + actx.context, + Template(r""" + ${box_id_t} *contributing_boxes_list, + int subrange_start, + int subrange_end, + ${box_id_t} *box_to_user_rank_starts, + int *box_to_user_rank_lists, + char *box_in_subrange + """).render( + box_id_t=dtype_to_ctype(box_id_dtype), + ), + Template(r""" + ${box_id_t} ibox = contributing_boxes_list[i]; + ${box_id_t} iuser_start = box_to_user_rank_starts[ibox]; + ${box_id_t} iuser_end = box_to_user_rank_starts[ibox + 1]; + for(${box_id_t} iuser = iuser_start; iuser < iuser_end; iuser++) { + int useri = box_to_user_rank_lists[iuser]; + if(subrange_start <= useri && useri < subrange_end) { + box_in_subrange[i] = 1; + } } - } - """).render( - box_id_t=dtype_to_ctype(box_id_dtype) - ), - "find_boxes_used_by_subrange" - ) + """).render( + box_id_t=dtype_to_ctype(box_id_dtype) + ), + "find_boxes_used_by_subrange" + ) + + return get_kernel() def find_boxes_used_by_subrange( - self, subrange, box_to_user_rank_starts, box_to_user_rank_lists, + self, actx: PyOpenCLArrayContext, + subrange, box_to_user_rank_starts, box_to_user_rank_lists, contributing_boxes_list): """Test whether the multipole expansions of the contributing boxes are used by at least one box in a range. :arg subrange: the range is represented by ``(subrange[0], subrange[1])``. 
- :arg box_to_user_rank_starts: a :class:`pyopencl.array.Array` object - indicating the start and end index in *box_to_user_rank_lists* for each + :arg box_to_user_rank_starts: an array object indicating the start and + end index in *box_to_user_rank_lists* for each box in + *contributing_boxes_list*. + :arg box_to_user_rank_lists: an array object storing the users of each box in *contributing_boxes_list*. - :arg box_to_user_rank_lists: a :class:`pyopencl.array.Array` object storing - the users of each box in *contributing_boxes_list*. - :returns: a :class:`pyopencl.array.Array` object with the same shape as - *contributing_boxes_list*, where the i-th entry is 1 if - ``contributing_boxes_list[i]`` is used by at least on box in the - subrange specified. + :returns: an array object with the same shape as *contributing_boxes_list*, + where the i-th entry is 1 if ``contributing_boxes_list[i]`` is used + by at least on box in the subrange specified. """ - box_in_subrange = cl.array.zeros( - contributing_boxes_list.queue, - contributing_boxes_list.shape[0], - dtype=np.int8 - ) + box_in_subrange = actx.zeros(contributing_boxes_list.shape[0], dtype=np.int8) knl = self.find_boxes_used_by_subrange_kernel( - self.traversal.tree.box_id_dtype) + actx, self.traversal.tree.box_id_dtype) knl( contributing_boxes_list, @@ -244,7 +256,8 @@ def find_boxes_used_by_subrange( return box_in_subrange - def communicate_mpoles(self, mpole_exps, return_stats=False): + def communicate_mpoles(self, actx: PyOpenCLArrayContext, + mpole_exps, return_stats=False): """Based on Algorithm 3: Reduce and Scatter in Lashuk et al. [1]_. The main idea is to mimic an allreduce as done on a hypercube network, but to @@ -253,12 +266,11 @@ def communicate_mpoles(self, mpole_exps, return_stats=False): .. [1] Lashuk, Ilya, Aparna Chandramowlishwaran, Harper Langston, Tuan-Anh Nguyen, Rahul Sampath, Aashay Shringarpure, Richard Vuduc, - Lexing Ying, Denis Zorin, and George Biros. 
“A massively parallel - adaptive fast multipole method on heterogeneous architectures." - Communications of the ACM 55, no. 5 (2012): 101-109. + Lexing Ying, Denis Zorin, and George Biros. "A massively parallel + adaptive fast multipole method on heterogeneous architectures", + Communications of the ACM 55, no. 5 (2012): 101-109, + `DOI `__. """ - mpi_rank = self.comm.Get_rank() - mpi_size = self.comm.Get_size() tree = self.traversal.tree if self.communicate_mpoles_via_allreduce: @@ -284,16 +296,15 @@ def communicate_mpoles(self, mpole_exps, return_stats=False): # Initially, this set consists of the boxes satisfying condition (a), which # are precisely the boxes owned by this process and their ancestors. if self.traversal_in_device_memory: - with cl.CommandQueue(self.context) as queue: - contributing_boxes = tree.ancestor_mask.get(queue=queue) - responsible_boxes_list = tree.responsible_boxes_list.get(queue=queue) + contributing_boxes = actx.to_numpy(tree.ancestor_mask) + responsible_boxes_list = actx.to_numpy(tree.responsible_boxes_list) else: - contributing_boxes = tree.ancestor_mask.copy() + contributing_boxes = np.copy(tree.ancestor_mask) responsible_boxes_list = tree.responsible_boxes_list contributing_boxes[responsible_boxes_list] = 1 from boxtree.tools import AllReduceCommPattern - comm_pattern = AllReduceCommPattern(mpi_rank, mpi_size) + comm_pattern = AllReduceCommPattern(self.mpi_rank, self.mpi_size) # Temporary buffers for receiving data mpole_exps_buf = np.empty(mpole_exps.shape, dtype=mpole_exps.dtype) @@ -303,15 +314,13 @@ def communicate_mpoles(self, mpole_exps, return_stats=False): stats["bytes_recvd_by_stage"] = [] if self.traversal_in_device_memory: - box_to_user_rank_starts_dev = \ - tree.box_to_user_rank_starts.with_queue(None) - box_to_user_rank_lists_dev = tree.box_to_user_rank_lists.with_queue(None) + box_to_user_rank_starts_dev = actx.freeze(tree.box_to_user_rank_starts) + box_to_user_rank_lists_dev = actx.freeze(tree.box_to_user_rank_lists) 
else: - with cl.CommandQueue(self.context) as queue: - box_to_user_rank_starts_dev = cl.array.to_device( - queue, tree.box_to_user_rank_starts).with_queue(None) - box_to_user_rank_lists_dev = cl.array.to_device( - queue, tree.box_to_user_rank_lists).with_queue(None) + box_to_user_rank_starts_dev = actx.freeze( + actx.from_numpy(tree.box_to_user_rank_starts)) + box_to_user_rank_lists_dev = actx.freeze( + actx.from_numpy(tree.box_to_user_rank_lists)) while not comm_pattern.done(): send_requests = [] @@ -325,18 +334,15 @@ def communicate_mpoles(self, mpole_exps, return_stats=False): tree.box_id_dtype ) - with cl.CommandQueue(self.context) as queue: - contributing_boxes_list_dev = cl.array.to_device( - queue, contributing_boxes_list) - - box_in_subrange = self.find_boxes_used_by_subrange( - message_subrange, - box_to_user_rank_starts_dev, box_to_user_rank_lists_dev, - contributing_boxes_list_dev - ) - - box_in_subrange_host = box_in_subrange.get().astype(bool) + contributing_boxes_list_dev = actx.from_numpy( + contributing_boxes_list) + box_in_subrange = self.find_boxes_used_by_subrange( + actx, message_subrange, + box_to_user_rank_starts_dev, box_to_user_rank_lists_dev, + contributing_boxes_list_dev + ) + box_in_subrange_host = actx.to_numpy(box_in_subrange).astype(bool) relevant_boxes_list = contributing_boxes_list[ box_in_subrange_host ].astype(tree.box_id_dtype) @@ -385,7 +391,7 @@ def communicate_mpoles(self, mpole_exps, return_stats=False): # Update data structures. 
self._update_mpoles( - mpole_exps, mpole_exps_buf, boxes_list_buf[:nboxes]) + mpole_exps, mpole_exps_buf, boxes_list_buf[:nboxes]) contributing_boxes[boxes_list_buf[:nboxes]] = 1 @@ -397,38 +403,43 @@ def communicate_mpoles(self, mpole_exps, return_stats=False): if return_stats: return stats - def finalize_potentials(self, potentials, template_ary): - if self.comm.Get_rank() == 0: - return super().finalize_potentials(potentials, template_ary) - else: - return None - class DistributedFMMLibExpansionWrangler( - DistributedExpansionWrangler, FMMLibExpansionWrangler): + DistributedExpansionWranglerMixin, + FMMLibExpansionWrangler): def __init__( - self, context, comm, tree_indep, local_traversal, global_traversal, + self, comm, tree_indep, local_traversal, global_traversal, fmm_level_to_order=None, communicate_mpoles_via_allreduce=False, **kwargs): - DistributedExpansionWrangler.__init__( - self, context, comm, global_traversal, False, - communicate_mpoles_via_allreduce=communicate_mpoles_via_allreduce) FMMLibExpansionWrangler.__init__( self, tree_indep, local_traversal, fmm_level_to_order=fmm_level_to_order, **kwargs) + self.comm = comm + self.traversal_in_device_memory = False + self.global_traversal = global_traversal + self.communicate_mpoles_via_allreduce = communicate_mpoles_via_allreduce + # TODO: use log_process like FMMLibExpansionWrangler? 
def reorder_sources(self, source_array): - if self.comm.Get_rank() == 0: + if self.is_mpi_root: return source_array[..., self.global_traversal.tree.user_source_ids] else: return None def reorder_potentials(self, potentials): - if self.comm.Get_rank() == 0: + if self.is_mpi_root: return potentials[self.global_traversal.tree.sorted_target_ids] else: return None + def finalize_potentials(self, potentials, template_ary): + if self.is_mpi_root: + return super().finalize_potentials(potentials, template_ary) + else: + return None + # }}} + +# vim: fdm=marker diff --git a/boxtree/distributed/local_traversal.py b/boxtree/distributed/local_traversal.py index 6c6fbc4a..dff4ee7e 100644 --- a/boxtree/distributed/local_traversal.py +++ b/boxtree/distributed/local_traversal.py @@ -29,34 +29,30 @@ def generate_local_travs( - queue, local_tree, traversal_builder, merge_close_lists=False): + actx, local_tree, traversal_builder, merge_close_lists=False): """Generate local traversal from local tree. - :arg queue: a :class:`pyopencl.CommandQueue` object. - :arg local_tree: the local tree of class - `boxtree.tools.ImmutableHostDeviceArray` on which the local traversal - object will be constructed. - :arg traversal_builder: a function, taken a :class:`pyopencl.CommandQueue` and - a tree, returns the traversal object based on the tree. + :arg local_tree: the local tree on which the local traversal object will + be constructed. + :arg traversal_builder: a function, taken a :class:`arraycontext.ArrayContext` + and a tree, returns the traversal object based on the tree. :return: generated local traversal object in device memory """ start_time = time.time() - local_tree.with_queue(queue) - # We need `source_boxes_mask` and `source_parent_boxes_mask` here to restrict the # multipole formation and upward propagation within the rank's responsible boxes # region. Had there not been such restrictions, some sources might be distributed # to more than 1 rank and counted multiple times. 
local_trav, _ = traversal_builder( - queue, local_tree.to_device(queue), - source_boxes_mask=local_tree.responsible_boxes_mask.device, - source_parent_boxes_mask=local_tree.ancestor_mask.device + actx, local_tree, + source_boxes_mask=local_tree.responsible_boxes_mask, + source_parent_boxes_mask=local_tree.ancestor_mask ) if merge_close_lists and local_tree.targets_have_extent: - local_trav = local_trav.merge_close_lists(queue) + local_trav = local_trav.merge_close_lists(actx) logger.info("Generate local traversal in %f sec.", time.time() - start_time) diff --git a/boxtree/distributed/local_tree.py b/boxtree/distributed/local_tree.py index 91eacae9..1a5bb1a6 100644 --- a/boxtree/distributed/local_tree.py +++ b/boxtree/distributed/local_tree.py @@ -26,13 +26,15 @@ from dataclasses import dataclass import numpy as np +from arraycontext import Array, ArrayOrContainer from mako.template import Template -import pyopencl as cl +from pyopencl.elementwise import ElementwiseKernel from pyopencl.tools import dtype_to_ctype from pytools import memoize_method from boxtree import Tree +from boxtree.array_context import PyOpenCLArrayContext, dataclass_array_container logger = logging.getLogger(__name__) @@ -48,16 +50,21 @@ class LocalTreeGeneratorCodeContainer: """Objects of this type serve as a place to keep the code needed for :func:`generate_local_tree`. 
""" - def __init__(self, cl_context, dimensions, particle_id_dtype, coord_dtype): - self.cl_context = cl_context + def __init__(self, array_context: PyOpenCLArrayContext, + dimensions, particle_id_dtype, coord_dtype): + self._setup_actx = array_context self.dimensions = dimensions self.particle_id_dtype = particle_id_dtype self.coord_dtype = coord_dtype + @property + def context(self): + return self._setup_actx.context + @memoize_method def particle_mask_kernel(self): - return cl.elementwise.ElementwiseKernel( - self.cl_context, + return ElementwiseKernel( + self.context, arguments=Template(""" __global char *responsible_boxes, __global ${particle_id_t} *box_particle_starts, @@ -82,7 +89,7 @@ def particle_mask_kernel(self): def mask_scan_kernel(self): from pyopencl.scan import GenericScanKernel return GenericScanKernel( - self.cl_context, self.particle_id_dtype, + self.context, self.particle_id_dtype, arguments=Template(""" __global ${mask_t} *ary, __global ${mask_t} *scan @@ -123,8 +130,8 @@ def mask_scan_kernel(self): @memoize_method def fetch_local_particles_kernel(self, particles_have_extent): - return cl.elementwise.ElementwiseKernel( - self.cl_context, + return ElementwiseKernel( + self.context, self.fetch_local_particles_arguments.render( mask_t=dtype_to_ctype(self.particle_id_dtype), coord_t=dtype_to_ctype(self.coord_dtype), @@ -141,15 +148,15 @@ def fetch_local_particles_kernel(self, particles_have_extent): @memoize_method def mask_compressor_kernel(self): from boxtree.tools import MaskCompressorKernel - return MaskCompressorKernel(self.cl_context) + return MaskCompressorKernel(self._setup_actx) @memoize_method def modify_target_flags_kernel(self): from boxtree import box_flags_enum box_flag_t = dtype_to_ctype(box_flags_enum.dtype) - return cl.elementwise.ElementwiseKernel( - self.cl_context, + return ElementwiseKernel( + self.context, Template(""" __global ${particle_id_t} *box_target_counts_nonchild, __global ${particle_id_t} *box_target_counts_cumul, @@ 
-173,18 +180,19 @@ def modify_target_flags_kernel(self): ) -@dataclass +@dataclass(frozen=True) class LocalParticlesAndLists: - particles: np.ndarray - particle_radii: cl.array.Array | None - box_particle_starts: cl.array.Array - box_particle_counts_nonchild: cl.array.Array - box_particle_counts_cumul: cl.array.Array + particles: ArrayOrContainer + particle_radii: Array | None + box_particle_starts: Array + box_particle_counts_nonchild: Array + box_particle_counts_cumul: Array particle_idx: np.ndarray def construct_local_particles_and_lists( - queue, code, dimensions, num_boxes, num_global_particles, + actx: PyOpenCLArrayContext, + code, dimensions, num_boxes, num_global_particles, particle_id_dtype, coord_dtype, particles_have_extent, box_mask, global_particles, global_particle_radii, @@ -195,18 +203,19 @@ def construct_local_particles_and_lists( """ # {{{ calculate the particle mask - particle_mask = cl.array.zeros( - queue, num_global_particles, dtype=particle_id_dtype) - + particle_mask = actx.zeros(num_global_particles, dtype=particle_id_dtype) code.particle_mask_kernel()( - box_mask, box_particle_starts, box_particle_counts_nonchild, particle_mask) + box_mask, + box_particle_starts, + box_particle_counts_nonchild, + particle_mask) # }}} # {{{ calculate the scan of the particle mask - global_to_local_particle_index = cl.array.empty( - queue, num_global_particles + 1, dtype=particle_id_dtype) + global_to_local_particle_index = actx.np.zeros( + num_global_particles + 1, dtype=particle_id_dtype) global_to_local_particle_index[0] = 0 code.mask_scan_kernel()(particle_mask, global_to_local_particle_index) @@ -215,19 +224,18 @@ def construct_local_particles_and_lists( # {{{ fetch the local particles - num_local_particles = global_to_local_particle_index[-1].get(queue).item() - - local_particles = [ - cl.array.empty(queue, num_local_particles, dtype=coord_dtype) - for _ in range(dimensions)] + from pytools.obj_array import make_obj_array + num_local_particles = 
actx.to_numpy(global_to_local_particle_index[-1]).item() + local_particles = make_obj_array([ + actx.zeros(num_local_particles, coord_dtype) + for _ in range(dimensions) + ]) from pytools.obj_array import make_obj_array local_particles = make_obj_array(local_particles) - local_particle_radii = None if particles_have_extent: - local_particle_radii = cl.array.empty( - queue, num_local_particles, dtype=coord_dtype) + local_particle_radii = actx.np.zeros(num_local_particles, dtype=coord_dtype) code.fetch_local_particles_kernel(True)( particle_mask, global_to_local_particle_index, @@ -236,6 +244,7 @@ def construct_local_particles_and_lists( global_particle_radii, local_particle_radii) else: + local_particle_radii = None code.fetch_local_particles_kernel(False)( particle_mask, global_to_local_particle_index, *global_particles.tolist(), @@ -245,9 +254,9 @@ def construct_local_particles_and_lists( local_box_particle_starts = global_to_local_particle_index[box_particle_starts] - box_counts_all_zeros = cl.array.zeros(queue, num_boxes, dtype=particle_id_dtype) + box_counts_all_zeros = actx.zeros(num_boxes, dtype=particle_id_dtype) - local_box_particle_counts_nonchild = cl.array.if_positive( + local_box_particle_counts_nonchild = actx.np.where( box_mask, box_particle_counts_nonchild, box_counts_all_zeros) box_particle_ends_cumul = box_particle_starts + box_particle_counts_cumul @@ -258,18 +267,20 @@ def construct_local_particles_and_lists( # }}} - particle_mask = particle_mask.get(queue=queue).astype(bool) + particle_mask = actx.to_numpy(particle_mask).astype(bool) particle_idx = np.arange(num_global_particles)[particle_mask] return LocalParticlesAndLists( - local_particles, - local_particle_radii, - local_box_particle_starts, - local_box_particle_counts_nonchild, - local_box_particle_counts_cumul, - particle_idx) + particles=local_particles, + particle_radii=local_particle_radii, + box_particle_starts=local_box_particle_starts, + 
box_particle_counts_nonchild=local_box_particle_counts_nonchild, + box_particle_counts_cumul=local_box_particle_counts_cumul, + particle_idx=particle_idx) +@dataclass_array_container +@dataclass(frozen=True) class LocalTree(Tree): """ Inherits from :class:`boxtree.Tree`. @@ -288,13 +299,21 @@ class LocalTree(Tree): propagated from an ancestor) List 2. """ + box_to_user_rank_starts: Array + box_to_user_rank_lists: Array + + responsible_boxes_list: Array + responsible_boxes_mask: Array + ancestor_mask: Array + -def generate_local_tree(queue, global_traversal, responsible_boxes_list, comm): +def generate_local_tree( + actx: PyOpenCLArrayContext, + global_traversal, responsible_boxes_list, comm): """Generate the local tree for the current rank. This is an MPI-collective routine on *comm*. - :arg queue: a :class:`pyopencl.CommandQueue` object. :arg global_traversal: Global :class:`boxtree.traversal.FMMTraversalInfo` object on host memory. :arg responsible_boxes_list: a :class:`numpy.ndarray` object containing the @@ -307,9 +326,9 @@ def generate_local_tree(queue, global_traversal, responsible_boxes_list, comm): global tree. ``src_idx`` and ``tgt_idx`` are needed for distributing source weights from root rank and assembling calculated potentials on the root rank. 
""" - global_tree = global_traversal.tree + global_tree = actx.thaw(global_traversal.tree) code = LocalTreeGeneratorCodeContainer( - queue.context, global_tree.dimensions, + actx, global_tree.dimensions, global_tree.particle_id_dtype, global_tree.coord_dtype) mpi_rank = comm.Get_rank() @@ -318,33 +337,31 @@ def generate_local_tree(queue, global_traversal, responsible_boxes_list, comm): start_time = time.time() from boxtree.distributed.partition import get_box_masks - box_masks = get_box_masks(queue, global_traversal, responsible_boxes_list) - - global_tree_dev = global_tree.to_device(queue).with_queue(queue) + box_masks = get_box_masks(actx, global_traversal, responsible_boxes_list) local_sources_and_lists = construct_local_particles_and_lists( - queue, code, global_tree.dimensions, global_tree.nboxes, + actx, code, global_tree.dimensions, global_tree.nboxes, global_tree.nsources, global_tree.particle_id_dtype, global_tree.coord_dtype, global_tree.sources_have_extent, box_masks.point_src_boxes, - global_tree_dev.sources, - global_tree_dev.sources_radii if global_tree.sources_have_extent else None, - global_tree_dev.box_source_starts, - global_tree_dev.box_source_counts_nonchild, - global_tree_dev.box_source_counts_cumul) + global_tree.sources, + global_tree.sources_radii if global_tree.sources_have_extent else None, + global_tree.box_source_starts, + global_tree.box_source_counts_nonchild, + global_tree.box_source_counts_cumul) local_targets_and_lists = construct_local_particles_and_lists( - queue, code, global_tree.dimensions, global_tree.nboxes, + actx, code, global_tree.dimensions, global_tree.nboxes, global_tree.ntargets, global_tree.particle_id_dtype, global_tree.coord_dtype, global_tree.targets_have_extent, box_masks.responsible_boxes, - global_tree_dev.targets, - global_tree_dev.target_radii if global_tree.targets_have_extent else None, - global_tree_dev.box_target_starts, - global_tree_dev.box_target_counts_nonchild, - 
global_tree_dev.box_target_counts_cumul) + global_tree.targets, + global_tree.target_radii if global_tree.targets_have_extent else None, + global_tree.box_target_starts, + global_tree.box_target_counts_nonchild, + global_tree.box_target_counts_cumul) # {{{ compute the users of multipole expansions of each box on the root rank @@ -354,24 +371,26 @@ def generate_local_tree(queue, global_traversal, responsible_boxes_list, comm): (mpi_size, global_tree.nboxes), dtype=box_masks.multipole_src_boxes.dtype) comm.Gather( - box_masks.multipole_src_boxes.get(), multipole_src_boxes_all_ranks, root=0) + actx.to_numpy(box_masks.multipole_src_boxes), + multipole_src_boxes_all_ranks, root=0) box_to_user_rank_starts = None box_to_user_rank_lists = None if mpi_rank == 0: - multipole_src_boxes_all_ranks = cl.array.to_device( - queue, multipole_src_boxes_all_ranks) + multipole_src_boxes_all_ranks = actx.from_numpy( + multipole_src_boxes_all_ranks) (box_to_user_rank_starts, box_to_user_rank_lists, evt) = \ code.mask_compressor_kernel()( - queue, multipole_src_boxes_all_ranks.transpose(), + actx, multipole_src_boxes_all_ranks.transpose(), list_dtype=np.int32) - cl.wait_for_events([evt]) + from pyopencl import wait_for_events + wait_for_events([evt]) - box_to_user_rank_starts = box_to_user_rank_starts.get() - box_to_user_rank_lists = box_to_user_rank_lists.get() + box_to_user_rank_starts = actx.to_numpy(box_to_user_rank_starts) + box_to_user_rank_lists = actx.to_numpy(box_to_user_rank_lists) logger.debug("computing box_to_user: done") @@ -388,7 +407,7 @@ def generate_local_tree(queue, global_traversal, responsible_boxes_list, comm): # expansions formed by sources in other ranks. Modifying the source box flags # could result in incomplete interaction lists. 
- local_box_flags = global_tree_dev.box_flags.copy(queue=queue) + local_box_flags = actx.np.copy(global_tree.box_flags) code.modify_target_flags_kernel()( local_targets_and_lists.box_particle_counts_nonchild, local_targets_and_lists.box_particle_counts_cumul, @@ -396,14 +415,6 @@ def generate_local_tree(queue, global_traversal, responsible_boxes_list, comm): # }}} - from pytools.obj_array import make_obj_array - local_sources = make_obj_array([ - local_sources_idim.get(queue=queue) - for local_sources_idim in local_sources_and_lists.particles]) - local_targets = make_obj_array([ - local_target_idim.get(queue=queue) - for local_target_idim in local_targets_and_lists.particles]) - local_tree = LocalTree( sources_are_targets=global_tree.sources_are_targets, sources_have_extent=global_tree.sources_have_extent, @@ -420,33 +431,34 @@ def generate_local_tree(queue, global_traversal, responsible_boxes_list, comm): bounding_box=global_tree.bounding_box, level_start_box_nrs=global_tree.level_start_box_nrs, - level_start_box_nrs_dev=global_tree.level_start_box_nrs_dev, - sources=local_sources, - targets=local_targets, - source_radii=(local_sources_and_lists.particle_radii.get(queue=queue) + sources=local_sources_and_lists.particles, + targets=local_targets_and_lists.particles, + source_radii=( + local_sources_and_lists.particle_radii if global_tree.sources_have_extent else None), - target_radii=(local_targets_and_lists.particle_radii.get(queue=queue) + target_radii=( + local_targets_and_lists.particle_radii if global_tree.targets_have_extent else None), box_source_starts=( - local_sources_and_lists.box_particle_starts.get(queue=queue)), + local_sources_and_lists.box_particle_starts), box_source_counts_nonchild=( - local_sources_and_lists.box_particle_counts_nonchild.get(queue=queue)), + local_sources_and_lists.box_particle_counts_nonchild), box_source_counts_cumul=( - local_sources_and_lists.box_particle_counts_cumul.get(queue=queue)), + 
local_sources_and_lists.box_particle_counts_cumul), box_target_starts=( - local_targets_and_lists.box_particle_starts.get(queue=queue)), + local_targets_and_lists.box_particle_starts), box_target_counts_nonchild=( - local_targets_and_lists.box_particle_counts_nonchild.get(queue=queue)), + local_targets_and_lists.box_particle_counts_nonchild), box_target_counts_cumul=( - local_targets_and_lists.box_particle_counts_cumul.get(queue=queue)), + local_targets_and_lists.box_particle_counts_cumul), box_parent_ids=global_tree.box_parent_ids, box_child_ids=global_tree.box_child_ids, box_centers=global_tree.box_centers, box_levels=global_tree.box_levels, - box_flags=local_box_flags.get(queue=queue), + box_flags=local_box_flags, user_source_ids=None, sorted_target_ids=None, @@ -459,19 +471,19 @@ def generate_local_tree(queue, global_traversal, responsible_boxes_list, comm): _is_pruned=global_tree._is_pruned, responsible_boxes_list=responsible_boxes_list, - responsible_boxes_mask=box_masks.responsible_boxes.get(), - ancestor_mask=box_masks.ancestor_boxes.get(), - box_to_user_rank_starts=box_to_user_rank_starts, - box_to_user_rank_lists=box_to_user_rank_lists + responsible_boxes_mask=box_masks.responsible_boxes, + ancestor_mask=box_masks.ancestor_boxes, + box_to_user_rank_starts=actx.from_numpy(box_to_user_rank_starts), + box_to_user_rank_lists=actx.from_numpy(box_to_user_rank_lists), ) - local_tree = local_tree.to_host_device_array(queue) - local_tree.with_queue(None) - - logger.info("Generate local tree on rank %d in %f sec.", - mpi_rank, time.time() - start_time) + logger.info("Generate local tree on rank %d in %s sec.", + mpi_rank, time.time() - start_time + ) return ( - local_tree, + actx.freeze(local_tree), local_sources_and_lists.particle_idx, local_targets_and_lists.particle_idx) + +# vim: fdm=marker diff --git a/boxtree/distributed/partition.py b/boxtree/distributed/partition.py index d646f61b..95f40037 100644 --- a/boxtree/distributed/partition.py +++ 
b/boxtree/distributed/partition.py @@ -24,12 +24,15 @@ from dataclasses import dataclass import numpy as np +from arraycontext import Array from mako.template import Template -import pyopencl as cl +from pyopencl.elementwise import ElementwiseKernel from pyopencl.tools import dtype_to_ctype from pytools import memoize_method +from boxtree.array_context import PyOpenCLArrayContext + def get_box_ids_dfs_order(tree): """Helper function for getting box ids of a tree in depth-first order. @@ -118,17 +121,21 @@ def partition_work(cost_per_box, traversal, comm): class GetBoxMasksCodeContainer: - def __init__(self, cl_context, box_id_dtype): - self.cl_context = cl_context + def __init__(self, array_context: PyOpenCLArrayContext, box_id_dtype): + self._setup_actx = array_context self.box_id_dtype = box_id_dtype + @property + def context(self): + return self._setup_actx.context + @memoize_method def add_interaction_list_boxes_kernel(self): """Given a ``responsible_boxes_mask`` and an interaction list, mark source boxes for target boxes in ``responsible_boxes_mask`` in a new separate mask. """ - return cl.elementwise.ElementwiseKernel( - self.cl_context, + return ElementwiseKernel( + self.context, Template(""" __global ${box_id_t} *box_list, __global char *responsible_boxes_mask, @@ -154,29 +161,28 @@ def add_interaction_list_boxes_kernel(self): @memoize_method def add_parent_boxes_kernel(self): - return cl.elementwise.ElementwiseKernel( - self.cl_context, + return ElementwiseKernel( + self.context, "__global char *current, __global char *parent, " f"__global {dtype_to_ctype(self.box_id_dtype)} *box_parent_ids", "if(i != 0 && current[i]) parent[box_parent_ids[i]] = 1" ) -def get_ancestor_boxes_mask(queue, code, traversal, responsible_boxes_mask): +def get_ancestor_boxes_mask(actx, code, traversal, responsible_boxes_mask): """Query the ancestors of responsible boxes. 
- :arg responsible_boxes_mask: A :class:`pyopencl.array.Array` object of shape - ``(tree.nboxes,)`` whose i-th entry is 1 if ``i`` is a responsible box. - :return: A :class:`pyopencl.array.Array` object of shape ``(tree.nboxes,)`` whose - i-th entry is 1 if ``i`` is an ancestor of the responsible boxes specified by + :arg responsible_boxes_mask: an array of shape ``(tree.nboxes,)`` whose + i-th entry is 1 if ``i`` is a responsible box. + :return: an array of shape ``(tree.nboxes,)`` whose i-th entry is 1 if ``i`` + is an ancestor of the responsible boxes specified by *responsible_boxes_mask*. """ - ancestor_boxes = cl.array.zeros(queue, (traversal.tree.nboxes,), dtype=np.int8) + ancestor_boxes = actx.zeros((traversal.tree.nboxes,), dtype=np.int8) ancestor_boxes_last = responsible_boxes_mask.copy() while ancestor_boxes_last.any(): - ancestor_boxes_new = cl.array.zeros( - queue, (traversal.tree.nboxes,), dtype=np.int8) + ancestor_boxes_new = actx.zeros((traversal.tree.nboxes,), dtype=np.int8) code.add_parent_boxes_kernel()( ancestor_boxes_last, ancestor_boxes_new, traversal.tree.box_parent_ids) ancestor_boxes_new = ancestor_boxes_new & (~ancestor_boxes) @@ -187,18 +193,18 @@ def get_ancestor_boxes_mask(queue, code, traversal, responsible_boxes_mask): def get_point_src_boxes_mask( - queue, code, traversal, responsible_boxes_mask, ancestor_boxes_mask): + actx, code, traversal, responsible_boxes_mask, ancestor_boxes_mask): """Query the boxes whose sources are needed in order to evaluate potentials of boxes represented by *responsible_boxes_mask*. - :arg responsible_boxes_mask: A :class:`pyopencl.array.Array` object of shape - ``(tree.nboxes,)`` whose i-th entry is 1 if ``i`` is a responsible box. - :param ancestor_boxes_mask: A :class:`pyopencl.array.Array` object of shape - ``(tree.nboxes,)`` whose i-th entry is 1 if ``i`` is either a responsible box - or an ancestor of the responsible boxes. 
- :return: A :class:`pyopencl.array.Array` object of shape ``(tree.nboxes,)`` whose - i-th entry is 1 if sources of box ``i`` are needed for evaluating the - potentials of targets in boxes represented by *responsible_boxes_mask*. + :arg responsible_boxes_mask: an array of shape ``(tree.nboxes,)`` whose + i-th entry is 1 if ``i`` is a responsible box. + :param ancestor_boxes_mask: an array of shape ``(tree.nboxes,)`` whose + i-th entry is 1 if ``i`` is either a responsible box or an ancestor + of the responsible boxes. + :return: an array of shape ``(tree.nboxes,)`` whose i-th entry is 1 if + sources of box ``i`` are needed for evaluating the potentials of targets + in boxes represented by *responsible_boxes_mask*. """ src_boxes_mask = responsible_boxes_mask.copy() @@ -208,7 +214,7 @@ def get_point_src_boxes_mask( traversal.target_boxes, responsible_boxes_mask, traversal.neighbor_source_boxes_starts, traversal.neighbor_source_boxes_lists, src_boxes_mask, - queue=queue) + queue=actx.queue) # Add list 4 of responsible boxes or ancestor boxes code.add_interaction_list_boxes_kernel()( @@ -216,7 +222,7 @@ def get_point_src_boxes_mask( responsible_boxes_mask | ancestor_boxes_mask, traversal.from_sep_bigger_starts, traversal.from_sep_bigger_lists, src_boxes_mask, - queue=queue) + queue=actx.queue) if traversal.tree.targets_have_extent: # Add list 3 close of responsible boxes @@ -227,7 +233,7 @@ def get_point_src_boxes_mask( traversal.from_sep_close_smaller_starts, traversal.from_sep_close_smaller_lists, src_boxes_mask, - queue=queue + queue=actx.queue ) # Add list 4 close of responsible boxes @@ -238,30 +244,28 @@ def get_point_src_boxes_mask( traversal.from_sep_close_bigger_starts, traversal.from_sep_close_bigger_lists, src_boxes_mask, - queue=queue + queue=actx.queue ) return src_boxes_mask def get_multipole_src_boxes_mask( - queue, code, traversal, responsible_boxes_mask, ancestor_boxes_mask): + actx, code, traversal, responsible_boxes_mask, ancestor_boxes_mask): 
"""Query the boxes whose multipoles are used in order to evaluate potentials of targets in boxes represented by *responsible_boxes_mask*. - :arg responsible_boxes_mask: A :class:`pyopencl.array.Array` object of shape - ``(tree.nboxes,)`` whose i-th entry is 1 if ``i`` is a responsible box. - :arg ancestor_boxes_mask: A :class:`pyopencl.array.Array` object of shape - ``(tree.nboxes,)`` whose i-th entry is 1 if ``i`` is either a responsible box - or an ancestor of the responsible boxes. - :return: A :class:`pyopencl.array.Array` object of shape ``(tree.nboxes,)`` - whose i-th entry is 1 if multipoles of box ``i`` are needed for evaluating - the potentials of targets in boxes represented by *responsible_boxes_mask*. + :arg responsible_boxes_mask: an array of shape ``(tree.nboxes,)`` whose + i-th entry is 1 if ``i`` is a responsible box. + :arg ancestor_boxes_mask: an array of shape ``(tree.nboxes,)`` whose + i-th entry is 1 if ``i`` is either a responsible box or an ancestor of + the responsible boxes. + :return: an array of shape ``(tree.nboxes,)`` whose i-th entry is 1 if + multipoles of box ``i`` are needed for evaluating the potentials of + targets in boxes represented by *responsible_boxes_mask*. """ - multipole_boxes_mask = cl.array.zeros( - queue, (traversal.tree.nboxes,), dtype=np.int8 - ) + multipole_boxes_mask = actx.zeros((traversal.tree.nboxes,), dtype=np.int8) # A mpole is used by process p if it is in the List 2 of either a box # owned by p or one of its ancestors. 
@@ -271,7 +275,7 @@ def get_multipole_src_boxes_mask( traversal.from_sep_siblings_starts, traversal.from_sep_siblings_lists, multipole_boxes_mask, - queue=queue + queue=actx.queue ) multipole_boxes_mask.finish() @@ -283,7 +287,7 @@ def get_multipole_src_boxes_mask( traversal.from_sep_smaller_by_level[ilevel].starts, traversal.from_sep_smaller_by_level[ilevel].lists, multipole_boxes_mask, - queue=queue + queue=actx.queue ) multipole_boxes_mask.finish() @@ -291,11 +295,11 @@ def get_multipole_src_boxes_mask( return multipole_boxes_mask -@dataclass +@dataclass(frozen=True) class BoxMasks: """ - Box masks needed for the distributed calculation. Each of these masks is a - PyOpenCL array with length ``tree.nboxes``, whose `i`-th entry is 1 if box `i` is + Box masks needed for the distributed calculation. Each of these masks is an + array with length ``tree.nboxes``, whose `i`-th entry is 1 if box `i` is set. .. attribute:: responsible_boxes @@ -315,13 +319,13 @@ class BoxMasks: Current process needs multipole expressions in these boxes. """ - responsible_boxes: cl.array.Array - ancestor_boxes: cl.array.Array - point_src_boxes: cl.array.Array - multipole_src_boxes: cl.array.Array + responsible_boxes: Array + ancestor_boxes: Array + point_src_boxes: Array + multipole_src_boxes: Array -def get_box_masks(queue, traversal, responsible_boxes_list): +def get_box_masks(actx, traversal, responsible_boxes_list): """Given the responsible boxes for a rank, this helper function calculates the relevant masks. @@ -329,27 +333,23 @@ def get_box_masks(queue, traversal, responsible_boxes_list): :returns: A :class:`BoxMasks` object of the relevant masks. """ - code = GetBoxMasksCodeContainer(queue.context, traversal.tree.box_id_dtype) - - # FIXME: It is wasteful to copy the whole traversal object into device memory - # here because - # 1) Not all fields are needed. - # 2) For sumpy wrangler, a device traversal object is already available. 
- traversal = traversal.to_device(queue) + code = GetBoxMasksCodeContainer(actx, traversal.tree.box_id_dtype) - responsible_boxes_mask = np.zeros((traversal.tree.nboxes,), dtype=np.int8) - responsible_boxes_mask[responsible_boxes_list] = 1 - responsible_boxes_mask = cl.array.to_device(queue, responsible_boxes_mask) + responsible_boxes_mask = actx.zeros((traversal.tree.nboxes,), dtype=np.int8) + responsible_boxes_mask[responsible_boxes_list] = ( + 1 + actx.zeros(responsible_boxes_list.shape, np.int8)) ancestor_boxes_mask = get_ancestor_boxes_mask( - queue, code, traversal, responsible_boxes_mask) + actx, code, traversal, responsible_boxes_mask) point_src_boxes_mask = get_point_src_boxes_mask( - queue, code, traversal, responsible_boxes_mask, ancestor_boxes_mask) + actx, code, traversal, responsible_boxes_mask, ancestor_boxes_mask) multipole_src_boxes_mask = get_multipole_src_boxes_mask( - queue, code, traversal, responsible_boxes_mask, ancestor_boxes_mask) + actx, code, traversal, responsible_boxes_mask, ancestor_boxes_mask) return BoxMasks( - responsible_boxes_mask, ancestor_boxes_mask, point_src_boxes_mask, + responsible_boxes_mask, + ancestor_boxes_mask, + point_src_boxes_mask, multipole_src_boxes_mask) diff --git a/boxtree/fmm.py b/boxtree/fmm.py index 3c4da9a1..6c13290c 100644 --- a/boxtree/fmm.py +++ b/boxtree/fmm.py @@ -341,7 +341,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs, :arg expansion_wrangler: An object exhibiting the :class:`ExpansionWranglerInterface`. For distributed implementation, this wrangler should be a subclass of - :class:`boxtree.distributed.calculation.DistributedExpansionWrangler`. + :class:`boxtree.distributed.calculation.DistributedExpansionWranglerMixin`. :arg src_weight_vecs: A sequence of source 'density/weights/charges'. Passed unmodified to *expansion_wrangler*. 
For distributed implementation, this argument is only significant on the root rank, but From c1a9c7fdf03dcdf4991dac11c6021bbf789308c5 Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Wed, 22 Jun 2022 16:07:10 +0300 Subject: [PATCH 15/28] add tests for DeviceDataRecord based on arraycontext --- boxtree/array_context.py | 161 ++++++++++++++++++++++++++++++++++++--- test/test_tools.py | 54 +++++++++++-- 2 files changed, 201 insertions(+), 14 deletions(-) diff --git a/boxtree/array_context.py b/boxtree/array_context.py index 74e60eda..5fe85c5c 100644 --- a/boxtree/array_context.py +++ b/boxtree/array_context.py @@ -20,25 +20,28 @@ THE SOFTWARE. """ -from arraycontext import PyOpenCLArrayContext as PyOpenCLArrayContextBase +import numpy as np +from arraycontext import ( # noqa: F401 + PyOpenCLArrayContext as PyOpenCLArrayContextBase, + deserialize_container, + rec_map_array_container, + serialize_container, + with_array_context, +) from arraycontext.pytest import ( _PytestPyOpenCLArrayContextFactoryWithClass, register_pytest_array_context_factory, ) +from pyopencl.algorithm import BuiltList + __doc__ = """ .. autoclass:: PyOpenCLArrayContext """ -def _acf(): - import pyopencl as cl - ctx = cl.create_some_context() - queue = cl.CommandQueue(ctx) - - return PyOpenCLArrayContext(queue, force_device_scalars=True) - +# {{{ array context class PyOpenCLArrayContext(PyOpenCLArrayContextBase): def transform_loopy_program(self, t_unit): @@ -51,7 +54,143 @@ def transform_loopy_program(self, t_unit): "Did you use arraycontext.make_loopy_program " "to create this kernel?") - return super().transform_loopy_program(t_unit) + return t_unit + + # NOTE: _rec_map_container is copied from arraycontext wholesale and should + # be kept in sync as much as possible! 
+ + def _rec_map_container(self, func, array, allowed_types=None, *, + default_scalar=None, strict=False): + import arraycontext.impl.pyopencl.taggable_cl_array as tga + + if allowed_types is None: + allowed_types = (tga.TaggableCLArray,) + + def _wrapper(ary): + # NOTE: this is copied verbatim from arraycontext and this is the + # only change to allow optional fields inside containers + if ary is None: + return ary + + if isinstance(ary, allowed_types): + return func(ary) + elif not strict and isinstance(ary, self.array_types): + from warnings import warn + warn(f"Invoking {type(self).__name__}.{func.__name__[1:]} with " + f"{type(ary).__name__} will be unsupported in 2025. Use " + "'to_tagged_cl_array' to convert instances to TaggableCLArray.", + DeprecationWarning, stacklevel=2) + return func(tga.to_tagged_cl_array(ary)) + elif np.isscalar(ary): + if default_scalar is None: + return ary + else: + return np.array(ary).dtype.type(default_scalar) + else: + raise TypeError( + f"{type(self).__name__}.{func.__name__[1:]} invoked with " + f"an unsupported array type: got '{type(ary).__name__}', " + f"but expected one of {allowed_types}") + + return rec_map_array_container(_wrapper, array) + +# }}} + + +# {{{ dataclass array container + +def dataclass_array_container(cls: type) -> type: + """A decorator based on :func:`arraycontext.dataclass_array_container` + that allows :class:`typing.Optional` containers. 
+ """ + + from dataclasses import Field, fields, is_dataclass + from types import UnionType + from typing import Union, get_args, get_origin + + from arraycontext.container.dataclass import ( + _inject_dataclass_serialization, + is_array_type, + ) + + assert is_dataclass(cls) + + def is_array_field(f: Field) -> bool: + origin = get_origin(f.type) + if origin in (Union, UnionType): + return all( + (is_array_type(arg) or arg is type(None)) + for arg in get_args(f.type)) + + if isinstance(f.type, str): + raise TypeError( + f"String annotation on field '{f.name}' not supported. " + "(this may be due to 'from __future__ import annotations')") + + if not f.init: + raise ValueError( + f"Field with 'init=False' not allowed: '{f.name}'") + + # NOTE: + # * GenericAlias catches `list`, `tuple`, etc. + # * `_BaseGenericAlias` catches `List`, `Tuple`, `Callable`, etc. + # * `_SpecialForm` catches `Any`, `Literal`, `Optional`, etc. + from types import GenericAlias + from typing import ( # type: ignore[attr-defined] + _BaseGenericAlias, + _SpecialForm, + ) + if isinstance(f.type, GenericAlias | _BaseGenericAlias | _SpecialForm): + # NOTE: anything except a Union is not an array + return False + + return is_array_type(f.type) + + from pytools import partition + array_fields, non_array_fields = partition(is_array_field, fields(cls)) + + if not array_fields: + raise ValueError(f"'{cls}' must have fields with array container type " + "in order to use the 'dataclass_array_container' decorator") + + return _inject_dataclass_serialization(cls, array_fields, non_array_fields) + +# }}} + + +# {{{ serialization + +# NOTE: BuiltList is serialized explicitly here because pyopencl cannot depend +# on arraycontext machinery. 
+ +@serialize_container.register(BuiltList) +def _serialize_built_list(obj: BuiltList): + return ( + ("starts", obj.starts), + ("lists", obj.lists), + ("nonempty_indices", obj.nonempty_indices), + ("compressed_indices", obj.compressed_indices), + ) + + +@deserialize_container.register(BuiltList) +def _deserialize_built_list(template: BuiltList, iterable): + return type(template)( + count=template.count, + num_nonempty_lists=template.num_nonempty_lists, + **dict(iterable)) + +# }}} + + +# {{{ pytest + +def _acf(): + import pyopencl as cl + ctx = cl.create_some_context() + queue = cl.CommandQueue(ctx) + + return PyOpenCLArrayContext(queue, force_device_scalars=True) class PytestPyOpenCLArrayContextFactory( @@ -61,3 +200,7 @@ class PytestPyOpenCLArrayContextFactory( register_pytest_array_context_factory("boxtree.pyopencl", PytestPyOpenCLArrayContextFactory) + +# }}} + +# vim: fdm=marker diff --git a/test/test_tools.py b/test/test_tools.py index e75af7f3..16b65307 100644 --- a/test/test_tools.py +++ b/test/test_tools.py @@ -98,7 +98,7 @@ def test_masked_matrix_compression(actx_factory, order): actx = actx_factory() from boxtree.tools import MaskCompressorKernel - matcompr = MaskCompressorKernel(actx.context) + matcompr = MaskCompressorKernel(actx) n = 40 m = 10 @@ -107,7 +107,7 @@ def test_masked_matrix_compression(actx_factory, order): arr = (rng.random((n, m)) > 0.5).astype(np.int8).copy(order=order) d_arr = actx.from_numpy(arr) - arr_starts, arr_lists, _evt = matcompr(actx.queue, d_arr) + arr_starts, arr_lists, _evt = matcompr(actx, d_arr) arr_starts = actx.to_numpy(arr_starts) arr_lists = actx.to_numpy(arr_lists) @@ -125,14 +125,14 @@ def test_masked_list_compression(actx_factory): rng = np.random.default_rng(seed=42) from boxtree.tools import MaskCompressorKernel - listcompr = MaskCompressorKernel(actx.context) + listcompr = MaskCompressorKernel(actx) n = 20 arr = (rng.random(n) > 0.5).astype(np.int8) d_arr = actx.from_numpy(arr) - arr_list, _evt = 
listcompr(actx.queue, d_arr) + arr_list, _evt = listcompr(actx, d_arr) arr_list = actx.to_numpy(arr_list) assert set(arr_list) == set(arr.nonzero()[0]) @@ -165,6 +165,50 @@ def test_device_record(actx_factory): for i in range(3): assert np.array_equal(record_host.obj_array[i], record.obj_array[i]) + +def test_device_record_array_context(actx_factory): + actx = actx_factory() + + from dataclasses import dataclass + + from arraycontext import Array + + from boxtree.array_context import dataclass_array_container + + @dataclass_array_container + @dataclass(frozen=True) + class MyDeviceDataRecord: + array: Array + obj_array: np.ndarray + opt_array: Array | None + value: float + + from pytools.obj_array import make_obj_array + rng = np.random.default_rng() + record = MyDeviceDataRecord( + array=rng.random(128), + obj_array=make_obj_array([rng.random(128) for _ in range(3)]), + opt_array=None, + value=3) + + actx_record = actx.from_numpy(record) + assert actx_record.array.queue is actx.queue + + frozen_record = actx.freeze(actx_record) + assert frozen_record.array.queue is None + + thawed_record = actx.thaw(frozen_record) + assert thawed_record.array.queue is actx.queue + + host_record = actx.to_numpy(thawed_record) + assert isinstance(host_record.array, np.ndarray) + + assert record.value == host_record.value + assert np.allclose(record.array, host_record.array) + assert np.all([ + np.allclose(record.obj_array[i], host_record.obj_array[i]) for i in range(3) + ]) + # }}} @@ -176,7 +220,7 @@ def test_device_record(actx_factory): def test_particle_array(actx_factory, array_factory, dim, dtype): actx = actx_factory() - particles = array_factory(actx.queue, 1000, dim, dtype) + particles = array_factory(actx, 1000, dim, dtype) assert len(particles) == dim assert all(len(particles[0]) == len(axis) for axis in particles) From cdee1ac43b438e623f4baf07f9069cdf42395c78 Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Wed, 22 Jun 2022 16:13:02 +0300 Subject: [PATCH 16/28] port
test_traversal to arraycontext --- test/test_traversal.py | 53 +++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 29 deletions(-) diff --git a/test/test_traversal.py b/test/test_traversal.py index 3988f46a..6de51b36 100644 --- a/test/test_traversal.py +++ b/test/test_traversal.py @@ -54,22 +54,21 @@ def test_tree_connectivity(actx_factory, dims, sources_are_targets): actx = actx_factory() dtype = np.float64 - sources = make_normal_particle_array(actx.queue, 1 * 10**5, dims, dtype) + sources = make_normal_particle_array(actx, 1 * 10**5, dims, dtype) if sources_are_targets: targets = None else: - targets = make_normal_particle_array(actx.queue, 2 * 10**5, dims, dtype) + targets = make_normal_particle_array(actx, 2 * 10**5, dims, dtype) from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - tree, _ = tb(actx.queue, sources, max_particles_in_box=30, - targets=targets, debug=True) + tb = TreeBuilder(actx) + tree, _ = tb(actx, sources, max_particles_in_box=30, targets=targets, debug=True) from boxtree.traversal import FMMTraversalBuilder - tg = FMMTraversalBuilder(actx.context) - trav, _ = tg(actx.queue, tree, debug=True) - tree = tree.get(queue=actx.queue) - trav = trav.get(queue=actx.queue) + tg = FMMTraversalBuilder(actx) + trav, _ = tg(actx, tree, debug=True) + tree = actx.to_numpy(tree) + trav = actx.to_numpy(trav) levels = tree.box_levels parents = tree.box_parent_ids.T @@ -286,17 +285,15 @@ def test_plot_traversal(actx_factory, well_sep_is_n_away=1, visualize=False): for i in range(dims)]) from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - - actx.queue.finish() - tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True) + tb = TreeBuilder(actx) + tree, _ = tb(actx, particles, max_particles_in_box=30, debug=True) from boxtree.traversal import FMMTraversalBuilder - tg = FMMTraversalBuilder(actx.context, well_sep_is_n_away=well_sep_is_n_away) - trav, _ = tg(actx.queue, tree) + tg = 
FMMTraversalBuilder(actx, well_sep_is_n_away=well_sep_is_n_away) + trav, _ = tg(actx, tree) - tree = tree.get(queue=actx.queue) - trav = trav.get(queue=actx.queue) + tree = actx.to_numpy(tree) + trav = actx.to_numpy(trav) from boxtree.visualization import TreePlotter plotter = TreePlotter(tree) @@ -340,10 +337,8 @@ def test_from_sep_siblings_translation_and_rotation_classes( for i in range(dims)]) from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - - actx.queue.finish() - tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True) + tb = TreeBuilder(actx) + tree, _ = tb(actx, particles, max_particles_in_box=30, debug=True) # }}} @@ -353,14 +348,14 @@ def test_from_sep_siblings_translation_and_rotation_classes( from boxtree.translation_classes import TranslationClassesBuilder from boxtree.traversal import FMMTraversalBuilder - tg = FMMTraversalBuilder(actx.context, well_sep_is_n_away=well_sep_is_n_away) - trav, _ = tg(actx.queue, tree) + tg = FMMTraversalBuilder(actx, well_sep_is_n_away=well_sep_is_n_away) + trav, _ = tg(actx, tree) - rb = RotationClassesBuilder(actx.context) - result, _ = rb(actx.queue, trav, tree) + rb = RotationClassesBuilder(actx) + result, _ = rb(actx, trav, tree) - tb = TranslationClassesBuilder(actx.context) - result_tb, _ = tb(actx.queue, trav, tree) + tb = TranslationClassesBuilder(actx) + result_tb, _ = tb(actx, trav, tree) rot_classes = actx.to_numpy( result.from_sep_siblings_rotation_classes) @@ -372,8 +367,8 @@ def test_from_sep_siblings_translation_and_rotation_classes( distance_vectors = actx.to_numpy( result_tb.from_sep_siblings_translation_class_to_distance_vector) - tree = tree.get(queue=actx.queue) - trav = trav.get(queue=actx.queue) + tree = actx.to_numpy(tree) + trav = actx.to_numpy(trav) centers = tree.box_centers.T From e112f175f0ef1456ae3e92bc02f76488a006500e Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Thu, 23 Jun 2022 16:25:04 +0300 Subject: [PATCH 17/28] port test_tree to arraycontext --- 
test/test_tree.py | 175 +++++++++++++++++++++------------------------- 1 file changed, 78 insertions(+), 97 deletions(-) diff --git a/test/test_tree.py b/test/test_tree.py index 1579464c..aa6076f3 100644 --- a/test/test_tree.py +++ b/test/test_tree.py @@ -27,11 +27,8 @@ import pytest from arraycontext import pytest_generate_tests_for_array_contexts -from boxtree.array_context import ( - PytestPyOpenCLArrayContextFactory, - _acf, # noqa: F401 -) -from boxtree.tools import make_normal_particle_array +from boxtree.array_context import PytestPyOpenCLArrayContextFactory, _acf # noqa: F401 +from boxtree.tools import AXIS_NAMES, make_normal_particle_array logger = logging.getLogger(__name__) @@ -50,18 +47,17 @@ def test_bounding_box(actx_factory, dtype, dims, nparticles): actx = actx_factory() from boxtree.bounding_box import BoundingBoxFinder - from boxtree.tools import AXIS_NAMES - bbf = BoundingBoxFinder(actx.context) + bbf = BoundingBoxFinder(actx) axis_names = AXIS_NAMES[:dims] logger.info("%s - %s %s", dtype, dims, nparticles) - particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype) + particles = make_normal_particle_array(actx, nparticles, dims, dtype) bbox_min = [np.min(actx.to_numpy(x)) for x in particles] bbox_max = [np.max(actx.to_numpy(x)) for x in particles] - bbox_cl, _evt = bbf(particles, radii=None) + bbox_cl, _evt = bbf(actx, particles, radii=None) bbox_cl = actx.to_numpy(bbox_cl) bbox_min_cl = np.empty(dims, dtype) @@ -104,21 +100,19 @@ def run_build_test(builder, actx, dims, dtype, nparticles, visualize, logger.info(75 * "-") - particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype) + particles = make_normal_particle_array(actx, nparticles, dims, dtype) if visualize: import matplotlib.pyplot as pt np_particles = actx.to_numpy(particles) pt.plot(np_particles[0], np_particles[1], "x") - actx.queue.finish() - - tree, _ = builder(actx.queue, particles, + tree, _ = builder(actx, particles, 
max_particles_in_box=max_particles_in_box, refine_weights=refine_weights, max_leaf_refine_weight=max_leaf_refine_weight, debug=True, **kwargs) - tree = tree.get(queue=actx.queue) + tree = actx.to_numpy(tree) sorted_particles = np.array(list(tree.sources)) @@ -237,7 +231,7 @@ def test_single_box_particle_tree(actx_factory, dtype, dims, visualize=False): actx = actx_factory() from boxtree import TreeBuilder - builder = TreeBuilder(actx.context) + builder = TreeBuilder(actx) run_build_test(builder, actx, dims, dtype, 4, max_particles_in_box=30, visualize=visualize) @@ -248,7 +242,7 @@ def test_two_level_particle_tree(actx_factory, dtype, dims, visualize=False): actx = actx_factory() from boxtree import TreeBuilder - builder = TreeBuilder(actx.context) + builder = TreeBuilder(actx) run_build_test(builder, actx, dims, dtype, 50, max_particles_in_box=30, visualize=visualize) @@ -259,7 +253,7 @@ def test_unpruned_particle_tree(actx_factory, dtype, dims, visualize=False): actx = actx_factory() from boxtree import TreeBuilder - builder = TreeBuilder(actx.context) + builder = TreeBuilder(actx) # test unpruned tree build run_build_test(builder, actx, dims, dtype, 10**5, @@ -272,7 +266,7 @@ def test_particle_tree_with_reallocations( actx = actx_factory() from boxtree import TreeBuilder - builder = TreeBuilder(actx.context) + builder = TreeBuilder(actx) run_build_test(builder, actx, dims, dtype, 10**5, max_particles_in_box=30, visualize=visualize, nboxes_guess=5) @@ -284,7 +278,7 @@ def test_particle_tree_with_many_empty_leaves( actx = actx_factory() from boxtree import TreeBuilder - builder = TreeBuilder(actx.context) + builder = TreeBuilder(actx) run_build_test(builder, actx, dims, dtype, 10**5, max_particles_in_box=5, visualize=visualize) @@ -295,7 +289,7 @@ def test_vanilla_particle_tree(actx_factory, dtype, dims, visualize=False): actx = actx_factory() from boxtree import TreeBuilder - builder = TreeBuilder(actx.context) + builder = TreeBuilder(actx) 
run_build_test(builder, actx, dims, dtype, 10**5, max_particles_in_box=30, visualize=visualize) @@ -307,7 +301,7 @@ def test_explicit_refine_weights_particle_tree( actx = actx_factory() from boxtree import TreeBuilder - builder = TreeBuilder(actx.context) + builder = TreeBuilder(actx) nparticles = 10**5 @@ -326,7 +320,7 @@ def test_non_adaptive_particle_tree(actx_factory, dtype, dims, visualize=False): actx = actx_factory() from boxtree import TreeBuilder - builder = TreeBuilder(actx.context) + builder = TreeBuilder(actx) run_build_test(builder, actx, dims, dtype, 10**4, max_particles_in_box=30, visualize=visualize, kind="non-adaptive") @@ -345,9 +339,9 @@ def test_source_target_tree(actx_factory, dims, visualize=False): ntargets = 3 * 10**5 dtype = np.float64 - sources = make_normal_particle_array(actx.queue, nsources, dims, dtype, + sources = make_normal_particle_array(actx, nsources, dims, dtype, seed=12) - targets = make_normal_particle_array(actx.queue, ntargets, dims, dtype, + targets = make_normal_particle_array(actx, ntargets, dims, dtype, seed=19) if visualize: @@ -358,12 +352,11 @@ def test_source_target_tree(actx_factory, dims, visualize=False): pt.show() from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) + tb = TreeBuilder(actx) - actx.queue.finish() - tree, _ = tb(actx.queue, sources, targets=targets, + tree, _ = tb(actx, sources, targets=targets, max_particles_in_box=10, debug=True) - tree = tree.get(queue=actx.queue) + tree = actx.to_numpy(tree) sorted_sources = np.array(list(tree.sources)) sorted_targets = np.array(list(tree.targets)) @@ -457,9 +450,9 @@ def test_extent_tree(actx_factory, dims, extent_norm, visualize=False): dtype = np.float64 npoint_sources_per_source = 16 - sources = make_normal_particle_array(actx.queue, nsources, dims, dtype, + sources = make_normal_particle_array(actx, nsources, dims, dtype, seed=12) - targets = make_normal_particle_array(actx.queue, ntargets, dims, dtype, + targets = 
make_normal_particle_array(actx, ntargets, dims, dtype, seed=19) refine_weights = actx.np.zeros(nsources + ntargets, np.int32) @@ -474,10 +467,10 @@ def test_extent_tree(actx_factory, dims, extent_norm, visualize=False): ) from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) + tb = TreeBuilder(actx) actx.queue.finish() - dev_tree, _ = tb(actx.queue, sources, targets=targets, + dev_tree, _ = tb(actx, sources, targets=targets, source_radii=source_radii, target_radii=target_radii, extent_norm=extent_norm, @@ -495,7 +488,7 @@ def test_extent_tree(actx_factory, dims, extent_norm, visualize=False): logger.info("transfer tree, check orderings") - tree = dev_tree.get(queue=actx.queue) + tree = actx.to_numpy(dev_tree) if visualize: import matplotlib.pyplot as pt @@ -657,7 +650,7 @@ def test_extent_tree(actx_factory, dims, extent_norm, visualize=False): ) from boxtree.tree import link_point_sources - dev_tree = link_point_sources(actx.queue, dev_tree, + dev_tree = link_point_sources(actx, dev_tree, point_source_starts, point_sources, debug=True) @@ -677,7 +670,7 @@ def test_leaves_to_balls_query(actx_factory, dims, visualize=False): nparticles = 10**5 dtype = np.float64 - particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype) + particles = make_normal_particle_array(actx, nparticles, dims, dtype) if visualize: import matplotlib.pyplot as pt @@ -685,23 +678,23 @@ def test_leaves_to_balls_query(actx_factory, dims, visualize=False): pt.plot(np_particles[0], np_particles[1], "x") from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) + tb = TreeBuilder(actx) - actx.queue.finish() - tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True) + tree, _ = tb(actx, particles, max_particles_in_box=30, debug=True) + tree = actx.thaw(tree) nballs = 10**4 - ball_centers = make_normal_particle_array(actx.queue, nballs, dims, dtype) + ball_centers = make_normal_particle_array(actx, nballs, dims, dtype) ball_radii = 0.1 + 
actx.np.zeros(nballs, dtype) from boxtree.area_query import LeavesToBallsLookupBuilder - lblb = LeavesToBallsLookupBuilder(actx.context) + lblb = LeavesToBallsLookupBuilder(actx) - lbl, _ = lblb(actx.queue, tree, ball_centers, ball_radii) + lbl, _ = lblb(actx, tree, ball_centers, ball_radii) # get data to host for test - tree = tree.get(queue=actx.queue) - lbl = lbl.get(queue=actx.queue) + tree = actx.to_numpy(tree) + lbl = actx.to_numpy(lbl) ball_centers = np.array([actx.to_numpy(x) for x in ball_centers]).T ball_radii = actx.to_numpy(ball_radii) @@ -734,13 +727,12 @@ def run_area_query_test(actx, tree, ball_centers, ball_radii): Performs an area query and checks that the result is as expected. """ from boxtree.area_query import AreaQueryBuilder - aqb = AreaQueryBuilder(actx.context) - - area_query, _ = aqb(actx.queue, tree, ball_centers, ball_radii) + aqb = AreaQueryBuilder(actx) + area_query, _ = aqb(actx, tree, ball_centers, ball_radii) # Get data to host for test. - tree = tree.get(queue=actx.queue) - area_query = area_query.get(queue=actx.queue) + tree = actx.to_numpy(tree) + area_query = actx.to_numpy(area_query) ball_centers = np.array([actx.to_numpy(x) for x in ball_centers]).T ball_radii = actx.to_numpy(ball_radii) @@ -781,7 +773,7 @@ def test_area_query(actx_factory, dims, visualize=False): nparticles = 10**5 dtype = np.float64 - particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype) + particles = make_normal_particle_array(actx, nparticles, dims, dtype) if visualize: import matplotlib.pyplot as pt @@ -789,13 +781,11 @@ def test_area_query(actx_factory, dims, visualize=False): pt.plot(np_particles[0], np_particles[1], "x") from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - - actx.queue.finish() - tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True) + tb = TreeBuilder(actx) + tree, _ = tb(actx, particles, max_particles_in_box=30, debug=True) nballs = 10**4 - ball_centers = 
make_normal_particle_array(actx.queue, nballs, dims, dtype) + ball_centers = make_normal_particle_array(actx, nballs, dims, dtype) ball_radii = 0.1 + actx.np.zeros(nballs, dtype) run_area_query_test(actx, tree, ball_centers, ball_radii) @@ -814,7 +804,7 @@ def test_area_query_balls_outside_bbox(actx_factory, dims, visualize=False): nparticles = 10**4 dtype = np.float64 - particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype) + particles = make_normal_particle_array(actx, nparticles, dims, dtype) if visualize: import matplotlib.pyplot as pt @@ -822,10 +812,8 @@ def test_area_query_balls_outside_bbox(actx_factory, dims, visualize=False): pt.plot(np_particles[0], np_particles[1], "x") from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - - actx.queue.finish() - tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True) + tb = TreeBuilder(actx) + tree, _ = tb(actx, particles, max_particles_in_box=30, debug=True) nballs = 10**4 bbox_min = tree.bounding_box[0].min() @@ -851,7 +839,7 @@ def test_area_query_elwise(actx_factory, dims, visualize=False): nparticles = 10**5 dtype = np.float64 - particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype) + particles = make_normal_particle_array(actx, nparticles, dims, dtype) if visualize: import matplotlib.pyplot as pt @@ -859,13 +847,11 @@ def test_area_query_elwise(actx_factory, dims, visualize=False): pt.plot(np_particles[0], np_particles[1], "x") from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - - actx.queue.finish() - tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True) + tb = TreeBuilder(actx) + tree, _ = tb(actx, particles, max_particles_in_box=30, debug=True) nballs = 10**4 - ball_centers = make_normal_particle_array(actx.queue, nballs, dims, dtype) + ball_centers = make_normal_particle_array(actx, nballs, dims, dtype) ball_radii = 0.1 + actx.np.zeros(nballs, dtype) from boxtree.area_query import 
AreaQueryElementwiseTemplate, PeerListFinder @@ -885,10 +871,10 @@ def test_area_query_elwise(actx_factory, dims, visualize=False): """, leaf_found_op="") - peer_lists, evt = PeerListFinder(actx.context)(actx.queue, tree) + peer_lists, evt = PeerListFinder(actx)(actx, tree) kernel = template.generate( - actx.context, + actx.queue.context, dims, tree.coord_dtype, tree.box_id_dtype, @@ -919,8 +905,7 @@ def test_level_restriction( dtype = np.float64 from boxtree.tools import make_surface_particle_array - particles = make_surface_particle_array( - actx.queue, nparticles, dims, dtype, seed=15) + particles = make_surface_particle_array(actx, nparticles, dims, dtype, seed=15) if visualize: import matplotlib.pyplot as pt @@ -928,10 +913,8 @@ def test_level_restriction( pt.plot(np_particles[0], np_particles[1], "x") from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - - actx.queue.finish() - tree_dev, _ = tb(actx.queue, particles, + tb = TreeBuilder(actx) + tree_dev, _ = tb(actx, particles, kind="adaptive-level-restricted", max_particles_in_box=30, debug=True, skip_prune=skip_prune, lr_lookbehind=lookbehind, @@ -946,18 +929,18 @@ def find_neighbors(leaf_box_centers, leaf_box_radii): # Note that since this comes from an area query, the self box will be # included in the neighbor list. from boxtree.area_query import AreaQueryBuilder - aqb = AreaQueryBuilder(actx.context) + aqb = AreaQueryBuilder(actx) ball_radii = actx.from_numpy(np.min(leaf_box_radii) / 2 + leaf_box_radii) leaf_box_centers = [actx.from_numpy(axis) for axis in leaf_box_centers] - area_query, _ = aqb(actx.queue, tree_dev, leaf_box_centers, ball_radii) - area_query = area_query.get(queue=actx.queue) + area_query, _ = aqb(actx, tree_dev, leaf_box_centers, ball_radii) + area_query = actx.to_numpy(area_query) return (area_query.leaves_near_ball_starts, area_query.leaves_near_ball_lists) # Get data to host for test. 
- tree = tree_dev.get(queue=actx.queue) + tree = actx.to_numpy(tree_dev) # Find leaf boxes. from boxtree import box_flags_enum @@ -1001,7 +984,7 @@ def test_space_invader_query(actx_factory, dims, dtype, visualize=False): dtype = np.dtype(dtype) nparticles = 10**5 - particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype) + particles = make_normal_particle_array(actx, nparticles, dims, dtype) if visualize: import matplotlib.pyplot as pt @@ -1009,29 +992,27 @@ def test_space_invader_query(actx_factory, dims, dtype, visualize=False): pt.plot(np_particles[0], np_particles[1], "x") from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - - actx.queue.finish() - tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True) + tb = TreeBuilder(actx) + tree, _ = tb(actx, particles, max_particles_in_box=30, debug=True) nballs = 10**4 - ball_centers = make_normal_particle_array(actx.queue, nballs, dims, dtype) + ball_centers = make_normal_particle_array(actx, nballs, dims, dtype) ball_radii = 0.1 + actx.np.zeros(nballs, dtype) from boxtree.area_query import LeavesToBallsLookupBuilder, SpaceInvaderQueryBuilder - siqb = SpaceInvaderQueryBuilder(actx.context) + siqb = SpaceInvaderQueryBuilder(actx) # We can use leaves-to-balls lookup to get the set of overlapping balls for # each box, and from there to compute the outer space invader distance. 
- lblb = LeavesToBallsLookupBuilder(actx.context) + lblb = LeavesToBallsLookupBuilder(actx) - siq, _ = siqb(actx.queue, tree, ball_centers, ball_radii) - lbl, _ = lblb(actx.queue, tree, ball_centers, ball_radii) + siq, _ = siqb(actx, tree, ball_centers, ball_radii) + lbl, _ = lblb(actx, tree, ball_centers, ball_radii) # get data to host for test - tree = tree.get(queue=actx.queue) - siq = siq.get(queue=actx.queue) - lbl = lbl.get(queue=actx.queue) + tree = actx.to_numpy(tree) + siq = actx.to_numpy(siq) + lbl = actx.to_numpy(lbl) ball_centers = np.array([actx.to_numpy(x) for x in ball_centers]) ball_radii = actx.to_numpy(ball_radii) @@ -1062,7 +1043,7 @@ def test_space_invader_query(actx_factory, dims, dtype, visualize=False): @pytest.mark.opencl @pytest.mark.parametrize("dims", [2, 3]) -def test_same_tree_with_zero_weight_particles(actx_factory, dims): +def test_same_tree_with_zero_weight_particles(actx_factory, dims, visualize=False): actx = actx_factory() ntargets_values = [300, 400, 500] @@ -1070,7 +1051,7 @@ def test_same_tree_with_zero_weight_particles(actx_factory, dims): nsources = 20 from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) + tb = TreeBuilder(actx) trees = [] @@ -1091,18 +1072,18 @@ def test_same_tree_with_zero_weight_particles(actx_factory, dims): refine_weights[:nsources] = 1 refine_weights[nsources:] = 0 - tree, _ = tb(actx.queue, sources, targets=targets, + tree, _ = tb(actx, sources, targets=targets, target_radii=target_radii, stick_out_factor=stick_out_factor, max_leaf_refine_weight=10, refine_weights=refine_weights, debug=True) - tree = tree.get(queue=actx.queue) + tree = actx.to_numpy(tree) trees.append(tree) print("TREE:", tree.nboxes) - if 0: + if visualize: import matplotlib.pyplot as plt for tree in trees: plt.figure() @@ -1119,12 +1100,12 @@ def test_max_levels_error(actx_factory): actx = actx_factory() from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) + tb = TreeBuilder(actx) sources = [actx.np.zeros(11, 
np.float64) for i in range(2)] from boxtree.tree_build import MaxLevelsExceeded with pytest.raises(MaxLevelsExceeded): - _tree, _ = tb(actx.queue, sources, max_particles_in_box=10, debug=True) + _tree, _ = tb(actx, sources, max_particles_in_box=10, debug=True) # }}} From 2b0bd5fa7c5b7b62f07913c747298e1e44d7ecde Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Thu, 23 Jun 2022 21:58:11 +0300 Subject: [PATCH 18/28] port test_fmm to arraycontext --- test/test_fmm.py | 114 ++++++++++++++++++++++++----------------------- 1 file changed, 59 insertions(+), 55 deletions(-) diff --git a/test/test_fmm.py b/test/test_fmm.py index 615e7a1c..659ae64e 100644 --- a/test/test_fmm.py +++ b/test/test_fmm.py @@ -178,7 +178,7 @@ def test_fmm_completeness(actx_factory, dims, nsources_req, ntargets_req, dtype = np.float64 try: - sources = source_gen(actx.queue, nsources_req, dims, dtype, seed=15) + sources = source_gen(actx, nsources_req, dims, dtype, seed=15) nsources = len(sources[0]) if ntargets_req is None: @@ -186,7 +186,7 @@ def test_fmm_completeness(actx_factory, dims, nsources_req, ntargets_req, targets = None ntargets = ntargets_req else: - targets = target_gen(actx.queue, ntargets_req, dims, dtype, seed=16) + targets = target_gen(actx, ntargets_req, dims, dtype, seed=16) ntargets = len(targets[0]) except ImportError: pytest.skip("loopy not available, but needed for particle array " @@ -208,40 +208,40 @@ def test_fmm_completeness(actx_factory, dims, nsources_req, ntargets_req, target_radii = None from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) + tb = TreeBuilder(actx) - tree, _ = tb(actx.queue, sources, targets=targets, + tree, _ = tb(actx, sources, targets=targets, max_particles_in_box=30, source_radii=source_radii, target_radii=target_radii, debug=True, stick_out_factor=0.25, extent_norm=extent_norm) if 0: - tree = tree.get(queue=actx.queue) + tree = actx.to_numpy(tree) tree.plot() import matplotlib.pyplot as pt pt.show() from boxtree.traversal import 
FMMTraversalBuilder - tbuild = FMMTraversalBuilder(actx.context, + tbuild = FMMTraversalBuilder(actx, well_sep_is_n_away=well_sep_is_n_away, from_sep_smaller_crit=from_sep_smaller_crit) - trav, _ = tbuild(actx.queue, tree, debug=True) + trav, _ = tbuild(actx, tree, debug=True) if who_has_extent: pre_merge_trav = trav - trav = trav.merge_close_lists(actx.queue) + trav = trav.merge_close_lists(actx) # weights = np.random.randn(nsources) weights = np.ones(nsources) weights_sum = np.sum(weights) - host_trav = trav.get(queue=actx.queue) + host_trav = actx.to_numpy(trav) host_tree = host_trav.tree if who_has_extent: - pre_merge_host_trav = pre_merge_trav.get(queue=actx.queue) + pre_merge_host_trav = actx.to_numpy(pre_merge_trav) from boxtree.tree import ParticleListFilter - plfilt = ParticleListFilter(actx.context) + plfilt = ParticleListFilter(actx) tree_indep = ConstantOneTreeIndependentDataForWrangler() @@ -252,16 +252,16 @@ def test_fmm_completeness(actx_factory, dims, nsources_req, ntargets_req, if filter_kind == "user": filtered_targets = plfilt.filter_target_lists_in_user_order( - actx.queue, tree, flags) + actx, tree, flags) wrangler = ConstantOneExpansionWranglerWithFilteredTargetsInUserOrder( tree_indep, host_trav, - filtered_targets.get(queue=actx.queue)) + actx.to_numpy(filtered_targets)) elif filter_kind == "tree": filtered_targets = plfilt.filter_target_lists_in_tree_order( - actx.queue, tree, flags) + actx, tree, flags) wrangler = ConstantOneExpansionWranglerWithFilteredTargetsInTreeOrder( tree_indep, host_trav, - filtered_targets.get(queue=actx.queue)) + actx.to_numpy(filtered_targets)) else: raise ValueError("unsupported value of 'filter_kind'") else: @@ -402,25 +402,25 @@ def test_pyfmmlib_fmm(actx_factory, dims, use_dipoles, helmholtz_k): ntargets = 1000 dtype = np.float64 - sources = p_normal(actx.queue, nsources, dims, dtype, seed=15) + sources = p_normal(actx, nsources, dims, dtype, seed=15) targets = ( - p_normal(actx.queue, ntargets, dims, dtype, 
seed=18) + p_normal(actx, ntargets, dims, dtype, seed=18) + np.array([2, 0, 0])[:dims]) - sources_host = particle_array_to_host(sources) - targets_host = particle_array_to_host(targets) + sources_host = particle_array_to_host(actx, sources) + targets_host = particle_array_to_host(actx, targets) from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) + tb = TreeBuilder(actx) - tree, _ = tb(actx.queue, sources, targets=targets, + tree, _ = tb(actx, sources, targets=targets, max_particles_in_box=30, debug=True) from boxtree.traversal import FMMTraversalBuilder - tbuild = FMMTraversalBuilder(actx.context) - trav, _ = tbuild(actx.queue, tree, debug=True) + tbuild = FMMTraversalBuilder(actx) + trav, _ = tbuild(actx, tree, debug=True) - trav = trav.get(queue=actx.queue) + trav = actx.to_numpy(trav) rng = np.random.default_rng(20) weights = rng.uniform(0.0, 1.0, (nsources,)) @@ -511,9 +511,13 @@ def fmm_level_to_order(tree, lev): [knl], exclude_self=False) - _evt, (sumpy_ref_pot,) = p2p( - actx.queue, targets, sources, (weights,), - out_host=True, **sumpy_extra_kwargs) + result, = p2p( + actx, + targets, + sources, + (actx.from_numpy(weights),), + **sumpy_extra_kwargs) + sumpy_ref_pot = actx.to_numpy(result) sumpy_rel_err = ( la.norm(pot - sumpy_ref_pot, np.inf) @@ -554,18 +558,18 @@ def test_pyfmmlib_numerical_stability(actx_factory, dims, helmholtz_k, order): targets = sources * (1 + 1e-3) from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) + tb = TreeBuilder(actx) - tree, _ = tb(actx.queue, sources, targets=targets, + tree, _ = tb(actx, sources, targets=targets, max_particles_in_box=2, debug=True) assert tree.nlevels >= 15 from boxtree.traversal import FMMTraversalBuilder - tbuild = FMMTraversalBuilder(actx.context) - trav, _ = tbuild(actx.queue, tree, debug=True) + tbuild = FMMTraversalBuilder(actx) + trav, _ = tbuild(actx, tree, debug=True) - trav = trav.get(queue=actx.queue) + trav = actx.to_numpy(trav) weights = np.ones_like(sources[0]) from 
boxtree.pyfmmlib_integration import ( @@ -585,7 +589,7 @@ def fmm_level_to_order(tree, lev): tree_indep, trav, helmholtz_k=helmholtz_k, fmm_level_to_order=fmm_level_to_order, - rotation_data=FMMLibRotationData(actx.queue, trav)) + rotation_data=FMMLibRotationData(actx, trav)) from boxtree.fmm import drive_fmm pot = drive_fmm(wrangler, (weights,)) @@ -625,8 +629,8 @@ def test_interaction_list_particle_count_thresholding(actx_factory, enable_exten from_sep_smaller_min_nsources_cumul = 1 + max_particles_in_box from boxtree.fmm import drive_fmm - sources = p_normal(actx.queue, nsources, dims, dtype, seed=15) - targets = p_normal(actx.queue, ntargets, dims, dtype, seed=15) + sources = p_normal(actx, nsources, dims, dtype, seed=15) + targets = p_normal(actx, ntargets, dims, dtype, seed=15) rng = np.random.default_rng(22) if enable_extents: @@ -637,22 +641,22 @@ def test_interaction_list_particle_count_thresholding(actx_factory, enable_exten target_radii = None from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) + tb = TreeBuilder(actx) - tree, _ = tb(actx.queue, sources, targets=targets, + tree, _ = tb(actx, sources, targets=targets, max_particles_in_box=max_particles_in_box, target_radii=target_radii, debug=True, stick_out_factor=0.25) from boxtree.traversal import FMMTraversalBuilder - tbuild = FMMTraversalBuilder(actx.context) - trav, _ = tbuild(actx.queue, tree, debug=True, + tbuild = FMMTraversalBuilder(actx) + trav, _ = tbuild(actx, tree, debug=True, _from_sep_smaller_min_nsources_cumul=from_sep_smaller_min_nsources_cumul) weights = np.ones(nsources) weights_sum = np.sum(weights) - host_trav = trav.get(queue=actx.queue) + host_trav = actx.to_numpy(trav) tree_indep = ConstantOneTreeIndependentDataForWrangler() wrangler = ConstantOneExpansionWrangler(tree_indep, host_trav) @@ -680,8 +684,8 @@ def test_fmm_float32(actx_factory, enable_extents): dtype = np.float32 from boxtree.fmm import drive_fmm - sources = p_normal(actx.queue, nsources, dims, dtype, 
seed=15) - targets = p_normal(actx.queue, ntargets, dims, dtype, seed=15) + sources = p_normal(actx, nsources, dims, dtype, seed=15) + targets = p_normal(actx, ntargets, dims, dtype, seed=15) rng = np.random.default_rng(12) if enable_extents: @@ -692,21 +696,21 @@ def test_fmm_float32(actx_factory, enable_extents): target_radii = None from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) + tb = TreeBuilder(actx) - tree, _ = tb(actx.queue, sources, targets=targets, + tree, _ = tb(actx, sources, targets=targets, max_particles_in_box=30, target_radii=target_radii, debug=True, stick_out_factor=0.25) from boxtree.traversal import FMMTraversalBuilder - tbuild = FMMTraversalBuilder(actx.context) - trav, _ = tbuild(actx.queue, tree, debug=True) + tbuild = FMMTraversalBuilder(actx) + trav, _ = tbuild(actx, tree, debug=True) weights = np.ones(nsources) weights_sum = np.sum(weights) - host_trav = trav.get(queue=actx.queue) + host_trav = actx.to_numpy(trav) tree_indep = ConstantOneTreeIndependentDataForWrangler() wrangler = ConstantOneExpansionWrangler(tree_indep, host_trav) @@ -732,21 +736,21 @@ def test_fmm_with_optimized_3d_m2l(actx_factory, nsrcntgts, helmholtz_k, nsources = ntargets = nsrcntgts // 2 dtype = np.float64 - sources = p_normal(actx.queue, nsources, dims, dtype, seed=15) + sources = p_normal(actx, nsources, dims, dtype, seed=15) targets = ( - p_normal(actx.queue, ntargets, dims, dtype, seed=18) + p_normal(actx, ntargets, dims, dtype, seed=18) + np.array([2, 0, 0])[:dims]) from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) + tb = TreeBuilder(actx) - tree, _ = tb(actx.queue, sources, targets=targets, + tree, _ = tb(actx, sources, targets=targets, max_particles_in_box=30, debug=True) from boxtree.traversal import FMMTraversalBuilder - tbuild = FMMTraversalBuilder(actx.context) - trav, _ = tbuild(actx.queue, tree, debug=True) - trav = trav.get(queue=actx.queue) + tbuild = FMMTraversalBuilder(actx) + trav, _ = tbuild(actx, tree, debug=True) 
+ trav = actx.to_numpy(trav) rng = np.random.default_rng(20) weights = rng.uniform(0.0, 1.0, (nsources,)) @@ -781,7 +785,7 @@ def fmm_level_to_order(tree, lev): tree_indep, trav, helmholtz_k=helmholtz_k, fmm_level_to_order=fmm_level_to_order, - rotation_data=FMMLibRotationData(actx.queue, trav)) + rotation_data=FMMLibRotationData(actx, trav)) from boxtree.fmm import drive_fmm From 994a3596e2078a161f0e335ba9943d8a92200304 Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Thu, 23 Jun 2022 21:58:27 +0300 Subject: [PATCH 19/28] port test_cost_model to arraycontext --- test/test_cost_model.py | 101 ++++++++++++++++++++-------------------- 1 file changed, 50 insertions(+), 51 deletions(-) diff --git a/test/test_cost_model.py b/test/test_cost_model.py index cd1fb95b..7c382a1a 100644 --- a/test/test_cost_model.py +++ b/test/test_cost_model.py @@ -32,10 +32,7 @@ import pytest from arraycontext import pytest_generate_tests_for_array_contexts -from boxtree.array_context import ( - PytestPyOpenCLArrayContextFactory, - _acf, # noqa: F401 -) +from boxtree.array_context import PytestPyOpenCLArrayContextFactory, _acf # noqa: F401 from boxtree.cost import ( FMMCostModel, _PythonFMMCostModel, @@ -64,8 +61,8 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt # {{{ Generate sources, targets and target_radii from boxtree.tools import make_normal_particle_array as p_normal - sources = p_normal(actx.queue, nsources, dims, dtype, seed=15) - targets = p_normal(actx.queue, ntargets, dims, dtype, seed=18) + sources = p_normal(actx, nsources, dims, dtype, seed=15) + targets = p_normal(actx, ntargets, dims, dtype, seed=18) rng = np.random.default_rng(22) target_radii = rng.uniform(0.0, 0.05, (ntargets,)).astype(dtype) @@ -75,16 +72,16 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt # {{{ Generate tree and traversal from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) + tb = TreeBuilder(actx) tree, _ = tb( - 
actx.queue, sources, targets=targets, target_radii=target_radii, + actx, sources, targets=targets, target_radii=target_radii, stick_out_factor=0.15, max_particles_in_box=30, debug=True ) from boxtree.traversal import FMMTraversalBuilder - tg = FMMTraversalBuilder(actx.context, well_sep_is_n_away=2) - trav_dev, _ = tg(actx.queue, tree, debug=True) - trav = trav_dev.get(queue=actx.queue) + tg = FMMTraversalBuilder(actx, well_sep_is_n_away=2) + trav_dev, _ = tg(actx, tree, debug=True) + trav = actx.to_numpy(trav_dev) # }}} @@ -112,12 +109,12 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt context=constant_one_params ) p2m_cost_dev = actx.from_numpy(p2m_cost) - actx.queue.finish() + start_time = time.time() cl_form_multipoles = cl_cost_model.process_form_multipoles( - actx.queue, trav_dev, p2m_cost_dev + actx, trav_dev, p2m_cost_dev ) actx.queue.finish() @@ -127,7 +124,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt start_time = time.time() python_form_multipoles = python_cost_model.process_form_multipoles( - actx.queue, trav, p2m_cost + actx, trav, p2m_cost ) logger.info("Python time for process_form_multipoles: %gs", @@ -150,7 +147,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt actx.queue.finish() start_time = time.time() cl_coarsen_multipoles = cl_cost_model.process_coarsen_multipoles( - actx.queue, trav_dev, m2m_cost_dev + actx, trav_dev, m2m_cost_dev ) actx.queue.finish() @@ -160,7 +157,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt start_time = time.time() python_coarsen_multipoles = python_cost_model.process_coarsen_multipoles( - actx.queue, trav, m2m_cost + actx, trav, m2m_cost ) logger.info("Python time for coarsen_multipoles: %gs", @@ -176,10 +173,10 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt start_time = time.time() cl_ndirect_sources_per_target_box = \ - 
cl_cost_model.get_ndirect_sources_per_target_box(actx.queue, trav_dev) + cl_cost_model.get_ndirect_sources_per_target_box(actx, trav_dev) cl_direct = cl_cost_model.process_direct( - actx.queue, trav_dev, cl_ndirect_sources_per_target_box, 5.0 + actx, trav_dev, cl_ndirect_sources_per_target_box, 5.0 ) actx.queue.finish() @@ -189,10 +186,10 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt start_time = time.time() python_ndirect_sources_per_target_box = \ - python_cost_model.get_ndirect_sources_per_target_box(actx.queue, trav) + python_cost_model.get_ndirect_sources_per_target_box(actx, trav) python_direct = python_cost_model.process_direct( - actx.queue, trav, python_ndirect_sources_per_target_box, 5.0 + actx, trav, python_ndirect_sources_per_target_box, 5.0 ) logger.info("Python time for process_direct: %gs", @@ -206,7 +203,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt start_time = time.time() - cl_direct_aggregate = cl_cost_model.aggregate_over_boxes(cl_direct) + cl_direct_aggregate = cl_cost_model.aggregate_over_boxes(actx, cl_direct) actx.queue.finish() logger.info("OpenCL time for aggregate_over_boxes: %gs", @@ -214,7 +211,9 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt start_time = time.time() - python_direct_aggregate = python_cost_model.aggregate_over_boxes(python_direct) + python_direct_aggregate = ( + python_cost_model.aggregate_over_boxes(actx, python_direct) + ) logger.info("Python time for aggregate_over_boxes: %gs", time.time() - start_time) @@ -237,14 +236,14 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt actx.queue.finish() start_time = time.time() - cl_m2l_cost = cl_cost_model.process_list2(actx.queue, trav_dev, m2l_cost_dev) + cl_m2l_cost = cl_cost_model.process_list2(actx, trav_dev, m2l_cost_dev) actx.queue.finish() logger.info("OpenCL time for process_list2: %gs", time.time() - start_time) start_time 
= time.time() - python_m2l_cost = python_cost_model.process_list2(actx.queue, trav, m2l_cost) + python_m2l_cost = python_cost_model.process_list2(actx, trav, m2l_cost) logger.info("Python time for process_list2: %gs", time.time() - start_time) @@ -265,14 +264,14 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt actx.queue.finish() start_time = time.time() - cl_m2p_cost = cl_cost_model.process_list3(actx.queue, trav_dev, m2p_cost_dev) + cl_m2p_cost = cl_cost_model.process_list3(actx, trav_dev, m2p_cost_dev) actx.queue.finish() logger.info("OpenCL time for process_list3: %gs", time.time() - start_time) start_time = time.time() - python_m2p_cost = python_cost_model.process_list3(actx.queue, trav, m2p_cost) + python_m2p_cost = python_cost_model.process_list3(actx, trav, m2p_cost) logger.info("Python time for process_list3: %gs", time.time() - start_time) @@ -293,14 +292,14 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt actx.queue.finish() start_time = time.time() - cl_p2l_cost = cl_cost_model.process_list4(actx.queue, trav_dev, p2l_cost_dev) + cl_p2l_cost = cl_cost_model.process_list4(actx, trav_dev, p2l_cost_dev) actx.queue.finish() logger.info("OpenCL time for process_list4: %gs", time.time() - start_time) start_time = time.time() - python_p2l_cost = python_cost_model.process_list4(actx.queue, trav, p2l_cost) + python_p2l_cost = python_cost_model.process_list4(actx, trav, p2l_cost) logger.info("Python time for process_list4: %gs", time.time() - start_time) @@ -322,7 +321,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt start_time = time.time() cl_refine_locals_cost = cl_cost_model.process_refine_locals( - actx.queue, trav_dev, l2l_cost_dev + actx, trav_dev, l2l_cost_dev ) actx.queue.finish() @@ -331,7 +330,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt start_time = time.time() python_refine_locals_cost = 
python_cost_model.process_refine_locals( - actx.queue, trav, l2l_cost + actx, trav, l2l_cost ) logger.info("Python time for refine_locals: %gs", time.time() - start_time) @@ -354,7 +353,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt start_time = time.time() cl_l2p_cost = cl_cost_model.process_eval_locals( - actx.queue, trav_dev, l2p_cost_dev) + actx, trav_dev, l2p_cost_dev) actx.queue.finish() logger.info("OpenCL time for process_eval_locals: %gs", @@ -362,7 +361,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt start_time = time.time() python_l2p_cost = python_cost_model.process_eval_locals( - actx.queue, trav, l2p_cost) + actx, trav, l2p_cost) logger.info("Python time for process_eval_locals: %gs", time.time() - start_time) @@ -404,8 +403,8 @@ def fmm_level_to_order(tree, ilevel): # {{{ Generate sources, targets and target_radii from boxtree.tools import make_normal_particle_array as p_normal - sources = p_normal(actx.queue, nsources, dims, dtype, seed=15) - targets = p_normal(actx.queue, ntargets, dims, dtype, seed=18) + sources = p_normal(actx, nsources, dims, dtype, seed=15) + targets = p_normal(actx, ntargets, dims, dtype, seed=18) rng = np.random.default_rng(22) target_radii = rng.uniform(0.0, 0.05, (ntargets,)).astype(dtype) @@ -415,16 +414,16 @@ def fmm_level_to_order(tree, ilevel): # {{{ Generate tree and traversal from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) + tb = TreeBuilder(actx) tree, _ = tb( - actx.queue, sources, targets=targets, target_radii=target_radii, + actx, sources, targets=targets, target_radii=target_radii, stick_out_factor=0.15, max_particles_in_box=30, debug=True ) from boxtree.traversal import FMMTraversalBuilder - tg = FMMTraversalBuilder(actx.context, well_sep_is_n_away=2) - trav_dev, _ = tg(actx.queue, tree, debug=True) - trav = trav_dev.get(queue=actx.queue) + tg = FMMTraversalBuilder(actx, well_sep_is_n_away=2) + trav_dev, _ = tg(actx, 
tree, debug=True) + trav = actx.to_numpy(trav_dev) traversals.append(trav) traversals_dev.append(trav_dev) @@ -467,7 +466,7 @@ def test_params_equal(test_params1, test_params2): level_to_order = level_to_orders[icase] python_model_results.append(python_cost_model.cost_per_stage( - actx.queue, traversal, level_to_order, + actx, traversal, level_to_order, _PythonFMMCostModel.get_unit_calibration_params(), )) @@ -486,7 +485,7 @@ def test_params_equal(test_params1, test_params2): level_to_order = level_to_orders[icase] cl_model_results.append(cl_cost_model.cost_per_stage( - actx.queue, traversal, level_to_order, + actx, traversal, level_to_order, FMMCostModel.get_unit_calibration_params(), )) @@ -539,23 +538,23 @@ def test_cost_model_op_counts_agree_with_constantone_wrangler( actx = actx_factory() from boxtree.tools import make_normal_particle_array as p_normal - sources = p_normal(actx.queue, nsources, dims, dtype, seed=16) - targets = p_normal(actx.queue, ntargets, dims, dtype, seed=19) + sources = p_normal(actx, nsources, dims, dtype, seed=16) + targets = p_normal(actx, ntargets, dims, dtype, seed=19) rng = np.random.default_rng(20) target_radii = rng.uniform(0, 0.04, (ntargets,)).astype(dtype) from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) + tb = TreeBuilder(actx) tree, _ = tb( - actx.queue, sources, targets=targets, target_radii=target_radii, + actx, sources, targets=targets, target_radii=target_radii, stick_out_factor=0.15, max_particles_in_box=30, debug=True ) from boxtree.traversal import FMMTraversalBuilder - tg = FMMTraversalBuilder(actx.context, well_sep_is_n_away=2) - trav_dev, _ = tg(actx.queue, tree, debug=True) - trav = trav_dev.get(queue=actx.queue) + tg = FMMTraversalBuilder(actx, well_sep_is_n_away=2) + trav_dev, _ = tg(actx, tree, debug=True) + trav = actx.to_numpy(trav_dev) from boxtree.constant_one import ( ConstantOneExpansionWrangler, @@ -576,7 +575,7 @@ def test_cost_model_op_counts_agree_with_constantone_wrangler( 
level_to_order = np.array([1 for _ in range(tree.nlevels)]) modeled_time = cost_model.cost_per_stage( - actx.queue, trav_dev, level_to_order, + actx, trav_dev, level_to_order, FMMCostModel.get_unit_calibration_params(), ) @@ -595,10 +594,10 @@ def test_cost_model_op_counts_agree_with_constantone_wrangler( total_cost += timing_data[stage]["ops_elapsed"] per_box_cost = cost_model.cost_per_box( - actx.queue, trav_dev, level_to_order, + actx, trav_dev, level_to_order, FMMCostModel.get_unit_calibration_params(), ) - total_aggregate_cost = cost_model.aggregate_over_boxes(per_box_cost) + total_aggregate_cost = cost_model.aggregate_over_boxes(actx, per_box_cost) assert total_cost == ( total_aggregate_cost From 53e7c28b9c3a0c5299ae373afda3a0a1f1e6753c Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Sun, 26 Jun 2022 20:00:45 +0300 Subject: [PATCH 20/28] port test_distributed to arraycontext --- test/test_distributed.py | 70 ++++++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 28 deletions(-) diff --git a/test/test_distributed.py b/test/test_distributed.py index ce975926..41e5ac6e 100644 --- a/test/test_distributed.py +++ b/test/test_distributed.py @@ -39,6 +39,7 @@ ) from boxtree.pyfmmlib_integration import ( FMMLibExpansionWrangler, + FMMLibRotationData, FMMLibTreeIndependentDataForWrangler, Kernel, ) @@ -84,7 +85,7 @@ def fmm_level_to_order(tree, level): actx = _acf() from boxtree.traversal import FMMTraversalBuilder - tg = FMMTraversalBuilder(actx.context, well_sep_is_n_away=2) + tg = FMMTraversalBuilder(actx, well_sep_is_n_away=2) tree_indep = FMMLibTreeIndependentDataForWrangler( dims, Kernel.HELMHOLTZ if helmholtz_k else Kernel.LAPLACE) @@ -93,8 +94,8 @@ def fmm_level_to_order(tree, level): if rank == 0: # Generate random particles and source weights from boxtree.tools import make_normal_particle_array as p_normal - sources = p_normal(actx.queue, nsources, dims, dtype, seed=15) - targets = p_normal(actx.queue, ntargets, dims, dtype, seed=18) + 
sources = p_normal(actx, nsources, dims, dtype, seed=15) + targets = p_normal(actx, ntargets, dims, dtype, seed=18) rng = np.random.default_rng(20) sources_weights = rng.uniform(0.0, 1.0, (nsources,)) @@ -102,19 +103,20 @@ def fmm_level_to_order(tree, level): # Build the tree and interaction lists from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) + tb = TreeBuilder(actx) global_tree_dev, _ = tb( - actx.queue, sources, targets=targets, target_radii=target_radii, + actx, sources, targets=targets, target_radii=target_radii, stick_out_factor=0.25, max_particles_in_box=30, debug=True) - d_trav, _ = tg(actx.queue, global_tree_dev, debug=True) - global_traversal_host = d_trav.get(queue=actx.queue) + d_trav, _ = tg(actx, global_tree_dev, debug=True) + global_traversal_host = actx.to_numpy(d_trav) global_tree_host = global_traversal_host.tree # Get pyfmmlib expansion wrangler wrangler = FMMLibExpansionWrangler( tree_indep, global_traversal_host, - fmm_level_to_order=fmm_level_to_order) + fmm_level_to_order=fmm_level_to_order, + rotation_data=FMMLibRotationData(actx, global_traversal_host)) # Compute FMM with one MPI rank from boxtree.fmm import drive_fmm @@ -128,13 +130,13 @@ def wrangler_factory(local_traversal, global_traversal): ) return DistributedFMMLibExpansionWrangler( - actx.context, comm, tree_indep, local_traversal, global_traversal, + comm, tree_indep, local_traversal, global_traversal, fmm_level_to_order=fmm_level_to_order, communicate_mpoles_via_allreduce=communicate_mpoles_via_allreduce) from boxtree.distributed import DistributedFMMRunner distributed_fmm_info = DistributedFMMRunner( - actx.queue, global_tree_host, tg, wrangler_factory, comm=comm) + actx, global_tree_host, tg, wrangler_factory, comm=comm) timing_data = {} pot_dfmm = distributed_fmm_info.drive_dfmm( @@ -188,31 +190,42 @@ def test_against_shared( # {{{ test_constantone def _test_constantone(tmp_cache_basedir, dims, nsources, ntargets, dtype): - from boxtree.distributed.calculation 
import DistributedExpansionWrangler + from boxtree.distributed.calculation import DistributedExpansionWranglerMixin class ConstantOneExpansionWrangler( - ConstantOneExpansionWranglerBase, DistributedExpansionWrangler): + DistributedExpansionWranglerMixin, + ConstantOneExpansionWranglerBase): def __init__( - self, queue, comm, tree_indep, local_traversal, global_traversal): - DistributedExpansionWrangler.__init__( - self, queue, comm, global_traversal, False, - communicate_mpoles_via_allreduce=True) + self, array_context, comm, + tree_indep, local_traversal, global_traversal): ConstantOneExpansionWranglerBase.__init__( self, tree_indep, local_traversal) + + self._setup_actx = array_context + self.comm = comm + self.global_traversal = global_traversal + self.communicate_mpoles_via_allreduce = True + self.level_orders = np.ones(local_traversal.tree.nlevels, dtype=np.int32) def reorder_sources(self, source_array): - if self.comm.Get_rank() == 0: + if self.is_mpi_root: return source_array[self.global_traversal.tree.user_source_ids] else: return None def reorder_potentials(self, potentials): - if self.comm.Get_rank() == 0: + if self.is_mpi_root: return potentials[self.global_traversal.tree.sorted_target_ids] else: return None + def finalize_potentials(self, potentials, template_ary): + if self.is_mpi_root: + return super().finalize_potentials(potentials, template_ary) + else: + return None + from mpi4py import MPI # Get the current rank @@ -229,14 +242,14 @@ def reorder_potentials(self, potentials): actx = _acf() from boxtree.traversal import FMMTraversalBuilder - tg = FMMTraversalBuilder(actx.context) + tg = FMMTraversalBuilder(actx) if rank == 0: # Generate random particles from boxtree.tools import make_normal_particle_array as p_normal - sources = p_normal(actx.queue, nsources, dims, dtype, seed=15) - targets = (p_normal(actx.queue, ntargets, dims, dtype, seed=18) + sources = p_normal(actx, nsources, dims, dtype, seed=15) + targets = (p_normal(actx, ntargets, dims, 
dtype, seed=18) + np.array([2, 0, 0])[:dims]) # Constant one source weights @@ -244,21 +257,20 @@ def reorder_potentials(self, potentials): # Build the global tree from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - tree, _ = tb( - actx.queue, sources, targets=targets, max_particles_in_box=30, - debug=True) - tree = tree.get(actx.queue) + tb = TreeBuilder(actx) + tree, _ = tb(actx, sources, targets=targets, max_particles_in_box=30, + debug=True) + tree = actx.to_numpy(tree) tree_indep = ConstantOneTreeIndependentDataForWrangler() def wrangler_factory(local_traversal, global_traversal): return ConstantOneExpansionWrangler( - actx.queue, comm, tree_indep, local_traversal, global_traversal) + actx, comm, tree_indep, local_traversal, global_traversal) from boxtree.distributed import DistributedFMMRunner distributed_fmm_info = DistributedFMMRunner( - actx.queue, tree, tg, wrangler_factory, comm=MPI.COMM_WORLD) + actx, tree, tg, wrangler_factory, comm=MPI.COMM_WORLD) pot_dfmm = distributed_fmm_info.drive_dfmm([sources_weights]) @@ -322,3 +334,5 @@ def test_constantone(tmp_path, num_processes, dims, nsources, ntargets): ntargets = 10000 _test_against_shared(tmp_cache_basedir, dims, nsources, ntargets, dtype) + +# vim: fdm=marker From f50c1d90f14ed8a6abd6078d3585188fc64c329e Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Sun, 26 Jun 2022 20:00:26 +0300 Subject: [PATCH 21/28] port examples to arraycontext --- examples/cost_model.py | 44 ++++++++++++++---------------------------- examples/demo.py | 31 +++++++++++++++-------------- 2 files changed, 32 insertions(+), 43 deletions(-) diff --git a/examples/cost_model.py b/examples/cost_model.py index 60943210..7b783891 100644 --- a/examples/cost_model.py +++ b/examples/cost_model.py @@ -1,31 +1,18 @@ import logging import os -import sys import numpy as np import pyopencl as cl -# Configure the root logger logging.basicConfig(level=os.environ.get("LOGLEVEL", "WARNING")) - logger = logging.getLogger(__name__) - 
-# Set the logger level of this module to INFO so that logging outputs of this module -# are shown logger.setLevel(logging.INFO) -# `process_elapsed` in `ProcessTimer` is only supported for Python >= 3.3 -SUPPORTS_PROCESS_TIME = (sys.version_info >= (3, 3)) - def demo_cost_model(): - if not SUPPORTS_PROCESS_TIME: - raise NotImplementedError( - "Currently this script uses process time which only works on Python>=3.3" - ) - + from boxtree.array_context import PyOpenCLArrayContext from boxtree.pyfmmlib_integration import ( FMMLibExpansionWrangler, FMMLibTreeIndependentDataForWrangler, @@ -40,6 +27,7 @@ def demo_cost_model(): ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) + actx = PyOpenCLArrayContext(queue, force_device_scalars=True) traversals = [] traversals_dev = [] @@ -53,31 +41,29 @@ def fmm_level_to_order(tree, ilevel): # {{{ Generate sources, targets and target_radii from boxtree.tools import make_normal_particle_array as p_normal - sources = p_normal(queue, nsources, dims, dtype, seed=15) - targets = p_normal(queue, ntargets, dims, dtype, seed=18) - - from pyopencl.clrandom import PhiloxGenerator + sources = p_normal(actx, nsources, dims, dtype, seed=15) + targets = p_normal(actx, ntargets, dims, dtype, seed=18) - clrng = PhiloxGenerator(queue.context, seed=22) - target_radii = clrng.uniform( - queue, ntargets, a=0, b=0.05, dtype=dtype - ).get() + rng = np.random.default_rng(seed=22) + target_radii = actx.from_numpy( + rng.uniform(low=0.0, high=0.05, size=ntargets) + ) # }}} # {{{ Generate tree and traversal from boxtree import TreeBuilder - tb = TreeBuilder(ctx) + tb = TreeBuilder(actx) tree, _ = tb( - queue, sources, targets=targets, target_radii=target_radii, + actx, sources, targets=targets, target_radii=target_radii, stick_out_factor=0.15, max_particles_in_box=30, debug=True ) from boxtree.traversal import FMMTraversalBuilder - tg = FMMTraversalBuilder(ctx, well_sep_is_n_away=2) - trav_dev, _ = tg(queue, tree, debug=True) - trav = 
trav_dev.get(queue=queue) + tg = FMMTraversalBuilder(actx, well_sep_is_n_away=2) + trav_dev, _ = tg(actx, tree, debug=True) + trav = actx.to_numpy(trav_dev) traversals.append(trav) traversals_dev.append(trav_dev) @@ -107,7 +93,7 @@ def fmm_level_to_order(tree, ilevel): traversal = traversals_dev[icase] model_results.append( cost_model.cost_per_stage( - queue, traversal, level_orders_list[icase], + actx, traversal, level_orders_list[icase], FMMCostModel.get_unit_calibration_params(), ) ) @@ -118,7 +104,7 @@ def fmm_level_to_order(tree, ilevel): ) predicted_time = cost_model.cost_per_stage( - queue, traversals_dev[-1], level_orders_list[-1], params, + actx, traversals_dev[-1], level_orders_list[-1], params, ) queue.finish() diff --git a/examples/demo.py b/examples/demo.py index cb87bef7..f3ce1715 100644 --- a/examples/demo.py +++ b/examples/demo.py @@ -8,8 +8,12 @@ logging.basicConfig(level="INFO") +from boxtree.array_context import PyOpenCLArrayContext + + ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) +actx = PyOpenCLArrayContext(queue, force_device_scalars=True) dims = 2 nparticles = 500 @@ -17,16 +21,13 @@ # ----------------------------------------------------------------------------- # generate some random particle positions # ----------------------------------------------------------------------------- -from pyopencl.clrandom import PhiloxGenerator - - -rng = PhiloxGenerator(ctx, seed=15) - from pytools.obj_array import make_obj_array +rng = np.random.default_rng(seed=15) + particles = make_obj_array([ - rng.normal(queue, nparticles, dtype=np.float64) + actx.from_numpy(rng.normal(size=nparticles)) for i in range(dims)]) # ----------------------------------------------------------------------------- @@ -35,14 +36,14 @@ from boxtree import TreeBuilder -tb = TreeBuilder(ctx) -tree, _ = tb(queue, particles, max_particles_in_box=5) +tb = TreeBuilder(actx) +tree, _ = tb(actx, particles, max_particles_in_box=5) from boxtree.traversal import 
FMMTraversalBuilder -tg = FMMTraversalBuilder(ctx) -trav, _ = tg(queue, tree) +tg = FMMTraversalBuilder(actx) +trav, _ = tg(actx, tree) # ENDEXAMPLE @@ -50,15 +51,17 @@ # plot the tree # ----------------------------------------------------------------------------- -import matplotlib.pyplot as pt - +particles = actx.to_numpy(particles) +tree = actx.to_numpy(tree) -pt.plot(particles[0].get(), particles[1].get(), "+") +import matplotlib.pyplot as pt from boxtree.visualization import TreePlotter -plotter = TreePlotter(tree.get(queue=queue)) +pt.plot(particles[0], particles[1], "+") +plotter = TreePlotter(tree) + plotter.draw_tree(fill=False, edgecolor="black") # plotter.draw_box_numbers() plotter.set_bounding_box() From 8efb304b70b1706c89f133ef1516b46028301777 Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Sun, 26 Jun 2022 20:18:43 +0300 Subject: [PATCH 22/28] remove ImmutableHostDeviceArray --- boxtree/tools.py | 92 ++++-------------------------------------------- 1 file changed, 6 insertions(+), 86 deletions(-) diff --git a/boxtree/tools.py b/boxtree/tools.py index ab4240c6..8c72275f 100644 --- a/boxtree/tools.py +++ b/boxtree/tools.py @@ -127,7 +127,7 @@ def get_2d_knl(dtype): knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") - return knl.executor(queue.context) + return knl.executor(actx.context) _evt, result = get_2d_knl(dtype)(actx.queue, n=nparticles) @@ -161,7 +161,7 @@ def get_3d_knl(dtype): knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1") knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0") - return knl.executor(queue.context) + return knl.executor(actx.context) _evt, result = get_3d_knl(dtype)(actx.queue, n=n) @@ -204,7 +204,7 @@ def get_2d_knl(dtype): knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1") knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0") - return knl.executor(queue.context) + return knl.executor(actx.context) _evt, result = 
get_2d_knl(dtype)(actx.queue, n=n) @@ -252,7 +252,7 @@ def get_3d_knl(dtype): knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1") knl = lp.split_iname(knl, "k", 16, outer_tag="g.0", inner_tag="l.0") - return knl.executor(queue.context) + return knl.executor(actx.context) _evt, result = get_3d_knl(dtype)(actx.queue, n=n) @@ -326,9 +326,8 @@ def transform_val(val): def get(self, queue, **kwargs): """ :returns: a copy of *self* in which all data lives on the host, i.e. - all :class:`pyopencl.array.Array` and `ImmutableHostDeviceArray` - objects are replaced by corresponding :class:`numpy.ndarray` - instances on the host. + all :class:`pyopencl.array.Array` objects are replaced by + corresponding :class:`numpy.ndarray` instances on the host. """ from warnings import warn warn(f"{type(self).__name__}.get is deprecated and will be removed " @@ -336,9 +335,6 @@ def get(self, queue, **kwargs): DeprecationWarning, stacklevel=2) def try_get(attr): - if isinstance(attr, ImmutableHostDeviceArray): - return attr.host - try: return attr.get(queue=queue, **kwargs) except AttributeError: @@ -383,8 +379,6 @@ def to_device(self, queue, exclude_fields=frozenset()): def _to_device(attr): if isinstance(attr, np.ndarray): return cl.array.to_device(queue, attr).with_queue(None) - elif isinstance(attr, ImmutableHostDeviceArray): - return attr.device elif isinstance(attr, DeviceDataRecord): return attr.to_device(queue) else: @@ -392,31 +386,6 @@ def _to_device(attr): return self._transform_arrays(_to_device, exclude_fields=exclude_fields) - def to_host_device_array(self, queue, exclude_fields=frozenset()): - """ - :arg exclude_fields: a :class:`frozenset` containing fields excluded - from transformed to `ImmutableHostDeviceArray`. - - :returns: a copy of *self* where all device and host arrays are - transformed to `ImmutableHostDeviceArray` objects. 
- """ - from warnings import warn - warn(f"{type(self).__name__}.to_host_device_array is deprecated and will " - "be removed in 2025. Switch from ImmutableHostDeviceArray.", - DeprecationWarning, stacklevel=2) - - def _to_host_device_array(attr): - if isinstance(attr, np.ndarray | cl.array.Array): - return ImmutableHostDeviceArray(queue, attr) - elif isinstance(attr, DeviceDataRecord): - return attr.to_host_device_array(queue) - else: - return attr - - return self._transform_arrays( - _to_host_device_array, exclude_fields=exclude_fields - ) - # }}} @@ -910,55 +879,6 @@ def run_mpi(script: str, num_processes: int, env: dict[str, Any]) -> None: # }}} -# {{{ HostDeviceArray - -class ImmutableHostDeviceArray: - """Interface for arrays on both host and device. - - .. note:: This interface assumes the array is immutable. The behavior of - modifying the content of either the host array or the device array is undefined. - - @TODO: Once available, replace this implementation with PyOpenCL's in-house - implementation. 
- """ - def __init__(self, queue, array): - self.queue = queue - self.shape = array.shape - self.host_array = None - self.device_array = None - - if isinstance(array, np.ndarray): - self.host_array = array - elif isinstance(array, cl.array.Array): - self.device_array = array - - def with_queue(self, queue): - self.queue = queue - - @property - def svm_capable(self): - svm_capabilities = \ - self.queue.device.get_info(cl.device_info.SVM_CAPABILITIES) - return svm_capabilities & cl.device_svm_capabilities.FINE_GRAIN_BUFFER != 0 - - @property - def host(self): - if self.host_array is None: - self.host_array = self.device_array.get(self.queue) - return self.host_array - - @property - def device(self): - if self.device_array is None: - # @TODO: Use SVM - self.device_array = cl.array.to_device(self.queue, self.host_array) - - self.device_array.with_queue(self.queue) - return self.device_array - -# }}} - - # {{{ coord_vec tools def get_coord_vec_dtype( From 0aee29841ae6512dfd2eea8228db78c60d30d524 Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Fri, 24 Jun 2022 19:36:54 +0300 Subject: [PATCH 23/28] docs: add arraycontext --- .pylintrc-local.yml | 4 -- boxtree/__init__.py | 8 +-- boxtree/tree.py | 4 -- doc/Makefile | 136 +++++--------------------------------------- doc/tools.rst | 2 + 5 files changed, 19 insertions(+), 135 deletions(-) diff --git a/.pylintrc-local.yml b/.pylintrc-local.yml index 745ae717..e6cde7f0 100644 --- a/.pylintrc-local.yml +++ b/.pylintrc-local.yml @@ -3,7 +3,3 @@ - arg: extension-pkg-whitelist val: pyfmmlib - -# Needed for boxtree.tools -- arg: init-hook - val: import sys; sys.setrecursionlimit(2000) diff --git a/boxtree/__init__.py b/boxtree/__init__.py index 6bfda26d..c8928c79 100644 --- a/boxtree/__init__.py +++ b/boxtree/__init__.py @@ -140,15 +140,15 @@ two arrays, one whose name ends in ``_starts``, and another whose name ends in ``_lists``. 
For example, suppose we would like to find the colleagues of box #17 using -:attr:`boxtree.traversal.FMMTraversalInfo.colleagues_starts` +:attr:`boxtree.traversal.FMMTraversalInfo.same_level_non_well_sep_boxes_starts` and -:attr:`boxtree.traversal.FMMTraversalInfo.colleagues_lists`. +:attr:`boxtree.traversal.FMMTraversalInfo.same_level_non_well_sep_boxes_lists`. The following snippet of code achieves this:: ibox = 17 - start, end = colleagues_starts[ibox:ibox+2] - ibox_colleagues = colleagues_lists[start:end] + start, end = same_level_non_well_sep_boxes_starts[ibox:ibox+2] + ibox_colleagues = same_level_non_well_sep_boxes_lists[start:end] This indexing scheme has the following properties: diff --git a/boxtree/tree.py b/boxtree/tree.py index 857c04cc..f4a97d1f 100644 --- a/boxtree/tree.py +++ b/boxtree/tree.py @@ -52,10 +52,6 @@ ^^^^^ .. autoclass:: ParticleListFilter - -.. autofunction:: filter_target_lists_in_user_order - -.. autofunction:: filter_target_lists_in_tree_order """ __copyright__ = "Copyright (C) 2013 Andreas Kloeckner" diff --git a/doc/Makefile b/doc/Makefile index bb66c0e8..d0ac5f2f 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -1,130 +1,20 @@ -# Makefile for Sphinx documentation +# Minimal makefile for Sphinx documentation # -# You can set these variables from the command line. -SPHINXOPTS = -n -SPHINXBUILD = python $(shell which sphinx-build) -PAPER = +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= python $(shell which sphinx-build) +SOURCEDIR = . BUILDDIR = _build -# Internal variables. -PAPEROPT_a4 = -D latex_paper_size=a4 -PAPEROPT_letter = -D latex_paper_size=letter -ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . - -.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest - +# Put it first so that "make" without argument is like "make help". 
help: - @echo "Please use \`make ' where is one of" - @echo " html to make standalone HTML files" - @echo " dirhtml to make HTML files named index.html in directories" - @echo " singlehtml to make a single large HTML file" - @echo " pickle to make pickle files" - @echo " json to make JSON files" - @echo " htmlhelp to make HTML files and a HTML help project" - @echo " qthelp to make HTML files and a qthelp project" - @echo " devhelp to make HTML files and a Devhelp project" - @echo " epub to make an epub" - @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" - @echo " latexpdf to make LaTeX files and run them through pdflatex" - @echo " text to make text files" - @echo " man to make manual pages" - @echo " changes to make an overview of all changed/added/deprecated items" - @echo " linkcheck to check all external links for integrity" - @echo " doctest to run all doctests embedded in the documentation (if enabled)" - -clean: - -rm -rf $(BUILDDIR)/* - -html: - $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." - -dirhtml: - $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." - -singlehtml: - $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml - @echo - @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." - -pickle: - $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle - @echo - @echo "Build finished; now you can process the pickle files." - -json: - $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json - @echo - @echo "Build finished; now you can process the JSON files." - -htmlhelp: - $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp - @echo - @echo "Build finished; now you can run HTML Help Workshop with the" \ - ".hhp project file in $(BUILDDIR)/htmlhelp." 
- -qthelp: - $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp - @echo - @echo "Build finished; now you can run "qcollectiongenerator" with the" \ - ".qhcp project file in $(BUILDDIR)/qthelp, like this:" - @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/boxtree.qhcp" - @echo "To view the help file:" - @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/boxtree.qhc" - -devhelp: - $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp - @echo - @echo "Build finished." - @echo "To view the help file:" - @echo "# mkdir -p $$HOME/.local/share/devhelp/boxtree" - @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/boxtree" - @echo "# devhelp" - -epub: - $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub - @echo - @echo "Build finished. The epub file is in $(BUILDDIR)/epub." - -latex: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo - @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." - @echo "Run \`make' in that directory to run these through (pdf)latex" \ - "(use \`make latexpdf' here to do that automatically)." - -latexpdf: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo "Running LaTeX files through pdflatex..." - make -C $(BUILDDIR)/latex all-pdf - @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." - -text: - $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text - @echo - @echo "Build finished. The text files are in $(BUILDDIR)/text." - -man: - $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man - @echo - @echo "Build finished. The manual pages are in $(BUILDDIR)/man." - -changes: - $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes - @echo - @echo "The overview file is in $(BUILDDIR)/changes." 
+ @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -linkcheck: - $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck - @echo - @echo "Link check complete; look for any errors in the above output " \ - "or in $(BUILDDIR)/linkcheck/output.txt." +.PHONY: help Makefile -doctest: - $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest - @echo "Testing of doctests in the sources finished, look at the " \ - "results in $(BUILDDIR)/doctest/output.txt." +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/doc/tools.rst b/doc/tools.rst index 6db9bc70..0b5225ee 100644 --- a/doc/tools.rst +++ b/doc/tools.rst @@ -4,3 +4,5 @@ Utility Functionality .. automodule:: boxtree.timing .. automodule:: boxtree.constant_one + +.. automodule:: boxtree.array_context From 4ca76b6ed513fabe7d6e46241390394d8a0955ea Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Wed, 7 Sep 2022 20:47:27 +0300 Subject: [PATCH 24/28] update fmm interface for sumpy --- boxtree/constant_one.py | 34 ++++++++++++++----- boxtree/distributed/__init__.py | 9 +++-- boxtree/distributed/calculation.py | 10 +++--- boxtree/fmm.py | 54 +++++++++++++++++++++++------- boxtree/pyfmmlib_integration.py | 42 +++++++++++++++-------- boxtree/traversal.py | 5 ++- examples/cost_model.py | 2 +- test/test_cost_model.py | 4 +-- test/test_distributed.py | 6 ++-- test/test_fmm.py | 39 +++++++++++---------- 10 files changed, 134 insertions(+), 71 deletions(-) diff --git a/boxtree/constant_one.py b/boxtree/constant_one.py index 674efca8..11acf0b4 100644 --- a/boxtree/constant_one.py +++ b/boxtree/constant_one.py @@ -27,6 +27,7 @@ import numpy as np +from boxtree.array_context import PyOpenCLArrayContext from boxtree.fmm import ExpansionWranglerInterface, TreeIndependentDataForWrangler from boxtree.timing 
import DummyTimingFuture @@ -85,7 +86,9 @@ def local_expansions_view(self, local_exps, level): def timing_future(ops): return DummyTimingFuture.from_op_count(ops) - def form_multipoles(self, level_start_source_box_nrs, source_boxes, + def form_multipoles(self, actx: PyOpenCLArrayContext, + level_start_source_box_nrs, + source_boxes, src_weight_vecs): src_weights, = src_weight_vecs mpoles = self.multipole_expansion_zeros() @@ -98,8 +101,10 @@ def form_multipoles(self, level_start_source_box_nrs, source_boxes, return mpoles, self.timing_future(ops) - def coarsen_multipoles(self, level_start_source_parent_box_nrs, - source_parent_boxes, mpoles): + def coarsen_multipoles(self, actx: PyOpenCLArrayContext, + level_start_source_parent_box_nrs, + source_parent_boxes, + mpoles): tree = self.tree ops = 0 @@ -121,7 +126,8 @@ def coarsen_multipoles(self, level_start_source_parent_box_nrs, return mpoles, self.timing_future(ops) - def eval_direct(self, target_boxes, neighbor_sources_starts, + def eval_direct(self, actx: PyOpenCLArrayContext, + target_boxes, neighbor_sources_starts, neighbor_sources_lists, src_weight_vecs): src_weights, = src_weight_vecs pot = self.output_zeros() @@ -146,6 +152,7 @@ def eval_direct(self, target_boxes, neighbor_sources_starts, return pot, self.timing_future(ops) def multipole_to_local(self, + actx: PyOpenCLArrayContext, level_start_target_or_target_parent_box_nrs, target_or_target_parent_boxes, starts, lists, mpole_exps): @@ -166,7 +173,9 @@ def multipole_to_local(self, return local_exps, self.timing_future(ops) def eval_multipoles(self, - target_boxes_by_source_level, from_sep_smaller_nonsiblings_by_level, + actx: PyOpenCLArrayContext, + target_boxes_by_source_level, + from_sep_smaller_nonsiblings_by_level, mpole_exps): pot = self.output_zeros() ops = 0 @@ -188,8 +197,10 @@ def eval_multipoles(self, return pot, self.timing_future(ops) def form_locals(self, + actx: PyOpenCLArrayContext, level_start_target_or_target_parent_box_nrs, - 
target_or_target_parent_boxes, starts, lists, src_weight_vecs): + target_or_target_parent_boxes, + starts, lists, src_weight_vecs): src_weights, = src_weight_vecs local_exps = self.local_expansion_zeros() ops = 0 @@ -211,7 +222,9 @@ def form_locals(self, return local_exps, self.timing_future(ops) - def refine_locals(self, level_start_target_or_target_parent_box_nrs, + def refine_locals(self, + actx: PyOpenCLArrayContext, + level_start_target_or_target_parent_box_nrs, target_or_target_parent_boxes, local_exps): ops = 0 @@ -224,7 +237,10 @@ def refine_locals(self, level_start_target_or_target_parent_box_nrs, return local_exps, self.timing_future(ops) - def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps): + def eval_locals(self, + actx: PyOpenCLArrayContext, + level_start_target_box_nrs, + target_boxes, local_exps): pot = self.output_zeros() ops = 0 @@ -235,7 +251,7 @@ def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps): return pot, self.timing_future(ops) - def finalize_potentials(self, potentials, template_ary): + def finalize_potentials(self, actx: PyOpenCLArrayContext, potentials): return potentials # }}} diff --git a/boxtree/distributed/__init__.py b/boxtree/distributed/__init__.py index 73881352..102a2531 100644 --- a/boxtree/distributed/__init__.py +++ b/boxtree/distributed/__init__.py @@ -293,11 +293,14 @@ def __init__(self, array_context: PyOpenCLArrayContext, global_tree, array_context, global_tree, traversal_builder, wrangler_factory, calibration_params, comm) - def drive_dfmm(self, source_weights, timing_data=None): - """Calculate potentials at target points. 
- """ + def drive_dfmm(self, + actx: PyOpenCLArrayContext, + source_weights, + timing_data=None): + """Calculate potentials at target points.""" from boxtree.fmm import drive_fmm return drive_fmm( + actx, self.wrangler, source_weights, timing_data=timing_data, global_src_idx_all_ranks=self.src_idx_all_ranks, diff --git a/boxtree/distributed/calculation.py b/boxtree/distributed/calculation.py index 22ad296d..d85cc6dc 100644 --- a/boxtree/distributed/calculation.py +++ b/boxtree/distributed/calculation.py @@ -72,7 +72,8 @@ def mpi_size(self): def is_mpi_root(self): return self.mpi_rank == 0 - def distribute_source_weights(self, src_weight_vecs, src_idx_all_ranks): + def distribute_source_weights(self, + actx: PyOpenCLArrayContext, src_weight_vecs, src_idx_all_ranks): if self.is_mpi_root: distribute_weight_req = [] local_src_weight_vecs = np.empty((self.mpi_size,), dtype=object) @@ -95,7 +96,8 @@ def distribute_source_weights(self, src_weight_vecs, src_idx_all_ranks): return local_src_weight_vecs - def gather_potential_results(self, potentials, tgt_idx_all_ranks): + def gather_potential_results(self, + actx: PyOpenCLArrayContext, potentials, tgt_idx_all_ranks): from boxtree.distributed import dtype_to_mpi potentials_mpi_type = dtype_to_mpi(potentials.dtype) gathered_potentials = None @@ -256,8 +258,8 @@ def find_boxes_used_by_subrange( return box_in_subrange - def communicate_mpoles(self, actx: PyOpenCLArrayContext, - mpole_exps, return_stats=False): + def communicate_mpoles(self, + actx: PyOpenCLArrayContext, mpole_exps, return_stats=False): """Based on Algorithm 3: Reduce and Scatter in Lashuk et al. [1]_. 
The main idea is to mimic an allreduce as done on a hypercube network, but to diff --git a/boxtree/fmm.py b/boxtree/fmm.py index 6c13290c..760e67bd 100644 --- a/boxtree/fmm.py +++ b/boxtree/fmm.py @@ -32,6 +32,7 @@ from pytools import ProcessLogger +from boxtree.array_context import PyOpenCLArrayContext from boxtree.traversal import FMMTraversalInfo from boxtree.tree import Tree @@ -156,6 +157,7 @@ def local_expansions_view(self, local_exps, level): @abstractmethod def form_multipoles(self, + actx: PyOpenCLArrayContext, level_start_source_box_nrs, source_boxes, src_weight_vecs): """Return an expansions array @@ -168,6 +170,7 @@ def form_multipoles(self, @abstractmethod def coarsen_multipoles(self, + actx: PyOpenCLArrayContext, level_start_source_parent_box_nrs, source_parent_boxes, mpoles): """For each box in *source_parent_boxes*, @@ -180,6 +183,7 @@ def coarsen_multipoles(self, @abstractmethod def eval_direct(self, + actx: PyOpenCLArrayContext, target_boxes, neighbor_sources_starts, neighbor_sources_lists, src_weight_vecs): """For each box in *target_boxes*, evaluate the influence of the @@ -192,6 +196,7 @@ def eval_direct(self, @abstractmethod def multipole_to_local(self, + actx: PyOpenCLArrayContext, level_start_target_or_target_parent_box_nrs, target_or_target_parent_boxes, starts, lists, mpole_exps): @@ -206,6 +211,7 @@ def multipole_to_local(self, @abstractmethod def eval_multipoles(self, + actx: PyOpenCLArrayContext, target_boxes_by_source_level, from_sep_smaller_by_level, mpole_exps): """For a level *i*, each box in *target_boxes_by_source_level[i]*, evaluate the multipole expansion in *mpole_exps* in the nearby boxes given in @@ -219,6 +225,7 @@ def eval_multipoles(self, @abstractmethod def form_locals(self, + actx: PyOpenCLArrayContext, level_start_target_or_target_parent_box_nrs, target_or_target_parent_boxes, starts, lists, src_weight_vecs): """For each box in *target_or_target_parent_boxes*, form local @@ -233,6 +240,7 @@ def form_locals(self, 
@abstractmethod def refine_locals(self, + actx: PyOpenCLArrayContext, level_start_target_or_target_parent_box_nrs, target_or_target_parent_boxes, local_exps): """For each box in *child_boxes*, @@ -244,6 +252,7 @@ def refine_locals(self, @abstractmethod def eval_locals(self, + actx: PyOpenCLArrayContext, level_start_target_box_nrs, target_boxes, local_exps): """For each box in *target_boxes*, evaluate the local expansion in *local_exps* and return a new potential array. @@ -255,7 +264,7 @@ def eval_locals(self, # }}} @abstractmethod - def finalize_potentials(self, potentials, template_ary): + def finalize_potentials(self, actx: PyOpenCLArrayContext, potentials): """ Postprocess the reordered potentials. This is where global scaling factors could be applied. This is distinct from :meth:`reorder_potentials` @@ -269,7 +278,9 @@ def finalize_potentials(self, potentials, template_ary): type. """ - def distribute_source_weights(self, src_weight_vecs, src_idx_all_ranks): + def distribute_source_weights(self, + actx: PyOpenCLArrayContext, + src_weight_vecs, src_idx_all_ranks): """Used by the distributed implementation for transferring needed source weights from root rank to each worker rank in the communicator. @@ -289,7 +300,9 @@ def distribute_source_weights(self, src_weight_vecs, src_idx_all_ranks): """ return src_weight_vecs - def gather_potential_results(self, potentials, tgt_idx_all_ranks): + def gather_potential_results(self, + actx: PyOpenCLArrayContext, + potentials, tgt_idx_all_ranks): """Used by the distributed implementation for gathering calculated potentials from all worker ranks in the communicator to the root rank. 
@@ -306,7 +319,9 @@ def gather_potential_results(self, potentials, tgt_idx_all_ranks): """ return potentials - def communicate_mpoles(self, mpole_exps, return_stats=False): # noqa: B027 + def communicate_mpoles(self, # noqa: B027 + actx: PyOpenCLArrayContext, + mpole_exps, return_stats=False): """Used by the distributed implementation for forming the complete multipole expansions from the partial multipole expansions. @@ -325,9 +340,12 @@ def communicate_mpoles(self, mpole_exps, return_stats=False): # noqa: B027 # }}} -def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs, +def drive_fmm(actx: PyOpenCLArrayContext, + wrangler: ExpansionWranglerInterface, + src_weight_vecs, *, timing_data=None, - global_src_idx_all_ranks=None, global_tgt_idx_all_ranks=None): + global_src_idx_all_ranks=None, + global_tgt_idx_all_ranks=None): """Top-level driver routine for a fast multipole calculation. In part, this is intended as a template for custom FMMs, in the sense that @@ -374,15 +392,17 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs, fmm_proc = ProcessLogger(logger, "fmm") recorder = TimingRecorder() - src_weight_vecs = [wrangler.reorder_sources(weight) for - weight in src_weight_vecs] + src_weight_vecs = [ + wrangler.reorder_sources(weight) for weight in src_weight_vecs] src_weight_vecs = wrangler.distribute_source_weights( - src_weight_vecs, global_src_idx_all_ranks) + actx, + src_weight_vecs, global_src_idx_all_ranks) # {{{ "Step 2.1:" Construct local multipoles mpole_exps, timing_future = wrangler.form_multipoles( + actx, traversal.level_start_source_box_nrs, traversal.source_boxes, src_weight_vecs) @@ -394,6 +414,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs, # {{{ "Step 2.2:" Propagate multipoles upward mpole_exps, timing_future = wrangler.coarsen_multipoles( + actx, traversal.level_start_source_parent_box_nrs, traversal.source_parent_boxes, mpole_exps) @@ -404,11 +425,12 @@ def drive_fmm(wrangler: 
ExpansionWranglerInterface, src_weight_vecs, # }}} - wrangler.communicate_mpoles(mpole_exps) + wrangler.communicate_mpoles(actx, mpole_exps) # {{{ "Stage 3:" Direct evaluation from neighbor source boxes ("list 1") potentials, timing_future = wrangler.eval_direct( + actx, traversal.target_boxes, traversal.neighbor_source_boxes_starts, traversal.neighbor_source_boxes_lists, @@ -423,6 +445,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs, # {{{ "Stage 4:" translate separated siblings' ("list 2") mpoles to local local_exps, timing_future = wrangler.multipole_to_local( + actx, traversal.level_start_target_or_target_parent_box_nrs, traversal.target_or_target_parent_boxes, traversal.from_sep_siblings_starts, @@ -441,6 +464,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs, # contribution *out* of the downward-propagating local expansions) mpole_result, timing_future = wrangler.eval_multipoles( + actx, traversal.target_boxes_sep_smaller_by_source_level, traversal.from_sep_smaller_by_level, mpole_exps) @@ -456,6 +480,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs, "('list 3 close')") direct_result, timing_future = wrangler.eval_direct( + actx, traversal.target_boxes, traversal.from_sep_close_smaller_starts, traversal.from_sep_close_smaller_lists, @@ -470,6 +495,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs, # {{{ "Stage 6:" form locals for separated bigger source boxes ("list 4") local_result, timing_future = wrangler.form_locals( + actx, traversal.level_start_target_or_target_parent_box_nrs, traversal.target_or_target_parent_boxes, traversal.from_sep_bigger_starts, @@ -482,6 +508,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs, if traversal.from_sep_close_bigger_starts is not None: direct_result, timing_future = wrangler.eval_direct( + actx, traversal.target_boxes, traversal.from_sep_close_bigger_starts, traversal.from_sep_close_bigger_lists, @@ 
-496,6 +523,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs, # {{{ "Stage 7:" propagate local_exps downward local_exps, timing_future = wrangler.refine_locals( + actx, traversal.level_start_target_or_target_parent_box_nrs, traversal.target_or_target_parent_boxes, local_exps) @@ -507,6 +535,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs, # {{{ "Stage 8:" evaluate locals local_result, timing_future = wrangler.eval_locals( + actx, traversal.level_start_target_box_nrs, traversal.target_boxes, local_exps) @@ -518,11 +547,12 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs, # }}} potentials = wrangler.gather_potential_results( - potentials, global_tgt_idx_all_ranks) + actx, + potentials, global_tgt_idx_all_ranks) result = wrangler.reorder_potentials(potentials) - result = wrangler.finalize_potentials(result, template_ary=src_weight_vecs[0]) + result = wrangler.finalize_potentials(actx, result) fmm_proc.done() diff --git a/boxtree/pyfmmlib_integration.py b/boxtree/pyfmmlib_integration.py index 869805b6..4f0ce75c 100644 --- a/boxtree/pyfmmlib_integration.py +++ b/boxtree/pyfmmlib_integration.py @@ -676,7 +676,9 @@ def reorder_potentials(self, potentials): @log_process(logger) @return_timing_data - def form_multipoles(self, level_start_source_box_nrs, source_boxes, + def form_multipoles(self, actx: PyOpenCLArrayContext, + level_start_source_box_nrs, + source_boxes, src_weight_vecs): src_weights, = src_weight_vecs formmp = self.tree_indep.get_routine( @@ -719,8 +721,10 @@ def form_multipoles(self, level_start_source_box_nrs, source_boxes, @log_process(logger) @return_timing_data - def coarsen_multipoles(self, level_start_source_parent_box_nrs, - source_parent_boxes, mpoles): + def coarsen_multipoles(self, actx: PyOpenCLArrayContext, + level_start_source_parent_box_nrs, + source_parent_boxes, + mpoles): tree = self.tree mpmp = self.tree_indep.get_translation_routine(self, "%ddmpmp") @@ -775,8 +779,11 @@ 
def coarsen_multipoles(self, level_start_source_parent_box_nrs, @log_process(logger) @return_timing_data - def eval_direct(self, target_boxes, neighbor_sources_starts, - neighbor_sources_lists, src_weight_vecs): + def eval_direct(self, actx: PyOpenCLArrayContext, + target_boxes, + neighbor_sources_starts, + neighbor_sources_lists, + src_weight_vecs): src_weights, = src_weight_vecs output = self.output_zeros() @@ -819,7 +826,7 @@ def eval_direct(self, target_boxes, neighbor_sources_starts, @log_process(logger) @return_timing_data - def multipole_to_local(self, + def multipole_to_local(self, actx: PyOpenCLArrayContext, level_start_target_or_target_parent_box_nrs, target_or_target_parent_boxes, starts, lists, mpole_exps): @@ -934,8 +941,9 @@ def multipole_to_local(self, @log_process(logger) @return_timing_data - def eval_multipoles(self, - target_boxes_by_source_level, sep_smaller_nonsiblings_by_level, + def eval_multipoles(self, actx: PyOpenCLArrayContext, + target_boxes_by_source_level, + sep_smaller_nonsiblings_by_level, mpole_exps): output = self.output_zeros() @@ -977,9 +985,10 @@ def eval_multipoles(self, @log_process(logger) @return_timing_data - def form_locals(self, + def form_locals(self, actx: PyOpenCLArrayContext, level_start_target_or_target_parent_box_nrs, - target_or_target_parent_boxes, starts, lists, src_weight_vecs): + target_or_target_parent_boxes, + starts, lists, src_weight_vecs): src_weights, = src_weight_vecs local_exps = self.local_expansion_zeros() @@ -1057,8 +1066,10 @@ def form_locals(self, @log_process(logger) @return_timing_data - def refine_locals(self, level_start_target_or_target_parent_box_nrs, - target_or_target_parent_boxes, local_exps): + def refine_locals(self, actx: PyOpenCLArrayContext, + level_start_target_or_target_parent_box_nrs, + target_or_target_parent_boxes, + local_exps): locloc = self.tree_indep.get_translation_routine(self, "%ddlocloc") @@ -1104,7 +1115,10 @@ def refine_locals(self, 
level_start_target_or_target_parent_box_nrs, @log_process(logger) @return_timing_data - def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps): + def eval_locals(self, actx: PyOpenCLArrayContext, + level_start_target_box_nrs, + target_boxes, + local_exps): output = self.output_zeros() taeval = self.tree_indep.get_expn_eval_routine("ta") @@ -1139,7 +1153,7 @@ def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps): return output @log_process(logger) - def finalize_potentials(self, potential, template_ary): + def finalize_potentials(self, actx: PyOpenCLArrayContext, potential): if self.tree_indep.eqn_letter == "l" and self.dim == 2: scale_factor = -1/(2*np.pi) elif self.tree_indep.eqn_letter == "h" and self.dim == 2: diff --git a/boxtree/traversal.py b/boxtree/traversal.py index f1fd5ecf..5bec4015 100644 --- a/boxtree/traversal.py +++ b/boxtree/traversal.py @@ -1706,7 +1706,8 @@ def get_kernel_info(self, *, dimensions, particle_id_dtype, box_id_dtype, sources_are_targets, sources_have_extent, targets_have_extent, extent_norm, source_boxes_has_mask, - source_parent_boxes_has_mask): + source_parent_boxes_has_mask, + debug=False): # {{{ process from_sep_smaller_crit @@ -1748,8 +1749,6 @@ def get_kernel_info(self, *, dimensions, particle_id_dtype, box_id_dtype, # }}} - debug = False - from pyopencl.tools import dtype_to_ctype from boxtree.tree import box_flags_enum diff --git a/examples/cost_model.py b/examples/cost_model.py index 7b783891..8328672f 100644 --- a/examples/cost_model.py +++ b/examples/cost_model.py @@ -79,7 +79,7 @@ def fmm_level_to_order(tree, ilevel): timing_data = {} from boxtree.fmm import drive_fmm src_weights = rng.random(size=tree.nsources, dtype=tree.coord_dtype) - drive_fmm(wrangler, (src_weights,), timing_data=timing_data) + drive_fmm(actx, wrangler, (src_weights,), timing_data=timing_data) timing_results.append(timing_data) diff --git a/test/test_cost_model.py b/test/test_cost_model.py index 
7c382a1a..f3b53980 100644 --- a/test/test_cost_model.py +++ b/test/test_cost_model.py @@ -439,7 +439,7 @@ def fmm_level_to_order(tree, ilevel): timing_data = {} from boxtree.fmm import drive_fmm src_weights = rng.random(size=tree.nsources, dtype=tree.coord_dtype) - drive_fmm(wrangler, (src_weights,), timing_data=timing_data) + drive_fmm(actx, wrangler, (src_weights,), timing_data=timing_data) timing_results.append(timing_data) @@ -566,7 +566,7 @@ def test_cost_model_op_counts_agree_with_constantone_wrangler( timing_data = {} from boxtree.fmm import drive_fmm src_weights = rng.random(size=tree.nsources, dtype=tree.coord_dtype) - drive_fmm(wrangler, (src_weights,), timing_data=timing_data) + drive_fmm(actx, wrangler, (src_weights,), timing_data=timing_data) cost_model = FMMCostModel( translation_cost_model_factory=OpCountingTranslationCostModel diff --git a/test/test_distributed.py b/test/test_distributed.py index 41e5ac6e..45da4bfc 100644 --- a/test/test_distributed.py +++ b/test/test_distributed.py @@ -120,7 +120,7 @@ def fmm_level_to_order(tree, level): # Compute FMM with one MPI rank from boxtree.fmm import drive_fmm - pot_fmm = drive_fmm(wrangler, [sources_weights]) * 2 * np.pi + pot_fmm = drive_fmm(actx, wrangler, [sources_weights]) * 2 * np.pi # Compute FMM using the distributed implementation @@ -140,7 +140,7 @@ def wrangler_factory(local_traversal, global_traversal): timing_data = {} pot_dfmm = distributed_fmm_info.drive_dfmm( - [sources_weights], timing_data=timing_data) + actx, [sources_weights], timing_data=timing_data) assert timing_data # Uncomment the following section to print the time taken of each stage @@ -272,7 +272,7 @@ def wrangler_factory(local_traversal, global_traversal): distributed_fmm_info = DistributedFMMRunner( actx, tree, tg, wrangler_factory, comm=MPI.COMM_WORLD) - pot_dfmm = distributed_fmm_info.drive_dfmm([sources_weights]) + pot_dfmm = distributed_fmm_info.drive_dfmm(actx, [sources_weights]) if rank == 0: assert (np.all(pot_dfmm == 
nsources)) diff --git a/test/test_fmm.py b/test/test_fmm.py index 659ae64e..460e56fb 100644 --- a/test/test_fmm.py +++ b/test/test_fmm.py @@ -52,7 +52,8 @@ # {{{ ref fmmlib pot computation -def get_fmmlib_ref_pot(wrangler, weights, sources_host, targets_host, +def get_fmmlib_ref_pot( + actx, wrangler, weights, sources_host, targets_host, helmholtz_k, dipole_vec=None): dims = sources_host.shape[0] eqn_letter = "h" if helmholtz_k else "l" @@ -85,10 +86,10 @@ def get_fmmlib_ref_pot(wrangler, weights, sources_host, targets_host, kwargs["zk"] = helmholtz_k return wrangler.finalize_potentials( + actx, fmmlib_routine( sources=sources_host, targets=targets_host, - **kwargs)[0], - template_ary=weights) + **kwargs)[0]) # }}} @@ -275,7 +276,7 @@ def test_fmm_completeness(actx_factory, dims, nsources_req, ntargets_req, == weights) from boxtree.fmm import drive_fmm - pot = drive_fmm(wrangler, (weights,)) + pot = drive_fmm(actx, wrangler, (weights,)) if filter_kind: pot = pot[actx.to_numpy(flags) > 0] @@ -293,7 +294,7 @@ def test_fmm_completeness(actx_factory, dims, nsources_req, ntargets_req, for i in range(nsources): unit_vec = np.zeros(nsources, dtype=dtype) unit_vec[i] = 1 - mat[:, i] = drive_fmm(host_trav, wrangler, (unit_vec,)) + mat[:, i] = drive_fmm(actx, wrangler, (unit_vec,)) pb.progress() pb.finished() @@ -407,8 +408,8 @@ def test_pyfmmlib_fmm(actx_factory, dims, use_dipoles, helmholtz_k): p_normal(actx, ntargets, dims, dtype, seed=18) + np.array([2, 0, 0])[:dims]) - sources_host = particle_array_to_host(actx, sources) - targets_host = particle_array_to_host(actx, targets) + sources_host = np.stack(actx.to_numpy(sources)) + targets_host = np.stack(actx.to_numpy(targets)) from boxtree import TreeBuilder tb = TreeBuilder(actx) @@ -459,7 +460,7 @@ def fmm_level_to_order(tree, lev): from boxtree.fmm import drive_fmm timing_data = {} - pot = drive_fmm(wrangler, (weights,), timing_data=timing_data) + pot = drive_fmm(actx, wrangler, (weights,), timing_data=timing_data) 
print(timing_data) assert timing_data @@ -467,8 +468,8 @@ def fmm_level_to_order(tree, lev): logger.info("computing direct (reference) result") - ref_pot = get_fmmlib_ref_pot(wrangler, weights, sources_host.T, - targets_host.T, helmholtz_k, dipole_vec) + ref_pot = get_fmmlib_ref_pot(actx, wrangler, weights, sources_host, + targets_host, helmholtz_k, dipole_vec) rel_err = la.norm(pot - ref_pot, np.inf) / la.norm(ref_pot, np.inf) logger.info("relative l2 error vs fmmlib direct: %g", rel_err) @@ -505,11 +506,9 @@ def fmm_level_to_order(tree, lev): if use_dipoles: knl = DirectionalSourceDerivative(knl) - sumpy_extra_kwargs["src_derivative_dir"] = dipole_vec + sumpy_extra_kwargs["src_derivative_dir"] = actx.from_numpy(dipole_vec) - p2p = P2P(actx.context, - [knl], - exclude_self=False) + p2p = P2P(target_kernels=[knl], exclude_self=False) result, = p2p( actx, @@ -592,14 +591,14 @@ def fmm_level_to_order(tree, lev): rotation_data=FMMLibRotationData(actx, trav)) from boxtree.fmm import drive_fmm - pot = drive_fmm(wrangler, (weights,)) + pot = drive_fmm(actx, wrangler, (weights,)) assert not np.isnan(pot).any() # {{{ ref fmmlib computation logger.info("computing direct (reference) result") - ref_pot = get_fmmlib_ref_pot(wrangler, weights, sources, targets, + ref_pot = get_fmmlib_ref_pot(actx, wrangler, weights, sources, targets, helmholtz_k) rel_err = la.norm(pot - ref_pot, np.inf) / la.norm(ref_pot, np.inf) @@ -661,7 +660,7 @@ def test_interaction_list_particle_count_thresholding(actx_factory, enable_exten tree_indep = ConstantOneTreeIndependentDataForWrangler() wrangler = ConstantOneExpansionWrangler(tree_indep, host_trav) - pot = drive_fmm(wrangler, (weights,)) + pot = drive_fmm(actx, wrangler, (weights,)) assert np.all(pot == weights_sum) @@ -715,7 +714,7 @@ def test_fmm_float32(actx_factory, enable_extents): tree_indep = ConstantOneTreeIndependentDataForWrangler() wrangler = ConstantOneExpansionWrangler(tree_indep, host_trav) - pot = drive_fmm(wrangler, (weights,)) + 
pot = drive_fmm(actx, wrangler, (weights,)) assert np.all(pot == weights_sum) @@ -791,11 +790,11 @@ def fmm_level_to_order(tree, lev): baseline_timing_data = {} baseline_pot = drive_fmm( - baseline_wrangler, (weights,), timing_data=baseline_timing_data) + actx, baseline_wrangler, (weights,), timing_data=baseline_timing_data) optimized_timing_data = {} optimized_pot = drive_fmm( - optimized_wrangler, (weights,), timing_data=optimized_timing_data) + actx, optimized_wrangler, (weights,), timing_data=optimized_timing_data) baseline_time = baseline_timing_data["multipole_to_local"]["process_elapsed"] if baseline_time is not None: From 4024531a397a8254197adefa18047688a69b9e09 Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Fri, 23 Sep 2022 20:38:42 +0300 Subject: [PATCH 25/28] rip out timing collection --- boxtree/constant_one.py | 41 ++------ boxtree/distributed/__init__.py | 6 +- boxtree/fmm.py | 93 ++++++----------- boxtree/pyfmmlib_integration.py | 15 +-- boxtree/timing.py | 171 -------------------------------- doc/misc.rst | 19 +++- doc/tools.rst | 2 - examples/cost_model.py | 8 +- test/test_cost_model.py | 14 +-- test/test_distributed.py | 16 +-- test/test_fmm.py | 22 +--- 11 files changed, 69 insertions(+), 338 deletions(-) delete mode 100644 boxtree/timing.py diff --git a/boxtree/constant_one.py b/boxtree/constant_one.py index 11acf0b4..a771e44e 100644 --- a/boxtree/constant_one.py +++ b/boxtree/constant_one.py @@ -29,7 +29,6 @@ from boxtree.array_context import PyOpenCLArrayContext from boxtree.fmm import ExpansionWranglerInterface, TreeIndependentDataForWrangler -from boxtree.timing import DummyTimingFuture # {{{ constant one wrangler @@ -45,9 +44,6 @@ class ConstantOneExpansionWrangler(ExpansionWranglerInterface): """This implements the 'analytical routines' for a Green's function that is constant 1 everywhere. For 'charges' of 'ones', this should get every particle a copy of the particle count. 
- - Timing results returned by this wrangler contain the field *ops_elapsed*, - which counts approximately the number of floating-point operations required. """ def _get_source_slice(self, ibox): @@ -82,31 +78,24 @@ def local_expansions_view(self, local_exps, level): # FIXME raise NotImplementedError - @staticmethod - def timing_future(ops): - return DummyTimingFuture.from_op_count(ops) - def form_multipoles(self, actx: PyOpenCLArrayContext, level_start_source_box_nrs, source_boxes, src_weight_vecs): src_weights, = src_weight_vecs mpoles = self.multipole_expansion_zeros() - ops = 0 for ibox in source_boxes: pslice = self._get_source_slice(ibox) mpoles[ibox] += np.sum(src_weights[pslice]) - ops += src_weights[pslice].size - return mpoles, self.timing_future(ops) + return mpoles def coarsen_multipoles(self, actx: PyOpenCLArrayContext, level_start_source_parent_box_nrs, source_parent_boxes, mpoles): tree = self.tree - ops = 0 # nlevels-1 is the last valid level index # nlevels-2 is the last valid level that could have children @@ -122,16 +111,14 @@ def coarsen_multipoles(self, actx: PyOpenCLArrayContext, for child in tree.box_child_ids[:, ibox]: if child: mpoles[ibox] += mpoles[child] - ops += 1 - return mpoles, self.timing_future(ops) + return mpoles def eval_direct(self, actx: PyOpenCLArrayContext, target_boxes, neighbor_sources_starts, neighbor_sources_lists, src_weight_vecs): src_weights, = src_weight_vecs pot = self.output_zeros() - ops = 0 for itgt_box, tgt_ibox in enumerate(target_boxes): tgt_pslice = self._get_target_slice(tgt_ibox) @@ -147,9 +134,8 @@ def eval_direct(self, actx: PyOpenCLArrayContext, src_sum += np.sum(src_weights[src_pslice]) pot[tgt_pslice] = src_sum - ops += pot[tgt_pslice].size * nsrcs - return pot, self.timing_future(ops) + return pot def multipole_to_local(self, actx: PyOpenCLArrayContext, @@ -157,7 +143,6 @@ def multipole_to_local(self, target_or_target_parent_boxes, starts, lists, mpole_exps): local_exps = self.local_expansion_zeros() 
- ops = 0 for itgt_box, tgt_ibox in enumerate(target_or_target_parent_boxes): start, end = starts[itgt_box:itgt_box+2] @@ -166,11 +151,10 @@ def multipole_to_local(self, # print tgt_ibox, "<-", lists[start:end] for src_ibox in lists[start:end]: contrib += mpole_exps[src_ibox] - ops += 1 local_exps[tgt_ibox] += contrib - return local_exps, self.timing_future(ops) + return local_exps def eval_multipoles(self, actx: PyOpenCLArrayContext, @@ -178,7 +162,6 @@ def eval_multipoles(self, from_sep_smaller_nonsiblings_by_level, mpole_exps): pot = self.output_zeros() - ops = 0 for level, ssn in enumerate(from_sep_smaller_nonsiblings_by_level): for itgt_box, tgt_ibox in \ @@ -192,9 +175,8 @@ def eval_multipoles(self, contrib += mpole_exps[src_ibox] pot[tgt_pslice] += contrib - ops += pot[tgt_pslice].size * (end - start) - return pot, self.timing_future(ops) + return pot def form_locals(self, actx: PyOpenCLArrayContext, @@ -203,7 +185,6 @@ def form_locals(self, starts, lists, src_weight_vecs): src_weights, = src_weight_vecs local_exps = self.local_expansion_zeros() - ops = 0 for itgt_box, tgt_ibox in enumerate(target_or_target_parent_boxes): start, end = starts[itgt_box:itgt_box+2] @@ -218,38 +199,32 @@ def form_locals(self, contrib += np.sum(src_weights[src_pslice]) local_exps[tgt_ibox] += contrib - ops += nsrcs - return local_exps, self.timing_future(ops) + return local_exps def refine_locals(self, actx: PyOpenCLArrayContext, level_start_target_or_target_parent_box_nrs, target_or_target_parent_boxes, local_exps): - ops = 0 - for target_lev in range(1, self.tree.nlevels): start, stop = level_start_target_or_target_parent_box_nrs[ target_lev:target_lev+2] for ibox in target_or_target_parent_boxes[start:stop]: local_exps[ibox] += local_exps[self.tree.box_parent_ids[ibox]] - ops += 1 - return local_exps, self.timing_future(ops) + return local_exps def eval_locals(self, actx: PyOpenCLArrayContext, level_start_target_box_nrs, target_boxes, local_exps): pot = self.output_zeros() - 
ops = 0 for ibox in target_boxes: tgt_pslice = self._get_target_slice(ibox) pot[tgt_pslice] += local_exps[ibox] - ops += pot[tgt_pslice].size - return pot, self.timing_future(ops) + return pot def finalize_potentials(self, actx: PyOpenCLArrayContext, potentials): return potentials diff --git a/boxtree/distributed/__init__.py b/boxtree/distributed/__init__.py index 102a2531..7974ab58 100644 --- a/boxtree/distributed/__init__.py +++ b/boxtree/distributed/__init__.py @@ -293,16 +293,12 @@ def __init__(self, array_context: PyOpenCLArrayContext, global_tree, array_context, global_tree, traversal_builder, wrangler_factory, calibration_params, comm) - def drive_dfmm(self, - actx: PyOpenCLArrayContext, - source_weights, - timing_data=None): + def drive_dfmm(self, actx: PyOpenCLArrayContext, source_weights): """Calculate potentials at target points.""" from boxtree.fmm import drive_fmm return drive_fmm( actx, self.wrangler, source_weights, - timing_data=timing_data, global_src_idx_all_ranks=self.src_idx_all_ranks, global_tgt_idx_all_ranks=self.tgt_idx_all_ranks) diff --git a/boxtree/fmm.py b/boxtree/fmm.py index 760e67bd..3ac604c5 100644 --- a/boxtree/fmm.py +++ b/boxtree/fmm.py @@ -74,13 +74,15 @@ class ExpansionWranglerInterface(ABC): :class:`TreeIndependentDataForWrangler` exists to hold data that is more broadly reusable. - Functions that support returning timing data return a value supporting the - :class:`~boxtree.timing.TimingFuture` interface. - .. versionchanged:: 2018.1 Changed (a subset of) functions to return timing data. + .. versionchanged:: 2022.1 + + Removed timing data that should be handled by the + :class:`~arraycontext.ArrayContext`. + .. 
attribute:: tree_indep An instance of (a typically wrangler-dependent subclass of) @@ -160,12 +162,10 @@ def form_multipoles(self, actx: PyOpenCLArrayContext, level_start_source_box_nrs, source_boxes, src_weight_vecs): - """Return an expansions array - containing multipole expansions in *source_boxes* due to sources - with *src_weight_vecs*. - All other expansions must be zero. - - :return: A pair (*mpoles*, *timing_future*). + """ + :returns: an expansions array containing multipole expansions in + *source_boxes* due to sources with *src_weight_vecs*. + All other expansions must be zero. """ @abstractmethod @@ -173,12 +173,11 @@ def coarsen_multipoles(self, actx: PyOpenCLArrayContext, level_start_source_parent_box_nrs, source_parent_boxes, mpoles): - """For each box in *source_parent_boxes*, - gather (and translate) the box's children's multipole expansions in - *mpole* and add the resulting expansion into the box's multipole - expansion in *mpole*. + """For each box in *source_parent_boxes*, gather (and translate) the + box's children's multipole expansions in *mpoles* and add the + resulting expansion into the box's multipole expansion in *mpoles*. - :returns: A pair (*mpoles*, *timing_future*). + :returns: the updated *mpoles*. """ @abstractmethod @@ -190,8 +189,7 @@ def eval_direct(self, neighbor sources due to *src_weight_vecs*, which use :ref:`csr` and are indexed like *target_boxes*. - :returns: A pair (*pot*, *timing_future*), where *pot* is a - a new potential array. + :returns: a new potential array. """ @abstractmethod @@ -205,8 +203,7 @@ def multipole_to_local(self, array of local expansions. *starts* and *lists* use :ref:`csr`, and *starts* is indexed like *target_or_target_parent_boxes*. - :returns: A pair (*pot*, *timing_future*) where *pot* is - a new (local) expansion array. + :returns: a new (local) expansion array. 
""" @abstractmethod @@ -219,8 +216,7 @@ def eval_multipoles(self, *starts* and *lists* in *from_sep_smaller_by_level[i]* use :ref:`csr` and *starts* is indexed like *target_boxes_by_source_level[i]*. - :returns: A pair (*pot*, *timing_future*) where *pot* is a new potential - array. + :returns: a new potential array. """ @abstractmethod @@ -234,8 +230,7 @@ def form_locals(self, use :ref:`csr` and *starts* is indexed like *target_or_target_parent_boxes*. - :returns: A pair (*pot*, *timing_future*) where *pot* is a new - local expansion array. + :returns: a new local expansion array. """ @abstractmethod @@ -247,7 +242,7 @@ def refine_locals(self, translate the box's parent's local expansion in *local_exps* and add the resulting expansion into the box's local expansion in *local_exps*. - :returns: A pair (*local_exps*, *timing_future*). + :returns: an updated local expansion array *local_exps*. """ @abstractmethod @@ -257,8 +252,7 @@ def eval_locals(self, """For each box in *target_boxes*, evaluate the local expansion in *local_exps* and return a new potential array. - :returns: A pair (*pot*, *timing_future*) where *pot* is a new potential - array. + :returns: a new potential array. """ # }}} @@ -343,7 +337,6 @@ def communicate_mpoles(self, # noqa: B027 def drive_fmm(actx: PyOpenCLArrayContext, wrangler: ExpansionWranglerInterface, src_weight_vecs, *, - timing_data=None, global_src_idx_all_ranks=None, global_tgt_idx_all_ranks=None): """Top-level driver routine for a fast multipole calculation. @@ -364,9 +357,6 @@ def drive_fmm(actx: PyOpenCLArrayContext, Passed unmodified to *expansion_wrangler*. For distributed implementation, this argument is only significant on the root rank, but worker ranks still need to supply a dummy vector. - :arg timing_data: Either *None*, or a :class:`dict` that is populated with - timing information for the stages of the algorithm (in the form of - :class:`~boxtree.timing.TimingResult`), if such information is available. 
:arg global_src_idx_all_ranks: Only used in the distributed implementation. A :class:`list` of length ``nranks``, where the i-th entry is a :class:`numpy.ndarray` representing the global indices of sources in the @@ -388,9 +378,7 @@ def drive_fmm(actx: PyOpenCLArrayContext, # Interface guidelines: Attributes of the tree are assumed to be known # to the expansion wrangler and should not be passed. - from boxtree.timing import TimingRecorder fmm_proc = ProcessLogger(logger, "fmm") - recorder = TimingRecorder() src_weight_vecs = [ wrangler.reorder_sources(weight) for weight in src_weight_vecs] @@ -401,26 +389,22 @@ def drive_fmm(actx: PyOpenCLArrayContext, # {{{ "Step 2.1:" Construct local multipoles - mpole_exps, timing_future = wrangler.form_multipoles( + mpole_exps = wrangler.form_multipoles( actx, traversal.level_start_source_box_nrs, traversal.source_boxes, src_weight_vecs) - recorder.add("form_multipoles", timing_future) - # }}} # {{{ "Step 2.2:" Propagate multipoles upward - mpole_exps, timing_future = wrangler.coarsen_multipoles( + mpole_exps = wrangler.coarsen_multipoles( actx, traversal.level_start_source_parent_box_nrs, traversal.source_parent_boxes, mpole_exps) - recorder.add("coarsen_multipoles", timing_future) - # mpole_exps is called Phi in [1] # }}} @@ -429,22 +413,20 @@ def drive_fmm(actx: PyOpenCLArrayContext, # {{{ "Stage 3:" Direct evaluation from neighbor source boxes ("list 1") - potentials, timing_future = wrangler.eval_direct( + potentials = wrangler.eval_direct( actx, traversal.target_boxes, traversal.neighbor_source_boxes_starts, traversal.neighbor_source_boxes_lists, src_weight_vecs) - recorder.add("eval_direct", timing_future) - # these potentials are called alpha in [1] # }}} # {{{ "Stage 4:" translate separated siblings' ("list 2") mpoles to local - local_exps, timing_future = wrangler.multipole_to_local( + local_exps = wrangler.multipole_to_local( actx, traversal.level_start_target_or_target_parent_box_nrs, 
traversal.target_or_target_parent_boxes, @@ -452,8 +434,6 @@ def drive_fmm(actx: PyOpenCLArrayContext, traversal.from_sep_siblings_lists, mpole_exps) - recorder.add("multipole_to_local", timing_future) - # local_exps represents both Gamma and Delta in [1] # }}} @@ -463,14 +443,12 @@ def drive_fmm(actx: PyOpenCLArrayContext, # (the point of aiming this stage at particles is specifically to keep its # contribution *out* of the downward-propagating local expansions) - mpole_result, timing_future = wrangler.eval_multipoles( + mpole_result = wrangler.eval_multipoles( actx, traversal.target_boxes_sep_smaller_by_source_level, traversal.from_sep_smaller_by_level, mpole_exps) - recorder.add("eval_multipoles", timing_future) - potentials = potentials + mpole_result # these potentials are called beta in [1] @@ -479,22 +457,20 @@ def drive_fmm(actx: PyOpenCLArrayContext, logger.debug("evaluate separated close smaller interactions directly " "('list 3 close')") - direct_result, timing_future = wrangler.eval_direct( + direct_result = wrangler.eval_direct( actx, traversal.target_boxes, traversal.from_sep_close_smaller_starts, traversal.from_sep_close_smaller_lists, src_weight_vecs) - recorder.add("eval_direct", timing_future) - potentials = potentials + direct_result # }}} # {{{ "Stage 6:" form locals for separated bigger source boxes ("list 4") - local_result, timing_future = wrangler.form_locals( + local_result = wrangler.form_locals( actx, traversal.level_start_target_or_target_parent_box_nrs, traversal.target_or_target_parent_boxes, @@ -502,46 +478,38 @@ def drive_fmm(actx: PyOpenCLArrayContext, traversal.from_sep_bigger_lists, src_weight_vecs) - recorder.add("form_locals", timing_future) - local_exps = local_exps + local_result if traversal.from_sep_close_bigger_starts is not None: - direct_result, timing_future = wrangler.eval_direct( + direct_result = wrangler.eval_direct( actx, traversal.target_boxes, traversal.from_sep_close_bigger_starts, 
traversal.from_sep_close_bigger_lists, src_weight_vecs) - recorder.add("eval_direct", timing_future) - potentials = potentials + direct_result # }}} # {{{ "Stage 7:" propagate local_exps downward - local_exps, timing_future = wrangler.refine_locals( + local_exps = wrangler.refine_locals( actx, traversal.level_start_target_or_target_parent_box_nrs, traversal.target_or_target_parent_boxes, local_exps) - recorder.add("refine_locals", timing_future) - # }}} # {{{ "Stage 8:" evaluate locals - local_result, timing_future = wrangler.eval_locals( + local_result = wrangler.eval_locals( actx, traversal.level_start_target_box_nrs, traversal.target_boxes, local_exps) - recorder.add("eval_locals", timing_future) - potentials = potentials + local_result # }}} @@ -556,9 +524,6 @@ def drive_fmm(actx: PyOpenCLArrayContext, fmm_proc.done() - if timing_data is not None: - timing_data.update(recorder.summarize()) - return result diff --git a/boxtree/pyfmmlib_integration.py b/boxtree/pyfmmlib_integration.py index 4f0ce75c..a9764a7f 100644 --- a/boxtree/pyfmmlib_integration.py +++ b/boxtree/pyfmmlib_integration.py @@ -44,7 +44,6 @@ from boxtree.array_context import PyOpenCLArrayContext from boxtree.fmm import ExpansionWranglerInterface, TreeIndependentDataForWrangler -from boxtree.timing import return_timing_data logger = logging.getLogger(__name__) @@ -271,11 +270,7 @@ def wrapper(*args, **kwargs): class FMMLibExpansionWrangler(ExpansionWranglerInterface): """Implements the :class:`boxtree.fmm.ExpansionWranglerInterface` - by using pyfmmlib. - - Timing results returned by this wrangler contains the values *wall_elapsed* - and (optionally, if supported) *process_elapsed*, which measure wall time - and process time in seconds, respectively. + by using ``pyfmmlib``. 
""" # {{{ constructor @@ -675,7 +670,6 @@ def reorder_potentials(self, potentials): return potentials[self.tree.sorted_target_ids] @log_process(logger) - @return_timing_data def form_multipoles(self, actx: PyOpenCLArrayContext, level_start_source_box_nrs, source_boxes, @@ -720,7 +714,6 @@ def form_multipoles(self, actx: PyOpenCLArrayContext, return mpoles @log_process(logger) - @return_timing_data def coarsen_multipoles(self, actx: PyOpenCLArrayContext, level_start_source_parent_box_nrs, source_parent_boxes, @@ -778,7 +771,6 @@ def coarsen_multipoles(self, actx: PyOpenCLArrayContext, return mpoles @log_process(logger) - @return_timing_data def eval_direct(self, actx: PyOpenCLArrayContext, target_boxes, neighbor_sources_starts, @@ -825,7 +817,6 @@ def eval_direct(self, actx: PyOpenCLArrayContext, return output @log_process(logger) - @return_timing_data def multipole_to_local(self, actx: PyOpenCLArrayContext, level_start_target_or_target_parent_box_nrs, target_or_target_parent_boxes, @@ -940,7 +931,6 @@ def multipole_to_local(self, actx: PyOpenCLArrayContext, return local_exps @log_process(logger) - @return_timing_data def eval_multipoles(self, actx: PyOpenCLArrayContext, target_boxes_by_source_level, sep_smaller_nonsiblings_by_level, @@ -984,7 +974,6 @@ def eval_multipoles(self, actx: PyOpenCLArrayContext, return output @log_process(logger) - @return_timing_data def form_locals(self, actx: PyOpenCLArrayContext, level_start_target_or_target_parent_box_nrs, target_or_target_parent_boxes, @@ -1065,7 +1054,6 @@ def form_locals(self, actx: PyOpenCLArrayContext, return local_exps @log_process(logger) - @return_timing_data def refine_locals(self, actx: PyOpenCLArrayContext, level_start_target_or_target_parent_box_nrs, target_or_target_parent_boxes, @@ -1114,7 +1102,6 @@ def refine_locals(self, actx: PyOpenCLArrayContext, return local_exps @log_process(logger) - @return_timing_data def eval_locals(self, actx: PyOpenCLArrayContext, level_start_target_box_nrs, target_boxes, 
diff --git a/boxtree/timing.py b/boxtree/timing.py deleted file mode 100644 index e3bad59b..00000000 --- a/boxtree/timing.py +++ /dev/null @@ -1,171 +0,0 @@ -""" -.. autoclass:: TimingResult - -.. autoclass:: TimingFuture -""" - -__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" - -__license__ = """ -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -""" - - -from collections.abc import Mapping - - -# {{{ timing result - -class TimingResult(Mapping): - """Interface for returned timing data. - - This supports accessing timing results via a mapping interface, along with - combining results via :meth:`merge`. - - .. 
automethod:: merge - """ - - def __init__(self, *args, **kwargs): - """See constructor for :class:`dict`.""" - self._mapping = dict(*args, **kwargs) - - def __getitem__(self, key): - return self._mapping[key] - - def __iter__(self): - return iter(self._mapping) - - def __len__(self): - return len(self._mapping) - - def merge(self, other): - """Merge this result with another by adding together common fields.""" - result = {} - - for key in self: - val = self.get(key) - other_val = other.get(key) - - if val is None or other_val is None: - continue - - result[key] = val + other_val - - return type(self)(result) - -# }}} - - -# {{{ timing future - -class TimingFuture: - """Returns timing data for a potentially asynchronous operation. - - .. automethod:: result - .. automethod:: done - """ - - def result(self): - """Return a :class:`TimingResult`. May block.""" - raise NotImplementedError - - def done(self): - """Return *True* if the operation is complete.""" - raise NotImplementedError - -# }}} - - -# {{{ timing recorder - -class TimingRecorder: - - def __init__(self): - from collections import defaultdict - self.futures = defaultdict(list) - - def add(self, description, future): - self.futures[description].append(future) - - def summarize(self): - result = {} - - for description, futures_list in self.futures.items(): - futures = iter(futures_list) - - timing_result = next(futures).result() - for future in futures: - timing_result = timing_result.merge(future.result()) - - result[description] = timing_result - - return result - -# }}} - - -# {{{ time recording tool - -class DummyTimingFuture(TimingFuture): - @classmethod - def from_timer(cls, timer): - return cls(wall_elapsed=timer.wall_elapsed, - process_elapsed=timer.process_elapsed) - - @classmethod - def from_op_count(cls, op_count): - return cls(ops_elapsed=op_count) - - def __init__(self, *args, **kwargs): - self._result = TimingResult(*args, **kwargs) - - def result(self): - return self._result - - def 
done(self): - return True - - -def return_timing_data(wrapped): - """A decorator for recording timing data for a function call. - - The decorated function returns a tuple (*retval*, *timing_future*) - where *retval* is the original return value and *timing_future* - supports the timing data future interface in :mod:`boxtree.fmm`. - """ - - from pytools import ProcessTimer - - def wrapper(*args, **kwargs): - timer = ProcessTimer() - retval = wrapped(*args, **kwargs) - timer.done() - - future = DummyTimingFuture.from_timer(timer) - return (retval, future) - - from functools import update_wrapper - new_wrapper = update_wrapper(wrapper, wrapped) - - return new_wrapper - -# }}} - - -# vim: foldmethod=marker diff --git a/doc/misc.rst b/doc/misc.rst index 37f83588..ebfaa9f3 100644 --- a/doc/misc.rst +++ b/doc/misc.rst @@ -27,13 +27,24 @@ For development, you may want to install in `editable mode User-visible Changes ==================== -Version 2019.1 +.. note:: + + You can get snapshots of in-development versions from + :mod:`boxtree`'s `git repository `_. + +Version 2024.1 -------------- -.. note:: +* Use :mod:`arraycontext` as the main array abstraction (over :mod:`pyopencl` + only at the moment). This changed the API of many functions and classes, + since most of them now take an :class:`~arraycontext.ArrayContext` instead + of a :class:`pyopencl.Context`. +* Remove (temporarily) cost model support. This removed the *timing_data* + parameter and return values from the FMM driver. +* Removed *DeviceDataRecord* in favour of array containers from :mod:`arraycontext`. - This version is currently under development. You can get snapshots from - boxtree's `git repository `__ +Version 2019.1 +-------------- * Faster M2Ls in the FMMLIB backend using precomputed rotation matrices. 
This change adds an optional *rotation_data* parameter to the FMMLIB geometry wrangler diff --git a/doc/tools.rst b/doc/tools.rst index 0b5225ee..fd3fc963 100644 --- a/doc/tools.rst +++ b/doc/tools.rst @@ -1,8 +1,6 @@ Utility Functionality ===================== -.. automodule:: boxtree.timing - .. automodule:: boxtree.constant_one .. automodule:: boxtree.array_context diff --git a/examples/cost_model.py b/examples/cost_model.py index 8328672f..87565918 100644 --- a/examples/cost_model.py +++ b/examples/cost_model.py @@ -76,12 +76,9 @@ def fmm_level_to_order(tree, ilevel): fmm_level_to_order=fmm_level_to_order) level_orders_list.append(wrangler.level_orders) - timing_data = {} from boxtree.fmm import drive_fmm src_weights = rng.random(size=tree.nsources, dtype=tree.coord_dtype) - drive_fmm(actx, wrangler, (src_weights,), timing_data=timing_data) - - timing_results.append(timing_data) + drive_fmm(actx, wrangler, (src_weights,)) time_field_name = "process_elapsed" @@ -99,6 +96,9 @@ def fmm_level_to_order(tree, ilevel): ) queue.finish() + if not timing_results: + return + params = cost_model.estimate_calibration_params( model_results, timing_results[:-1], time_field_name=time_field_name ) diff --git a/test/test_cost_model.py b/test/test_cost_model.py index f3b53980..ec5f1d75 100644 --- a/test/test_cost_model.py +++ b/test/test_cost_model.py @@ -375,6 +375,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt # {{{ test_estimate_calibration_params @pytest.mark.opencl +@pytest.mark.skip(reason="cost model is not functional") def test_estimate_calibration_params(actx_factory): from boxtree.pyfmmlib_integration import ( FMMLibExpansionWrangler, @@ -436,12 +437,9 @@ def fmm_level_to_order(tree, ilevel): fmm_level_to_order=fmm_level_to_order) level_to_orders.append(wrangler.level_orders) - timing_data = {} from boxtree.fmm import drive_fmm src_weights = rng.random(size=tree.nsources, dtype=tree.coord_dtype) - drive_fmm(actx, wrangler, 
(src_weights,), timing_data=timing_data) - - timing_results.append(timing_data) + drive_fmm(actx, wrangler, (src_weights,)) time_field_name = "process_elapsed" @@ -458,7 +456,6 @@ def test_params_equal(test_params1, test_params2): assert test_params1[name] == test_params2[name] python_cost_model = _PythonFMMCostModel(make_pde_aware_translation_cost_model) - python_model_results = [] for icase in range(len(traversals)-1): @@ -563,10 +560,10 @@ def test_cost_model_op_counts_agree_with_constantone_wrangler( tree_indep = ConstantOneTreeIndependentDataForWrangler() wrangler = ConstantOneExpansionWrangler(tree_indep, trav) - timing_data = {} from boxtree.fmm import drive_fmm + timing_data = {} src_weights = rng.random(size=tree.nsources, dtype=tree.coord_dtype) - drive_fmm(actx, wrangler, (src_weights,), timing_data=timing_data) + drive_fmm(actx, wrangler, (src_weights,)) cost_model = FMMCostModel( translation_cost_model_factory=OpCountingTranslationCostModel @@ -579,6 +576,9 @@ def test_cost_model_op_counts_agree_with_constantone_wrangler( FMMCostModel.get_unit_calibration_params(), ) + if not timing_data: + return + mismatches = [] for stage in timing_data: if timing_data[stage]["ops_elapsed"] != modeled_time[stage]: diff --git a/test/test_distributed.py b/test/test_distributed.py index 45da4bfc..5faa8dde 100644 --- a/test/test_distributed.py +++ b/test/test_distributed.py @@ -138,21 +138,7 @@ def wrangler_factory(local_traversal, global_traversal): distributed_fmm_info = DistributedFMMRunner( actx, global_tree_host, tg, wrangler_factory, comm=comm) - timing_data = {} - pot_dfmm = distributed_fmm_info.drive_dfmm( - actx, [sources_weights], timing_data=timing_data) - assert timing_data - - # Uncomment the following section to print the time taken of each stage - """ - if rank == 1: - from pytools import Table - table = Table() - table.add_row(["stage", "time (s)"]) - for stage in timing_data: - table.add_row([stage, "%.2f" % timing_data[stage]["wall_elapsed"]]) - 
print(table) - """ + pot_dfmm = distributed_fmm_info.drive_dfmm(actx, [sources_weights]) if rank == 0: error = (la.norm(pot_fmm - pot_dfmm * 2 * np.pi, ord=np.inf) diff --git a/test/test_fmm.py b/test/test_fmm.py index 460e56fb..7e4d0896 100644 --- a/test/test_fmm.py +++ b/test/test_fmm.py @@ -459,10 +459,7 @@ def fmm_level_to_order(tree, lev): from boxtree.fmm import drive_fmm - timing_data = {} - pot = drive_fmm(actx, wrangler, (weights,), timing_data=timing_data) - print(timing_data) - assert timing_data + pot = drive_fmm(actx, wrangler, (weights,)) # {{{ ref fmmlib computation @@ -788,21 +785,8 @@ def fmm_level_to_order(tree, lev): from boxtree.fmm import drive_fmm - baseline_timing_data = {} - baseline_pot = drive_fmm( - actx, baseline_wrangler, (weights,), timing_data=baseline_timing_data) - - optimized_timing_data = {} - optimized_pot = drive_fmm( - actx, optimized_wrangler, (weights,), timing_data=optimized_timing_data) - - baseline_time = baseline_timing_data["multipole_to_local"]["process_elapsed"] - if baseline_time is not None: - print(f"Baseline M2L time : {baseline_time:#.4g} s") - - opt_time = optimized_timing_data["multipole_to_local"]["process_elapsed"] - if opt_time is not None: - print(f"Optimized M2L time: {opt_time:#.4g} s") + baseline_pot = drive_fmm(actx, baseline_wrangler, (weights,)) + optimized_pot = drive_fmm(actx, optimized_wrangler, (weights,)) assert np.allclose(baseline_pot, optimized_pot, atol=1e-13, rtol=1e-13) From 5b14fe16c914ae641abce1b6823cee4aa21e78cb Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Fri, 19 Jul 2024 21:18:23 +0300 Subject: [PATCH 26/28] ruff: mark arraycontext as first-party --- boxtree/area_query.py | 2 +- boxtree/array_context.py | 2 +- boxtree/distributed/local_tree.py | 2 +- boxtree/distributed/partition.py | 2 +- boxtree/rotation_classes.py | 2 +- boxtree/translation_classes.py | 2 +- boxtree/traversal.py | 2 +- boxtree/tree.py | 2 +- pyproject.toml | 9 +++++---- test/test_cost_model.py | 1 + 
test/test_distributed.py | 1 + test/test_fmm.py | 1 + test/test_tools.py | 1 + test/test_traversal.py | 1 + test/test_tree.py | 1 + test/test_tree_of_boxes.py | 1 + 16 files changed, 20 insertions(+), 12 deletions(-) diff --git a/boxtree/area_query.py b/boxtree/area_query.py index 23f78ba2..0654b6f9 100644 --- a/boxtree/area_query.py +++ b/boxtree/area_query.py @@ -28,9 +28,9 @@ from functools import partial import numpy as np -from arraycontext import Array from mako.template import Template +from arraycontext import Array from pyopencl.elementwise import ElementwiseTemplate from pytools import ProcessLogger, memoize_method diff --git a/boxtree/array_context.py b/boxtree/array_context.py index 5fe85c5c..2b0779eb 100644 --- a/boxtree/array_context.py +++ b/boxtree/array_context.py @@ -21,6 +21,7 @@ """ import numpy as np + from arraycontext import ( # noqa: F401 PyOpenCLArrayContext as PyOpenCLArrayContextBase, deserialize_container, @@ -32,7 +33,6 @@ _PytestPyOpenCLArrayContextFactoryWithClass, register_pytest_array_context_factory, ) - from pyopencl.algorithm import BuiltList diff --git a/boxtree/distributed/local_tree.py b/boxtree/distributed/local_tree.py index 1a5bb1a6..b1852a83 100644 --- a/boxtree/distributed/local_tree.py +++ b/boxtree/distributed/local_tree.py @@ -26,9 +26,9 @@ from dataclasses import dataclass import numpy as np -from arraycontext import Array, ArrayOrContainer from mako.template import Template +from arraycontext import Array, ArrayOrContainer from pyopencl.elementwise import ElementwiseKernel from pyopencl.tools import dtype_to_ctype from pytools import memoize_method diff --git a/boxtree/distributed/partition.py b/boxtree/distributed/partition.py index 95f40037..32054142 100644 --- a/boxtree/distributed/partition.py +++ b/boxtree/distributed/partition.py @@ -24,9 +24,9 @@ from dataclasses import dataclass import numpy as np -from arraycontext import Array from mako.template import Template +from arraycontext import Array from 
pyopencl.elementwise import ElementwiseKernel from pyopencl.tools import dtype_to_ctype from pytools import memoize_method diff --git a/boxtree/rotation_classes.py b/boxtree/rotation_classes.py index 43e1b759..0ecd2c40 100644 --- a/boxtree/rotation_classes.py +++ b/boxtree/rotation_classes.py @@ -36,8 +36,8 @@ from dataclasses import dataclass import numpy as np -from arraycontext import Array +from arraycontext import Array from pytools import log_process from boxtree.array_context import PyOpenCLArrayContext, dataclass_array_container diff --git a/boxtree/translation_classes.py b/boxtree/translation_classes.py index 073a4a9c..301eb0d7 100644 --- a/boxtree/translation_classes.py +++ b/boxtree/translation_classes.py @@ -37,9 +37,9 @@ from functools import partial import numpy as np -from arraycontext import Array from mako.template import Template +from arraycontext import Array from pyopencl.elementwise import ElementwiseKernel, ElementwiseTemplate from pytools import memoize_method diff --git a/boxtree/traversal.py b/boxtree/traversal.py index 5bec4015..3e9b62b9 100644 --- a/boxtree/traversal.py +++ b/boxtree/traversal.py @@ -40,9 +40,9 @@ from functools import partial import numpy as np -from arraycontext import Array from mako.template import Template +from arraycontext import Array from pyopencl.algorithm import ListOfListsBuilder from pyopencl.elementwise import ElementwiseKernel, ElementwiseTemplate from pytools import ProcessLogger, log_process, memoize_method diff --git a/boxtree/tree.py b/boxtree/tree.py index f4a97d1f..a92379cc 100644 --- a/boxtree/tree.py +++ b/boxtree/tree.py @@ -81,8 +81,8 @@ from functools import cached_property import numpy as np -from arraycontext import Array +from arraycontext import Array from cgen import Enum from pytools import memoize_method diff --git a/pyproject.toml b/pyproject.toml index a9c478d8..08c7f279 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -96,13 +96,14 @@ multiline-quotes = "double" 
[tool.ruff.lint.isort] combine-as-imports = true known-first-party = [ - "pytools", - "pymbolic", + "arraycontext", + "cgen", "loopy", - "pyopencl", "meshmode", "modepy", - "cgen" + "pymbolic", + "pyopencl", + "pytools", ] known-local-folder = [ "boxtree", diff --git a/test/test_cost_model.py b/test/test_cost_model.py index ec5f1d75..bafa92fc 100644 --- a/test/test_cost_model.py +++ b/test/test_cost_model.py @@ -30,6 +30,7 @@ import numpy as np import pytest + from arraycontext import pytest_generate_tests_for_array_contexts from boxtree.array_context import PytestPyOpenCLArrayContextFactory, _acf # noqa: F401 diff --git a/test/test_distributed.py b/test/test_distributed.py index 5faa8dde..de97b4f4 100644 --- a/test/test_distributed.py +++ b/test/test_distributed.py @@ -27,6 +27,7 @@ import numpy as np import numpy.linalg as la import pytest + from arraycontext import pytest_generate_tests_for_array_contexts from boxtree.array_context import ( diff --git a/test/test_fmm.py b/test/test_fmm.py index 7e4d0896..7d9c7877 100644 --- a/test/test_fmm.py +++ b/test/test_fmm.py @@ -25,6 +25,7 @@ import numpy as np import numpy.linalg as la import pytest + from arraycontext import pytest_generate_tests_for_array_contexts from boxtree.array_context import ( diff --git a/test/test_tools.py b/test/test_tools.py index 16b65307..a19f465d 100644 --- a/test/test_tools.py +++ b/test/test_tools.py @@ -25,6 +25,7 @@ import numpy as np import pytest + from arraycontext import pytest_generate_tests_for_array_contexts from boxtree.array_context import ( diff --git a/test/test_traversal.py b/test/test_traversal.py index 6de51b36..e3b3f838 100644 --- a/test/test_traversal.py +++ b/test/test_traversal.py @@ -25,6 +25,7 @@ import numpy as np import numpy.linalg as la import pytest + from arraycontext import pytest_generate_tests_for_array_contexts from boxtree.array_context import ( diff --git a/test/test_tree.py b/test/test_tree.py index aa6076f3..59783642 100644 --- a/test/test_tree.py +++ 
b/test/test_tree.py @@ -25,6 +25,7 @@ import numpy as np import pytest + from arraycontext import pytest_generate_tests_for_array_contexts from boxtree.array_context import PytestPyOpenCLArrayContextFactory, _acf # noqa: F401 diff --git a/test/test_tree_of_boxes.py b/test/test_tree_of_boxes.py index b7e798a3..894fc9c7 100644 --- a/test/test_tree_of_boxes.py +++ b/test/test_tree_of_boxes.py @@ -25,6 +25,7 @@ import numpy as np import pytest + from arraycontext import pytest_generate_tests_for_array_contexts # This means boxtree's tests have a hard dependency on meshmode. That's OK. From 0daa49615d1cedea1b96b976f63a524b90ba969f Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Tue, 2 Aug 2022 10:23:55 +0300 Subject: [PATCH 27/28] point ci to modified downstreams --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index affa894c..3ec5ce20 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -89,8 +89,8 @@ jobs: curl -L -O https://tiker.net/ci-support-v0 . 
ci-support-v0 - if [[ "$DOWNSTREAM_PROJECT" == "pytential" && "$GITHUB_HEAD_REF" == "rename-nterms" ]]; then - DOWNSTREAM_PROJECT=https://github.com/gaohao95/pytential.git@rename-nterms + if [[ "$GITHUB_HEAD_REF" == "towards-array-context" ]]; then + DOWNSTREAM_PROJECT=https://github.com/alexfikl/${DOWNSTREAM_PROJECT}.git@towards-array-context fi test_downstream "$DOWNSTREAM_PROJECT" From 36edb032d7c6a38813528c6db0ed7fa2a5b9421c Mon Sep 17 00:00:00 2001 From: Alexandru Fikl Date: Fri, 19 Jul 2024 22:09:07 +0300 Subject: [PATCH 28/28] test: wrap meshmode array context --- boxtree/array_context.py | 73 +++++++++++++++++++++----------------- test/test_tree_of_boxes.py | 23 ++++++++++-- 2 files changed, 61 insertions(+), 35 deletions(-) diff --git a/boxtree/array_context.py b/boxtree/array_context.py index 2b0779eb..8fdce077 100644 --- a/boxtree/array_context.py +++ b/boxtree/array_context.py @@ -43,6 +43,42 @@ # {{{ array context +def _boxtree_rec_map_container(actx, func, array, allowed_types=None, *, + default_scalar=None, strict=False): + import arraycontext.impl.pyopencl.taggable_cl_array as tga + + if allowed_types is None: + allowed_types = (tga.TaggableCLArray,) + + def _wrapper(ary): + # NOTE: this is copied verbatim from arraycontext and this is the + # only change to allow optional fields inside containers + if ary is None: + return ary + + if isinstance(ary, allowed_types): + return func(ary) + elif not strict and isinstance(ary, actx.array_types): + from warnings import warn + warn(f"Invoking {type(actx).__name__}.{func.__name__[1:]} with " + f"{type(ary).__name__} will be unsupported in 2025. 
Use " + "'to_tagged_cl_array' to convert instances to TaggableCLArray.", + DeprecationWarning, stacklevel=2) + return func(tga.to_tagged_cl_array(ary)) + elif np.isscalar(ary): + if default_scalar is None: + return ary + else: + return np.array(ary).dtype.type(default_scalar) + else: + raise TypeError( + f"{type(actx).__name__}.{func.__name__[1:]} invoked with " + f"an unsupported array type: got '{type(ary).__name__}', " + f"but expected one of {allowed_types}") + + return rec_map_array_container(_wrapper, array) + + class PyOpenCLArrayContext(PyOpenCLArrayContextBase): def transform_loopy_program(self, t_unit): default_ep = t_unit.default_entrypoint @@ -61,38 +97,11 @@ def transform_loopy_program(self, t_unit): def _rec_map_container(self, func, array, allowed_types=None, *, default_scalar=None, strict=False): - import arraycontext.impl.pyopencl.taggable_cl_array as tga - - if allowed_types is None: - allowed_types = (tga.TaggableCLArray,) - - def _wrapper(ary): - # NOTE: this is copied verbatim from arraycontext and this is the - # only change to allow optional fields inside containers - if ary is None: - return ary - - if isinstance(ary, allowed_types): - return func(ary) - elif not strict and isinstance(ary, self.array_types): - from warnings import warn - warn(f"Invoking {type(self).__name__}.{func.__name__[1:]} with " - f"{type(ary).__name__} will be unsupported in 2025. 
Use " - "'to_tagged_cl_array' to convert instances to TaggableCLArray.", - DeprecationWarning, stacklevel=2) - return func(tga.to_tagged_cl_array(ary)) - elif np.isscalar(ary): - if default_scalar is None: - return ary - else: - return np.array(ary).dtype.type(default_scalar) - else: - raise TypeError( - f"{type(self).__name__}.{func.__name__[1:]} invoked with " - f"an unsupported array type: got '{type(ary).__name__}', " - f"but expected one of {allowed_types}") - - return rec_map_array_container(_wrapper, array) + return _boxtree_rec_map_container( + self, func, array, + allowed_types=allowed_types, + default_scalar=default_scalar, + strict=strict) # }}} diff --git a/test/test_tree_of_boxes.py b/test/test_tree_of_boxes.py index 894fc9c7..b26f9770 100644 --- a/test/test_tree_of_boxes.py +++ b/test/test_tree_of_boxes.py @@ -30,7 +30,10 @@ # This means boxtree's tests have a hard dependency on meshmode. That's OK. from meshmode import _acf # noqa: F401 -from meshmode.array_context import PytestPyOpenCLArrayContextFactory +from meshmode.array_context import ( + PyOpenCLArrayContext, + PytestPyOpenCLArrayContextFactory, +) from boxtree import ( make_meshmode_mesh_from_leaves, @@ -39,10 +42,24 @@ ) -logger = logging.getLogger(__name__) +class ArrayContext(PyOpenCLArrayContext): + def _rec_map_container(self, func, array, allowed_types=None, *, + default_scalar=None, strict=False): + from boxtree.array_context import _boxtree_rec_map_container + return _boxtree_rec_map_container( + self, func, array, + allowed_types=allowed_types, + default_scalar=default_scalar, + strict=strict) + +class ContextFactory(PytestPyOpenCLArrayContextFactory): + actx_class = ArrayContext + + +logger = logging.getLogger(__name__) pytest_generate_tests = pytest_generate_tests_for_array_contexts([ - PytestPyOpenCLArrayContextFactory, + ContextFactory, ])