From 6e3e42feefee092b698e730983435b140444d3df Mon Sep 17 00:00:00 2001
From: Alexandru Fikl <alexfikl@gmail.com>
Date: Mon, 20 Jun 2022 21:11:03 +0300
Subject: [PATCH 01/28] deprecate DeviceDataRecord

---
 boxtree/tools.py | 88 +++++++++++++++++++++++++++++++-----------------
 1 file changed, 57 insertions(+), 31 deletions(-)

diff --git a/boxtree/tools.py b/boxtree/tools.py
index 244c71eb..0af213d6 100644
--- a/boxtree/tools.py
+++ b/boxtree/tools.py
@@ -29,7 +29,6 @@
 
 import pyopencl as cl
 import pyopencl.array
-import pyopencl.cltypes as cltypes
 from pyopencl.tools import ScalarArg, VectorArg as _VectorArg, dtype_to_c_struct
 from pytools import Record, memoize_method
 from pytools.obj_array import make_obj_array
@@ -68,7 +67,7 @@ def realloc_array(queue, allocator, new_shape, ary, zero_fill=False, wait_for=No
 def reverse_index_array(indices, target_size=None, result_fill_value=None,
         queue=None):
     """For an array of *indices*, return a new array *result* that satisfies
-    ``result[indices] == arange(len(indices))
+    ``result[indices] == arange(len(indices))``
 
     :arg target_n: The length of the result, or *None* if the result is to
         have the same length as *indices*.
@@ -280,18 +279,17 @@ def particle_array_to_host(parray):
 # {{{ host/device data storage
 
 class DeviceDataRecord(Record):
-    """A record of array-type data. Some of this data may live in
-    :class:`pyopencl.array.Array` objects. :meth:`get` can then be
-    called to convert all these device arrays into :mod:`numpy.ndarray`
-    instances on the host.
+    """A record of array-type data.
+
+    Some of this data may live in :class:`pyopencl.array.Array` objects.
+    :meth:`get` can then be called to convert all these device arrays into
+    :mod:`numpy.ndarray` instances on the host.
     """
 
     def _transform_arrays(self, f, exclude_fields=frozenset()):
-        result = {}
-
         def transform_val(val):
             from pyopencl.algorithm import BuiltList
-            if isinstance(val, np.ndarray) and val.dtype == object:
+            if isinstance(val, np.ndarray) and val.dtype.char == "O":
                 from pytools.obj_array import obj_array_vectorize
                 return obj_array_vectorize(f, val)
             elif isinstance(val, list):
@@ -305,7 +303,17 @@ def transform_val(val):
             else:
                 return f(val)
 
-        for field_name in self.__class__.fields:
+        from dataclasses import fields, is_dataclass
+
+        if is_dataclass(self):
+            fields = [f.name for f in fields(self)]
+        elif isinstance(self, Record):
+            fields = self.__class__.fields
+        else:
+            raise TypeError(f"unknown record type: '{type(self).__name__}'")
+
+        result = {}
+        for field_name in fields:
             if field_name in exclude_fields:
                 continue
 
@@ -319,50 +327,61 @@ def transform_val(val):
         return self.copy(**result)
 
     def get(self, queue, **kwargs):
-        """Return a copy of `self` in which all data lives on the host, i.e.
-        all :class:`pyopencl.array.Array` and `ImmutableHostDeviceArray` objects are
-        replaced by corresponding :class:`numpy.ndarray` instances on the host.
         """
+        :returns: a copy of *self* in which all data lives on the host, i.e.
+            all :class:`pyopencl.array.Array` and `ImmutableHostDeviceArray`
+            objects are replaced by corresponding :class:`numpy.ndarray`
+            instances on the host.
+        """
+        from warnings import warn
+        warn(f"{type(self).__name__}.get is deprecated and will be removed "
+            "in 2025. Switch to using arraycontext.to_numpy instead.",
+            DeprecationWarning, stacklevel=2)
+
         def try_get(attr):
             if isinstance(attr, ImmutableHostDeviceArray):
                 return attr.host
 
             try:
-                get_meth = attr.get
+                return attr.get(queue=queue, **kwargs)
             except AttributeError:
                 return attr
 
-            return get_meth(queue=queue, **kwargs)
-
         return self._transform_arrays(try_get)
 
     def with_queue(self, queue):
-        """Return a copy of `self` in
-        all :class:`pyopencl.array.Array` objects are assigned to
-        :class:`pyopencl.CommandQueue` *queue*.
         """
+        :returns: a copy of *self* in which all :class:`pyopencl.array.Array`
+            objects are assigned to the :class:`pyopencl.CommandQueue` *queue*.
+        """
+        from warnings import warn
+        warn(f"{type(self).__name__}.with_queue is deprecated and will be removed "
+            "in 2025. Switch to using arraycontext.with_array_context instead.",
+            DeprecationWarning, stacklevel=2)
 
         def try_with_queue(attr):
             if isinstance(attr, cl.array.Array):
                 attr.finish()
 
             try:
-                wq_meth = attr.with_queue
+                return attr.with_queue(queue)
             except AttributeError:
                 return attr
 
-            ary = wq_meth(queue)
-            return ary
-
         return self._transform_arrays(try_with_queue)
 
     def to_device(self, queue, exclude_fields=frozenset()):
-        """Return a copy of `self` in all :class:`numpy.ndarray` arrays are
-        transferred to device memory as :class:`pyopencl.array.Array` objects.
+        """
+        :arg exclude_fields: a :class:`frozenset` containing fields excluded
+            from the transfer to device memory.
 
-        :arg exclude_fields: a :class:`frozenset` containing fields excluding from
-            transferring to the device memory.
+        :returns: a copy of *self* in which all :class:`numpy.ndarray` arrays are
+            transferred to device memory as :class:`pyopencl.array.Array` objects.
         """
+        from warnings import warn
+        warn(f"{type(self).__name__}.to_device is deprecated and will be removed "
+            "in 2025. Switch to using arraycontext.from_numpy instead.",
+            DeprecationWarning, stacklevel=2)
 
         def _to_device(attr):
             if isinstance(attr, np.ndarray):
@@ -377,12 +396,18 @@ def _to_device(attr):
         return self._transform_arrays(_to_device, exclude_fields=exclude_fields)
 
     def to_host_device_array(self, queue, exclude_fields=frozenset()):
-        """Return a copy of `self` where all device and host arrays are transformed
-        to `ImmutableHostDeviceArray` objects.
+        """
+        :arg exclude_fields: a :class:`frozenset` containing fields excluded
+            from being transformed to `ImmutableHostDeviceArray`.
 
-        :arg exclude_fields: a :class:`frozenset` containing fields excluding from
-            transformed to `ImmutableHostDeviceArray`.
+        :returns: a copy of *self* where all device and host arrays are
+            transformed to `ImmutableHostDeviceArray` objects.
         """
+        from warnings import warn
+        warn(f"{type(self).__name__}.to_host_device_array is deprecated and will "
+            "be removed in 2025. Switch from ImmutableHostDeviceArray.",
+            DeprecationWarning, stacklevel=2)
+
         def _to_host_device_array(attr):
             if isinstance(attr, np.ndarray | cl.array.Array):
                 return ImmutableHostDeviceArray(queue, attr)
@@ -930,6 +955,7 @@ def device(self):
 
 def get_coord_vec_dtype(
         coord_dtype: np.dtype, dimensions: int) -> np.dtype:
+    import pyopencl.cltypes as cltypes
     if dimensions == 1:
         return coord_dtype
     else:

From be48b2219dc54e0ef7c0d49966dbf97939131dc7 Mon Sep 17 00:00:00 2001
From: Alexandru Fikl <alexfikl@gmail.com>
Date: Mon, 20 Jun 2022 21:42:34 +0300
Subject: [PATCH 02/28] port translation_classes to arraycontext

---
 boxtree/translation_classes.py | 107 ++++++++++++++++++---------------
 1 file changed, 58 insertions(+), 49 deletions(-)

diff --git a/boxtree/translation_classes.py b/boxtree/translation_classes.py
index 21035f04..073a4a9c 100644
--- a/boxtree/translation_classes.py
+++ b/boxtree/translation_classes.py
@@ -33,24 +33,23 @@
 """
 
 import logging
+from dataclasses import dataclass
 from functools import partial
 
 import numpy as np
+from arraycontext import Array
 from mako.template import Template
 
-import pyopencl as cl
-import pyopencl.array
-import pyopencl.cltypes
-from pyopencl.elementwise import ElementwiseTemplate
-from pytools import Record, memoize_method
+from pyopencl.elementwise import ElementwiseKernel, ElementwiseTemplate
+from pytools import memoize_method
 
+from boxtree.array_context import PyOpenCLArrayContext, dataclass_array_container
 from boxtree.tools import (
-    DeviceDataRecord,
     InlineBinarySearch,
     coord_vec_subscript_code,
     get_coord_vec_dtype,
 )
-from boxtree.traversal import TRAVERSAL_PREAMBLE_MAKO_DEFS
+from boxtree.traversal import TRAVERSAL_PREAMBLE_MAKO_DEFS, FMMTraversalInfo
 
 
 logger = logging.getLogger(__name__)
@@ -184,11 +183,14 @@
     """)
 
 
-class _KernelInfo(Record):
-    pass
+@dataclass(frozen=True)
+class _KernelInfo:
+    translation_class_finder: ElementwiseKernel
 
 
-class TranslationClassesInfo(DeviceDataRecord):
+@dataclass_array_container
+@dataclass(frozen=True)
+class TranslationClassesInfo:
     r"""Interaction lists to help with for translations that benefit from
     precomputing distance related values
 
@@ -225,13 +227,10 @@ class id for that level. Translation classes are numbered contiguously
         traversal that these translation classes refer to.
     """
 
-    def __init__(self, traversal, **kwargs):
-        super().__init__(**kwargs)
-        self.traversal = traversal
-
-    def copy(self, **kwargs):
-        traversal = kwargs.pop("traversal", self.traversal)
-        return self.__class__(traversal=traversal, **self.get_copy_kwargs(**kwargs))
+    traversal: FMMTraversalInfo
+    from_sep_siblings_translation_classes: Array
+    from_sep_siblings_translation_class_to_distance_vector: Array
+    from_sep_siblings_translation_classes_level_starts: Array
 
     @property
     def nfrom_sep_siblings_translation_classes(self):
@@ -245,12 +244,21 @@ class TranslationClassesBuilder:
     .. automethod:: __call__
     """
 
-    def __init__(self, context):
-        self.context = context
+    def __init__(self, array_context: PyOpenCLArrayContext) -> None:
+        self._setup_actx = array_context
+
+    @property
+    def context(self):
+        return self._setup_actx.queue.context
 
     @memoize_method
-    def get_kernel_info(self, dimensions, well_sep_is_n_away,
-            box_id_dtype, box_level_dtype, coord_dtype, translation_class_per_level):
+    def get_kernel_info(self,
+            dimensions: int,
+            well_sep_is_n_away: int,
+            box_id_dtype: np.dtype,
+            box_level_dtype: np.dtype,
+            coord_dtype: np.dtype,
+            translation_class_per_level) -> _KernelInfo:
         coord_vec_dtype = get_coord_vec_dtype(coord_dtype, dimensions)
         int_coord_vec_dtype = get_coord_vec_dtype(np.dtype(np.int32), dimensions)
 
@@ -287,11 +295,13 @@ def get_kernel_info(self, dimensions, well_sep_is_n_away,
         return _KernelInfo(translation_class_finder=translation_class_finder)
 
     @staticmethod
-    def ntranslation_classes_per_level(well_sep_is_n_away, dimensions):
+    def ntranslation_classes_per_level(
+            well_sep_is_n_away: int, dimensions: int) -> int:
         return (4 * well_sep_is_n_away + 3) ** dimensions
 
-    def translation_class_to_normalized_vector(self, well_sep_is_n_away,
-            dimensions, cls):
+    def translation_class_to_normalized_vector(
+            self, well_sep_is_n_away: int, dimensions: int, cls: int
+            ) -> np.ndarray:
         # This computes the vector for the translation class, using the inverse
         # of the formula found in get_translation_class() defined in
         # TRANSLATION_CLASS_FINDER_PREAMBLE_TEMPLATE.
@@ -303,13 +313,15 @@ def translation_class_to_normalized_vector(self, well_sep_is_n_away,
         for i in range(dimensions):
             result[i] = cls % base - shift
             cls //= base
+
         return result
 
-    def compute_translation_classes(self, queue, trav, tree, wait_for,
+    def compute_translation_classes(self,
+            actx: PyOpenCLArrayContext, trav, tree, wait_for,
             is_translation_per_level):
         """
-        Returns a tuple *evt*,  *translation_class_is_used* and
-        *translation_classes_lists*.
+        :returns: a :class:`tuple` containing *evt*, *translation_class_is_used*
+            and *translation_classes_lists*.
         """
 
         # {{{ compute translation classes for list 2
@@ -328,14 +340,11 @@ def compute_translation_classes(self, queue, trav, tree, wait_for,
         if is_translation_per_level:
             ntranslation_classes = ntranslation_classes * tree.nlevels
 
-        translation_classes_lists = cl.array.empty(
-                queue, len(trav.from_sep_siblings_lists), dtype=np.int32)
-
-        translation_class_is_used = cl.array.zeros(
-                queue, ntranslation_classes, dtype=np.int32)
-
-        error_flag = cl.array.zeros(queue, 1, dtype=np.int32)
+        translation_classes_lists = actx.np.zeros(
+            len(trav.from_sep_siblings_lists), dtype=np.int32)
+        translation_class_is_used = actx.zeros(ntranslation_classes, dtype=np.int32)
 
+        error_flag = actx.zeros(1, dtype=np.int32)
         evt = knl_info.translation_class_finder(
                 trav.from_sep_siblings_lists,
                 trav.from_sep_siblings_starts,
@@ -349,9 +358,10 @@ def compute_translation_classes(self, queue, trav, tree, wait_for,
                 translation_classes_lists,
                 translation_class_is_used,
                 error_flag,
-                queue=queue, wait_for=wait_for)
+                queue=actx.queue,
+                wait_for=wait_for)
 
-        if (error_flag.get()):
+        if actx.to_numpy(error_flag)[0]:
             raise ValueError("could not compute translation classes")
 
         return (evt, translation_class_is_used, translation_classes_lists)
@@ -359,13 +369,13 @@ def compute_translation_classes(self, queue, trav, tree, wait_for,
         # }}}
 
     @log_process(logger, "build m2l translation classes")
-    def __call__(self, queue, trav, tree, wait_for=None,
-                 is_translation_per_level=True):
+    def __call__(self, actx: PyOpenCLArrayContext,
+            trav, tree, wait_for=None, is_translation_per_level=True):
         """Returns a pair *info*, *evt* where info is a
         :class:`TranslationClassesInfo`.
         """
         evt, translation_class_is_used, translation_classes_lists = \
-            self.compute_translation_classes(queue, trav, tree, wait_for,
+            self.compute_translation_classes(actx, trav, tree, wait_for,
                                              is_translation_per_level)
 
         well_sep_is_n_away = trav.well_sep_is_n_away
@@ -385,7 +395,7 @@ def __call__(self, queue, trav, tree, wait_for=None,
         prev_level = -1
         from_sep_siblings_translation_classes_level_starts = \
             np.empty(nlevels+1, dtype=np.int32)
-        for i, used in enumerate(translation_class_is_used.get()):
+        for i, used in enumerate(actx.to_numpy(translation_class_is_used)):
             cls_without_level = i % num_translation_classes
             level = i // num_translation_classes
             if (prev_level != level):
@@ -403,14 +413,13 @@ def __call__(self, queue, trav, tree, wait_for=None,
 
         from_sep_siblings_translation_classes_level_starts[nlevels] = count
 
-        translation_classes_lists = (
-                cl.array.take(
-                    cl.array.to_device(queue, used_translation_classes_map),
-                    translation_classes_lists))
+        translation_classes_lists = actx.from_numpy(
+            used_translation_classes_map
+            )[translation_classes_lists]
 
-        distances = cl.array.to_device(queue, distances)
-        from_sep_siblings_translation_classes_level_starts = cl.array.to_device(
-            queue, from_sep_siblings_translation_classes_level_starts)
+        distances = actx.from_numpy(distances)
+        from_sep_siblings_translation_classes_level_starts = actx.from_numpy(
+            from_sep_siblings_translation_classes_level_starts)
 
         info = TranslationClassesInfo(
                 traversal=trav,
@@ -418,9 +427,9 @@ def __call__(self, queue, trav, tree, wait_for=None,
                 from_sep_siblings_translation_class_to_distance_vector=distances,
                 from_sep_siblings_translation_classes_level_starts=(
                     from_sep_siblings_translation_classes_level_starts),
-                ).with_queue(None)
+                )
 
-        return info, evt
+        return actx.freeze(info), evt
 
 # }}}
 

From 959d3a7706cd1f42faf5c1e9090c95db36e70df1 Mon Sep 17 00:00:00 2001
From: Alexandru Fikl <alexfikl@gmail.com>
Date: Mon, 20 Jun 2022 21:42:49 +0300
Subject: [PATCH 03/28] port rotation_classes to arraycontext

---
 boxtree/rotation_classes.py | 57 +++++++++++++++++++------------------
 1 file changed, 30 insertions(+), 27 deletions(-)

diff --git a/boxtree/rotation_classes.py b/boxtree/rotation_classes.py
index 22c2b6b5..43e1b759 100644
--- a/boxtree/rotation_classes.py
+++ b/boxtree/rotation_classes.py
@@ -33,24 +33,25 @@
 """
 
 import logging
+from dataclasses import dataclass
 
 import numpy as np
+from arraycontext import Array
 
-import pyopencl as cl
-import pyopencl.array
+from pytools import log_process
 
-from boxtree.tools import DeviceDataRecord
+from boxtree.array_context import PyOpenCLArrayContext, dataclass_array_container
 from boxtree.translation_classes import TranslationClassesBuilder
 
 
 logger = logging.getLogger(__name__)
 
-from pytools import log_process
-
 
 # {{{ rotation classes builder
 
-class RotationClassesInfo(DeviceDataRecord):
+@dataclass_array_container
+@dataclass(frozen=True)
+class RotationClassesInfo:
     r"""Interaction lists to help with matrix precomputations for rotation-based
     translations ("point and shoot").
 
@@ -75,6 +76,9 @@ class RotationClassesInfo(DeviceDataRecord):
 
     """
 
+    from_sep_siblings_rotation_classes: Array
+    from_sep_siblings_rotation_class_to_angle: Array
+
     @property
     def nfrom_sep_siblings_rotation_classes(self):
         return len(self.from_sep_siblings_rotation_class_to_angle)
@@ -87,25 +91,24 @@ class RotationClassesBuilder:
     .. automethod:: __call__
     """
 
-    def __init__(self, context):
-        self.context = context
-        self.tcb = TranslationClassesBuilder(context)
+    def __init__(self, array_context: PyOpenCLArrayContext):
+        self._setup_actx = array_context
+        self.tcb = TranslationClassesBuilder(array_context)
 
     @staticmethod
-    def vec_gcd(vec):
+    def vec_gcd(vec) -> int:
         """Return the GCD of a list of integers."""
-        def gcd(a, b):
-            while b:
-                a, b = b, a % b
-            return a
+        import math
 
+        # TODO: math.gcd accepts multiple integer arguments in Python >= 3.9
         result = abs(vec[0])
         for elem in vec[1:]:
-            result = gcd(result, abs(elem))
+            result = math.gcd(result, abs(elem))
+
         return result
 
     def compute_rotation_classes(self,
-            well_sep_is_n_away, dimensions, used_translation_classes):
+            well_sep_is_n_away: int, dimensions: int, used_translation_classes):
         """Convert translation classes to a list of rotation classes and angles."""
         angle_to_rot_class = {}
         angles = []
@@ -154,11 +157,11 @@ def compute_rotation_classes(self,
         return translation_class_to_rot_class, angles
 
     @log_process(logger, "build m2l rotation classes")
-    def __call__(self, queue, trav, tree, wait_for=None):
+    def __call__(self, actx, trav, tree, wait_for=None):
         """Returns a pair *info*, *evt* where info is a :class:`RotationClassesInfo`.
         """
         evt, translation_class_is_used, translation_classes_lists = \
-            self.tcb.compute_translation_classes(queue, trav, tree, wait_for, False)
+            self.tcb.compute_translation_classes(actx, trav, tree, wait_for, False)
 
         d = tree.dimensions
         n = trav.well_sep_is_n_away
@@ -166,7 +169,7 @@ def __call__(self, queue, trav, tree, wait_for=None):
         # convert translation classes to rotation classes
 
         used_translation_classes = (
-                np.flatnonzero(translation_class_is_used.get()))
+                np.flatnonzero(actx.to_numpy(translation_class_is_used)))
 
         translation_class_to_rotation_class, rotation_angles = (
                 self.compute_rotation_classes(n, d, used_translation_classes))
@@ -176,17 +179,17 @@ def __call__(self, queue, trav, tree, wait_for=None):
         # positions for list 2 boxes.
         assert len(rotation_angles) <= 2**(d-1) * (2*n+1)**d
 
-        rotation_classes_lists = (
-                cl.array.take(
-                    cl.array.to_device(queue, translation_class_to_rotation_class),
-                    translation_classes_lists))
-
-        rotation_angles = cl.array.to_device(queue, np.array(rotation_angles))
+        rotation_classes_lists = actx.from_numpy(
+            translation_class_to_rotation_class
+            )[translation_classes_lists]
+        rotation_angles = actx.from_numpy(np.array(rotation_angles))
 
-        return RotationClassesInfo(
+        info = RotationClassesInfo(
                 from_sep_siblings_rotation_classes=rotation_classes_lists,
                 from_sep_siblings_rotation_class_to_angle=rotation_angles,
-                ).with_queue(None), evt
+                )
+
+        return actx.freeze(info), evt
 
 # }}}
 

From bac678dad9f83c0f9b94bb2508aed042e0b9d244 Mon Sep 17 00:00:00 2001
From: Alexandru Fikl <alexfikl@gmail.com>
Date: Mon, 20 Jun 2022 21:58:11 +0300
Subject: [PATCH 04/28] port area_query to arraycontext

---
 boxtree/area_query.py | 214 ++++++++++++++++++++++--------------------
 1 file changed, 111 insertions(+), 103 deletions(-)

diff --git a/boxtree/area_query.py b/boxtree/area_query.py
index 6804afd1..23f78ba2 100644
--- a/boxtree/area_query.py
+++ b/boxtree/area_query.py
@@ -24,22 +24,23 @@
 
 
 import logging
+from dataclasses import dataclass
 from functools import partial
 
 import numpy as np
+from arraycontext import Array
 from mako.template import Template
 
-import pyopencl as cl
-import pyopencl.array
-import pyopencl.cltypes
+from pyopencl.elementwise import ElementwiseTemplate
 from pytools import ProcessLogger, memoize_method
 
+from boxtree.array_context import PyOpenCLArrayContext, dataclass_array_container
 from boxtree.tools import (
-    AXIS_NAMES,
-    DeviceDataRecord,
+    InlineBinarySearch,
     coord_vec_subscript_code,
     get_coord_vec_dtype,
 )
+from boxtree.tree import Tree
 
 
 logger = logging.getLogger(__name__)
@@ -82,7 +83,9 @@
 
 # {{{ output
 
-class PeerListLookup(DeviceDataRecord):
+@dataclass_array_container
+@dataclass(frozen=True)
+class PeerListLookup:
     """
     .. attribute:: tree
 
@@ -96,13 +99,17 @@ class PeerListLookup(DeviceDataRecord):
 
     .. attribute:: peer_lists
 
-    .. automethod:: get
-
     .. versionadded:: 2016.1
     """
 
+    tree: Tree
+    peer_list_starts: Array
+    peer_lists: Array
+
 
-class AreaQueryResult(DeviceDataRecord):
+@dataclass_array_container
+@dataclass(frozen=True)
+class AreaQueryResult:
     """
     .. attribute:: tree
 
@@ -117,13 +124,17 @@ class AreaQueryResult(DeviceDataRecord):
 
     .. attribute:: leaves_near_ball_lists
 
-    .. automethod:: get
-
     .. versionadded:: 2016.1
     """
 
+    tree: Tree
+    leaves_near_ball_starts: Array
+    leaves_near_ball_lists: Array
+
 
-class LeavesToBallsLookup(DeviceDataRecord):
+@dataclass_array_container
+@dataclass(frozen=True)
+class LeavesToBallsLookup:
     """
     .. attribute:: tree
 
@@ -140,10 +151,12 @@ class LeavesToBallsLookup(DeviceDataRecord):
             this list is indexed by the global box index.
 
     .. attribute:: balls_near_box_lists
-
-    .. automethod:: get
     """
 
+    tree: Tree
+    balls_near_box_starts: Array
+    balls_near_box_lists: Array
+
 # }}}
 
 
@@ -454,12 +467,6 @@ class LeavesToBallsLookup(DeviceDataRecord):
 
 """
 
-
-from pyopencl.elementwise import ElementwiseTemplate
-
-from boxtree.tools import InlineBinarySearch
-
-
 STARTS_EXPANDER_TEMPLATE = ElementwiseTemplate(
     arguments=r"""
         idx_t *dst,
@@ -546,6 +553,7 @@ def generate(self, context,
         from pyopencl.tools import dtype_to_ctype
 
         from boxtree import box_flags_enum
+        from boxtree.tools import AXIS_NAMES
         from boxtree.traversal import TRAVERSAL_PREAMBLE_TYPEDEFS_AND_DEFINES
         from boxtree.tree_build import TreeBuilder
         render_vars = (
@@ -648,9 +656,13 @@ class AreaQueryBuilder:
     .. automethod:: __init__
     .. automethod:: __call__
     """
-    def __init__(self, context):
-        self.context = context
-        self.peer_list_finder = PeerListFinder(self.context)
+    def __init__(self, array_context: PyOpenCLArrayContext):
+        self._setup_actx = array_context
+        self.peer_list_finder = PeerListFinder(array_context)
+
+    @property
+    def context(self):
+        return self._setup_actx.queue.context
 
     # {{{ Kernel generation
 
@@ -660,12 +672,12 @@ def get_area_query_kernel(self, dimensions, coord_dtype, box_id_dtype,
         from pyopencl.tools import dtype_to_ctype
 
         from boxtree import box_flags_enum
-
-        logger.debug("start building area query kernel")
-
+        from boxtree.tools import AXIS_NAMES
         from boxtree.traversal import TRAVERSAL_PREAMBLE_TEMPLATE
         from boxtree.tree_build import TreeBuilder
 
+        logger.debug("start building area query kernel")
+
         template = Template(
             TRAVERSAL_PREAMBLE_TEMPLATE
             + AREA_QUERY_TEMPLATE,
@@ -722,20 +734,14 @@ def get_area_query_kernel(self, dimensions, coord_dtype, box_id_dtype,
 
     # }}}
 
-    def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None,
+    def __call__(self, actx: PyOpenCLArrayContext, tree: Tree,
+                 ball_centers, ball_radii, peer_lists=None,
                  wait_for=None):
         """
-        :arg queue: a :class:`pyopencl.CommandQueue`
-        :arg tree: a :class:`boxtree.Tree`.
-        :arg ball_centers: an object array of coordinate
-            :class:`pyopencl.array.Array` instances.
-            Their *dtype* must match *tree*'s
-            :attr:`boxtree.Tree.coord_dtype`.
-        :arg ball_radii: a
-            :class:`pyopencl.array.Array`
-            of positive numbers.
-            Its *dtype* must match *tree*'s
-            :attr:`boxtree.Tree.coord_dtype`.
+        :arg ball_centers: an object array of coordinates. Their *dtype* must
+            match *tree*'s :attr:`boxtree.Tree.coord_dtype`.
+        :arg ball_radii: an array of positive numbers. Its *dtype* must match
+            *tree*'s :attr:`boxtree.Tree.coord_dtype`.
         :arg peer_lists: may either be *None* or an instance of
             :class:`PeerListLookup` associated with `tree`.
         :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
@@ -760,7 +766,7 @@ def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None,
         max_levels = div_ceil(tree.nlevels, 10) * 10
 
         if peer_lists is None:
-            peer_lists, evt = self.peer_list_finder(queue, tree, wait_for=wait_for)
+            peer_lists, evt = self.peer_list_finder(actx, tree, wait_for=wait_for)
             wait_for = [evt]
 
         if len(peer_lists.peer_list_starts) != tree.nboxes + 1:
@@ -773,7 +779,7 @@ def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None,
         aq_plog = ProcessLogger(logger, "area query")
 
         result, evt = area_query_kernel(
-                queue, len(ball_radii),
+                actx.queue, len(ball_radii),
                 tree.box_centers.data, tree.root_extent,
                 tree.box_levels, tree.aligned_nboxes,
                 tree.box_child_ids.data, tree.box_flags,
@@ -785,10 +791,12 @@ def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None,
 
         aq_plog.done()
 
-        return AreaQueryResult(
+        result = AreaQueryResult(
                 tree=tree,
                 leaves_near_ball_starts=result["leaves"].starts,
-                leaves_near_ball_lists=result["leaves"].lists).with_queue(None), evt
+                leaves_near_ball_lists=result["leaves"].lists)
+
+        return actx.freeze(result), evt
 
 # }}}
 
@@ -803,12 +811,16 @@ class LeavesToBallsLookupBuilder:
     .. automethod:: __call__
 
     """
-    def __init__(self, context):
-        self.context = context
-
+    def __init__(self, array_context: PyOpenCLArrayContext):
         from pyopencl.algorithm import KeyValueSorter
-        self.key_value_sorter = KeyValueSorter(context)
-        self.area_query_builder = AreaQueryBuilder(context)
+
+        self._setup_actx = array_context
+        self.key_value_sorter = KeyValueSorter(self.context)
+        self.area_query_builder = AreaQueryBuilder(array_context)
+
+    @property
+    def context(self):
+        return self._setup_actx.queue.context
 
     @memoize_method
     def get_starts_expander_kernel(self, idx_dtype):
@@ -823,20 +835,14 @@ def get_starts_expander_kernel(self, idx_dtype):
                 self.context,
                 type_aliases=(("idx_t", idx_dtype),))
 
-    def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None,
+    def __call__(self, actx: PyOpenCLArrayContext, tree: Tree,
+                 ball_centers, ball_radii, peer_lists=None,
                  wait_for=None):
         """
-        :arg queue: a :class:`pyopencl.CommandQueue`
-        :arg tree: a :class:`boxtree.Tree`.
-        :arg ball_centers: an object array of coordinate
-            :class:`pyopencl.array.Array` instances.
-            Their *dtype* must match *tree*'s
-            :attr:`boxtree.Tree.coord_dtype`.
-        :arg ball_radii: a
-            :class:`pyopencl.array.Array`
-            of positive numbers.
-            Its *dtype* must match *tree*'s
-            :attr:`boxtree.Tree.coord_dtype`.
+        :arg ball_centers: an object array of coordinates. Their *dtype* must
+            match *tree*'s :attr:`boxtree.Tree.coord_dtype`.
+        :arg ball_radii: an array of positive numbers. Its *dtype* must match
+            *tree*'s :attr:`boxtree.Tree.coord_dtype`.
         :arg peer_lists: may either be *None* or an instance of
             :class:`PeerListLookup` associated with `tree`.
         :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
@@ -856,7 +862,7 @@ def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None,
         ltb_plog = ProcessLogger(logger, "leaves-to-balls lookup: run area query")
 
         area_query, evt = self.area_query_builder(
-                queue, tree, ball_centers, ball_radii, peer_lists, wait_for)
+                actx, tree, ball_centers, ball_radii, peer_lists, wait_for)
         wait_for = [evt]
 
         logger.debug("leaves-to-balls lookup: expand starts")
@@ -873,11 +879,11 @@ def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None,
         # 2. Key-value sort the (ball number, box number) pairs by box number.
 
         starts_expander_knl = self.get_starts_expander_kernel(tree.box_id_dtype)
-        expanded_starts = cl.array.empty(
-                queue, len(area_query.leaves_near_ball_lists), tree.box_id_dtype)
+        expanded_starts = actx.np.zeros(
+                len(area_query.leaves_near_ball_lists), tree.box_id_dtype)
         evt = starts_expander_knl(
                 expanded_starts,
-                area_query.leaves_near_ball_starts.with_queue(queue),
+                area_query.leaves_near_ball_starts,
                 nballs_p_1)
         wait_for = [evt]
 
@@ -885,20 +891,21 @@ def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None,
 
         balls_near_box_starts, balls_near_box_lists, evt \
                 = self.key_value_sorter(
-                        queue,
+                        actx.queue,
                         # keys
-                        area_query.leaves_near_ball_lists.with_queue(queue),
+                        area_query.leaves_near_ball_lists,
                         # values
                         expanded_starts,
                         nkeys, starts_dtype=tree.box_id_dtype,
                         wait_for=wait_for)
-
         ltb_plog.done()
 
-        return LeavesToBallsLookup(
+        lookup = LeavesToBallsLookup(
                 tree=tree,
                 balls_near_box_starts=balls_near_box_starts,
-                balls_near_box_lists=balls_near_box_lists).with_queue(None), evt
+                balls_near_box_lists=balls_near_box_lists)
+
+        return actx.freeze(lookup), evt
 
 # }}}
 
@@ -927,9 +934,13 @@ class SpaceInvaderQueryBuilder:
     .. automethod:: __call__
 
     """
-    def __init__(self, context):
-        self.context = context
-        self.peer_list_finder = PeerListFinder(self.context)
+    def __init__(self, array_context: PyOpenCLArrayContext) -> None:
+        self._setup_actx = array_context
+        self.peer_list_finder = PeerListFinder(array_context)
+
+    @property
+    def context(self):
+        return self._setup_actx.queue.context
 
     # {{{ Kernel generation
 
@@ -946,30 +957,23 @@ def get_space_invader_query_kernel(self, dimensions, coord_dtype,
 
     # }}}
 
-    def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None,
+    def __call__(self, actx: PyOpenCLArrayContext, tree: Tree,
+                 ball_centers, ball_radii, peer_lists=None,
                  wait_for=None):
         """
-        :arg queue: a :class:`pyopencl.CommandQueue`
-        :arg tree: a :class:`boxtree.Tree`.
-        :arg ball_centers: an object array of coordinate
-            :class:`pyopencl.array.Array` instances.
-            Their *dtype* must match *tree*'s
-            :attr:`boxtree.Tree.coord_dtype`.
-        :arg ball_radii: a
-            :class:`pyopencl.array.Array`
-            of positive numbers.
-            Its *dtype* must match *tree*'s
-            :attr:`boxtree.Tree.coord_dtype`.
+        :arg ball_centers: an object array of coordinates. Their *dtype* must
+            match *tree*'s :attr:`boxtree.Tree.coord_dtype`.
+        :arg ball_radii: an array of positive numbers. Its *dtype* must match
+            *tree*'s :attr:`boxtree.Tree.coord_dtype`.
         :arg peer_lists: may either be *None* or an instance of
-            :class:`PeerListLookup` associated with `tree`.
+            :class:`PeerListLookup` associated with *tree*.
         :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
             instances for whose completion this command waits before starting
             execution.
-        :returns: a tuple *(sqi, event)*, where *sqi* is an instance of
-            :class:`pyopencl.array.Array`, and *event* is a :class:`pyopencl.Event`
-            for dependency management. The *dtype* of *sqi* is
-            *tree*'s :attr:`boxtree.Tree.coord_dtype` and its shape is
-            *(tree.nboxes,)* (see :attr:`boxtree.Tree.nboxes`).
+        :returns: a tuple *(sqi, event)*, where *sqi* is an array and *event*
+            is a :class:`pyopencl.Event` for dependency management. The *dtype*
+            of *sqi* is *tree*'s :attr:`boxtree.Tree.coord_dtype` and its shape
+            is *(tree.nboxes,)* (see :attr:`boxtree.Tree.nboxes`).
             The entries of *sqi* are indexed by the global box index and are
             as follows:
 
@@ -990,7 +994,7 @@ def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None,
         max_levels = div_ceil(tree.nlevels, 10) * 10
 
         if peer_lists is None:
-            peer_lists, evt = self.peer_list_finder(queue, tree, wait_for=wait_for)
+            peer_lists, evt = self.peer_list_finder(actx, tree, wait_for=wait_for)
             wait_for = [evt]
 
         if len(peer_lists.peer_list_starts) != tree.nboxes + 1:
@@ -1002,7 +1006,7 @@ def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None,
 
         si_plog = ProcessLogger(logger, "space invader query")
 
-        outer_space_invader_dists = cl.array.zeros(queue, tree.nboxes, np.float32)
+        outer_space_invader_dists = actx.np.zeros(tree.nboxes, np.float32)
         if not wait_for:
             wait_for = []
         wait_for = (wait_for
@@ -1017,7 +1021,7 @@ def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None,
                     outer_space_invader_dists,
                     *tuple(bc for bc in ball_centers)),
                 wait_for=wait_for,
-                queue=queue,
+                queue=actx.queue,
                 range=slice(len(ball_radii)))
 
         if tree.coord_dtype != np.dtype(np.float32):
@@ -1062,8 +1066,12 @@ class PeerListFinder:
     .. automethod:: __call__
     """
 
-    def __init__(self, context):
-        self.context = context
+    def __init__(self, array_context: PyOpenCLArrayContext):
+        self._setup_actx = array_context
+
+    @property
+    def context(self):
+        return self._setup_actx.queue.context
 
     # {{{ Kernel generation
 
@@ -1073,14 +1081,14 @@ def get_peer_list_finder_kernel(self, dimensions, coord_dtype,
         from pyopencl.tools import dtype_to_ctype
 
         from boxtree import box_flags_enum
-
-        logger.debug("start building peer list finder kernel")
-
+        from boxtree.tools import AXIS_NAMES
         from boxtree.traversal import (
             HELPER_FUNCTION_TEMPLATE,
             TRAVERSAL_PREAMBLE_TEMPLATE,
         )
 
+        logger.debug("start building peer list finder kernel")
+
         template = Template(
             TRAVERSAL_PREAMBLE_TEMPLATE
             + HELPER_FUNCTION_TEMPLATE
@@ -1130,10 +1138,8 @@ def get_peer_list_finder_kernel(self, dimensions, coord_dtype,
 
     # }}}
 
-    def __call__(self, queue, tree, wait_for=None):
+    def __call__(self, actx: PyOpenCLArrayContext, tree: Tree, wait_for=None):
         """
-        :arg queue: a :class:`pyopencl.CommandQueue`
-        :arg tree: a :class:`boxtree.Tree`.
         :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
             instances for whose completion this command waits before starting
             execution.
@@ -1153,7 +1159,7 @@ def __call__(self, queue, tree, wait_for=None):
         pl_plog = ProcessLogger(logger, "find peer lists")
 
         result, evt = peer_list_finder_kernel(
-                queue, tree.nboxes,
+                actx.queue, tree.nboxes,
                 tree.box_centers.data, tree.root_extent,
                 tree.box_levels, tree.aligned_nboxes,
                 tree.box_child_ids.data, tree.box_flags,
@@ -1161,10 +1167,12 @@ def __call__(self, queue, tree, wait_for=None):
 
         pl_plog.done()
 
-        return PeerListLookup(
+        lookup = PeerListLookup(
                 tree=tree,
                 peer_list_starts=result["peers"].starts,
-                peer_lists=result["peers"].lists).with_queue(None), evt
+                peer_lists=result["peers"].lists)
+
+        return actx.freeze(lookup), evt
 
 # }}}
 

From ca41e31afd5d5055e4ea18ee5c324c04ec4f1883 Mon Sep 17 00:00:00 2001
From: Alexandru Fikl <alexfikl@gmail.com>
Date: Tue, 21 Jun 2022 20:06:48 +0300
Subject: [PATCH 05/28] port bounding_box to array_context

---
 boxtree/bounding_box.py | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/boxtree/bounding_box.py b/boxtree/bounding_box.py
index 77cc51ce..804820a6 100644
--- a/boxtree/bounding_box.py
+++ b/boxtree/bounding_box.py
@@ -20,13 +20,12 @@
 THE SOFTWARE.
 """
 
-
 import numpy as np
 
-import pyopencl as cl  # noqa
 from pyopencl.reduction import ReductionTemplate
 from pytools import memoize, memoize_method
 
+from boxtree.array_context import PyOpenCLArrayContext
 from boxtree.tools import get_type_moniker
 
 
@@ -121,17 +120,22 @@ def make_bounding_box_dtype(device, dimensions, coord_dtype):
 
 
 class BoundingBoxFinder:
-    def __init__(self, context):
-        self.context = context
+    def __init__(self, array_context: PyOpenCLArrayContext):
+        self._setup_actx = array_context
 
-        for dev in context.devices:
+        for dev in self.context.devices:
             if (dev.vendor == "Intel(R) Corporation"
                     and dev.version == "OpenCL 1.2 (Build 56860)"):
                 raise RuntimeError("bounding box finder does not work "
                         "properly with this CL runtime.")
 
+    @property
+    def context(self):
+        return self._setup_actx.queue.context
+
     @memoize_method
     def get_kernel(self, dimensions, coord_dtype, have_radii):
+        # FIXME: Why does this just use `devices[0]`?
         bbox_dtype, _bbox_cdecl = make_bounding_box_dtype(
                 self.context.devices[0], dimensions, coord_dtype)
 
@@ -152,18 +156,18 @@ def get_kernel(self, dimensions, coord_dtype, have_radii):
                     )
                 )
 
-    def __call__(self, particles, radii, wait_for=None):
+    def __call__(self, actx: PyOpenCLArrayContext, particles, radii, wait_for=None):
         dimensions = len(particles)
 
         from pytools import single_valued
         coord_dtype = single_valued(coord.dtype for coord in particles)
-
         radii_tuple = () if radii is None else (radii,)
-        knl = self.get_kernel(dimensions, coord_dtype,
-                # have_radii:
-                radii is not None)
-        return knl(*(tuple(particles) + radii_tuple),
-                wait_for=wait_for, return_event=True)
+
+        knl = self.get_kernel(dimensions, coord_dtype, have_radii=radii is not None)
+        return knl(
+            *(tuple(particles) + radii_tuple),
+            queue=actx.queue,
+            wait_for=wait_for, return_event=True)
 
 # }}}
 

From 4355c48d6082495e69cebe4729be9dc5732d7b4c Mon Sep 17 00:00:00 2001
From: Alexandru Fikl <alexfikl@gmail.com>
Date: Tue, 21 Jun 2022 20:13:42 +0300
Subject: [PATCH 06/28] port fmm to array_context

---
 boxtree/fmm.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/boxtree/fmm.py b/boxtree/fmm.py
index d1eacd9a..3c4da9a1 100644
--- a/boxtree/fmm.py
+++ b/boxtree/fmm.py
@@ -30,14 +30,15 @@
 import logging
 from abc import ABC, abstractmethod
 
-
-logger = logging.getLogger(__name__)
 from pytools import ProcessLogger
 
 from boxtree.traversal import FMMTraversalInfo
 from boxtree.tree import Tree
 
 
+logger = logging.getLogger(__name__)
+
+
 # {{{ expansion wrangler interface
 
 class TreeIndependentDataForWrangler:
@@ -113,8 +114,9 @@ class ExpansionWranglerInterface(ABC):
     .. automethod:: finalize_potentials
     """
 
-    def __init__(self, tree_indep: TreeIndependentDataForWrangler,
-            traversal: FMMTraversalInfo):
+    def __init__(self,
+            tree_indep: TreeIndependentDataForWrangler,
+            traversal: FMMTraversalInfo) -> None:
         self.tree_indep = tree_indep
         self.traversal = traversal
 
@@ -264,7 +266,7 @@ def finalize_potentials(self, potentials, template_ary):
             :class:`boxtree.pyfmmlib_integration.FMMLibExpansionWrangler`
             uses :class:`numpy.ndarray` internally, this array can be used
             to help convert the output back to the user's array
-            type (typically :class:`pyopencl.array.Array`).
+            type.
         """
 
     def distribute_source_weights(self, src_weight_vecs, src_idx_all_ranks):
@@ -368,8 +370,8 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs,
     # Interface guidelines: Attributes of the tree are assumed to be known
     # to the expansion wrangler and should not be passed.
 
-    fmm_proc = ProcessLogger(logger, "fmm")
     from boxtree.timing import TimingRecorder
+    fmm_proc = ProcessLogger(logger, "fmm")
     recorder = TimingRecorder()
 
     src_weight_vecs = [wrangler.reorder_sources(weight) for

From 779a377920df6f9e8783aa2d22b5e5597bedf39d Mon Sep 17 00:00:00 2001
From: Alexandru Fikl <alexfikl@gmail.com>
Date: Tue, 21 Jun 2022 20:47:47 +0300
Subject: [PATCH 07/28] port traversal to arraycontext

---
 boxtree/traversal.py | 298 ++++++++++++++++++++++++-------------------
 1 file changed, 166 insertions(+), 132 deletions(-)

diff --git a/boxtree/traversal.py b/boxtree/traversal.py
index f8f3d3b6..f1fd5ecf 100644
--- a/boxtree/traversal.py
+++ b/boxtree/traversal.py
@@ -34,30 +34,27 @@
 THE SOFTWARE.
 """
 
+import enum
 import logging
+from dataclasses import dataclass
 from functools import partial
 
 import numpy as np
+from arraycontext import Array
 from mako.template import Template
 
-import pyopencl as cl
-import pyopencl.array
-import pyopencl.cltypes
-from pyopencl.elementwise import ElementwiseTemplate
-from pytools import Record, memoize_method
+from pyopencl.algorithm import ListOfListsBuilder
+from pyopencl.elementwise import ElementwiseKernel, ElementwiseTemplate
+from pytools import ProcessLogger, log_process, memoize_method
+from pytools.obj_array import make_obj_array
 
-from boxtree.tools import (
-    AXIS_NAMES,
-    DeviceDataRecord,
-    coord_vec_subscript_code,
-    get_coord_vec_dtype,
-)
+from boxtree.array_context import PyOpenCLArrayContext, dataclass_array_container
+from boxtree.tools import AXIS_NAMES, coord_vec_subscript_code, get_coord_vec_dtype
+from boxtree.tree import Tree
 
 
 logger = logging.getLogger(__name__)
 
-from pytools import ProcessLogger, log_process
-
 
 # {{{ preamble
 
@@ -1182,7 +1179,7 @@
     name="merge_lists")
 
 
-class _IndexStyle:
+class _IndexStyle(enum.IntEnum):
     TARGET_BOXES = 0
     TARGET_OR_TARGET_PARENT_BOXES = 1
 
@@ -1190,10 +1187,14 @@ class _IndexStyle:
 class _ListMerger:
     """Utility class for combining box lists optionally changing indexing style."""
 
-    def __init__(self, context, box_id_dtype):
-        self.context = context
+    def __init__(self, array_context: PyOpenCLArrayContext, box_id_dtype):
+        self._setup_actx = array_context
         self.box_id_dtype = box_id_dtype
 
+    @property
+    def context(self):
+        return self._setup_actx.queue.context
+
     @memoize_method
     def get_list_merger_kernel(self, nlists, write_counts):
         """
@@ -1213,7 +1214,7 @@ def get_list_merger_kernel(self, nlists, write_counts):
                     ("write_counts", write_counts),
                 ))
 
-    def __call__(self, queue, input_starts, input_lists, input_index_style,
+    def __call__(self, actx, input_starts, input_lists, input_index_style,
             output_index_style, target_boxes, target_or_target_parent_boxes,
             nboxes, debug=False, wait_for=None):
         """
@@ -1246,18 +1247,18 @@ def __call__(self, queue, input_starts, input_lists, input_index_style,
                 and output_index_style == _IndexStyle.TARGET_BOXES):
             from boxtree.tools import reverse_index_array
             target_or_target_parent_boxes_from_all_boxes = reverse_index_array(
-                    target_or_target_parent_boxes, target_size=nboxes,
-                    queue=queue)
-            target_or_target_parent_boxes_from_target_boxes = cl.array.take(
-                    target_or_target_parent_boxes_from_all_boxes,
-                    target_boxes, queue=queue)
+                    actx, target_or_target_parent_boxes, target_size=nboxes)
+            target_or_target_parent_boxes_from_target_boxes = (
+                    target_or_target_parent_boxes_from_all_boxes[target_boxes]
+                    )
 
             output_to_input_box = target_or_target_parent_boxes_from_target_boxes
         else:
-            output_to_input_box = cl.array.arange(
-                    queue, noutput_boxes, dtype=self.box_id_dtype)
+            output_to_input_box = actx.from_numpy(
+                    np.arange(noutput_boxes, dtype=self.box_id_dtype)
+                    )
 
-        new_counts = cl.array.empty(queue, noutput_boxes+1, self.box_id_dtype)
+        new_counts = actx.np.zeros(noutput_boxes + 1, self.box_id_dtype)
 
         assert len(input_starts) == len(input_lists)
         nlists = len(input_starts)
@@ -1269,17 +1270,14 @@ def __call__(self, queue, input_starts, input_lists, input_index_style,
                     # output:
                     new_counts,
                     range=slice(noutput_boxes),
-                    queue=queue,
+                    queue=actx.queue,
                     wait_for=wait_for)
 
-        new_starts = cl.array.cumsum(new_counts)
+        import pyopencl.array as cl_array
+        new_starts = cl_array.cumsum(new_counts)
         del new_counts
 
-        new_lists = cl.array.empty(
-                queue,
-                int(new_starts[-1].get()),
-                self.box_id_dtype)
-
+        new_lists = actx.np.zeros(int(actx.to_numpy(new_starts[-1])), self.box_id_dtype)
         new_lists.fill(999999999)
 
         evt = self.get_list_merger_kernel(nlists, False)(
@@ -1291,7 +1289,7 @@ def __call__(self, queue, input_starts, input_lists, input_index_style,
                     # output:
                     new_lists,
                     range=slice(noutput_boxes),
-                    queue=queue,
+                    queue=actx.queue,
                     wait_for=[evt])
 
         return {"starts": new_starts, "lists": new_lists}, evt
@@ -1301,7 +1299,9 @@ def __call__(self, queue, input_starts, input_lists, input_index_style,
 
 # {{{ traversal info (output)
 
-class FMMTraversalInfo(DeviceDataRecord):
+@dataclass_array_container
+@dataclass(frozen=True)
+class FMMTraversalInfo:
     r"""Interaction lists needed for a fast-multipole-like linear-time gather of
     particle interactions.
 
@@ -1312,9 +1312,6 @@ class FMMTraversalInfo(DeviceDataRecord):
         Scientific and Statistical Computing 9, no. 4 (July 1988): 669-686.
         `DOI: 10.1137/0909044 <https://dx.doi.org/10.1137/0909044>`__.
 
-    Unless otherwise indicated, all bulk data in this data structure is stored
-    in a :class:`pyopencl.array.Array`. See also :meth:`get`.
-
     .. attribute:: tree
 
         An instance of :class:`boxtree.Tree`.
@@ -1421,16 +1418,6 @@ class FMMTraversalInfo(DeviceDataRecord):
 
         ``box_id_t [*]``
 
-    Following attributes are deprecated.
-
-    .. attribute:: colleagues_starts
-
-        ``box_id_t [nboxes+1]``
-
-    .. attribute:: colleagues_lists
-
-        ``box_id_t [*]``
-
     .. ------------------------------------------------------------------------
     .. rubric:: Neighbor Sources ("List 1")
     .. ------------------------------------------------------------------------
@@ -1552,15 +1539,65 @@ class FMMTraversalInfo(DeviceDataRecord):
         Changed index style of *from_sep_close_bigger_starts* from
         :attr:`target_or_target_parent_boxes` to :attr:`target_boxes`.
 
-
-    .. automethod:: get
-
     .. automethod:: merge_close_lists
     """
 
+    tree: Tree
+    well_sep_is_n_away: int
+
+    # basic box lists for iteration
+    source_boxes: Array
+    target_boxes: Array
+    level_start_source_box_nrs: Array
+    level_start_target_box_nrs: Array
+    source_parent_boxes: Array
+    level_start_source_parent_box_nrs: Array
+    target_or_target_parent_boxes: Array
+    level_start_target_or_target_parent_box_nrs: Array
+
+    # same-level non-well-separated boxes
+    same_level_non_well_sep_boxes_starts: Array
+    same_level_non_well_sep_boxes_lists: Array
+
+    # neighbor sources ("List 1")
+    neighbor_source_boxes_starts: Array
+    neighbor_source_boxes_lists: Array
+
+    # separated siblings ("List 2")
+    from_sep_siblings_starts: Array
+    from_sep_siblings_lists: Array
+
+    # separated smaller boxes ("List 3")
+    from_sep_smaller_by_level: Array
+    target_boxes_sep_smaller_by_source_level: Array
+    from_sep_close_smaller_starts: Array
+    from_sep_close_smaller_lists: Array
+
+    # separated bigger boxes ("List 4")
+    from_sep_bigger_starts: Array
+    from_sep_bigger_lists: Array
+    from_sep_close_bigger_starts: Array
+    from_sep_close_bigger_lists: Array
+
+    @property
+    def nboxes(self):
+        return self.tree.nboxes
+
+    @property
+    def nlevels(self):
+        return self.tree.nlevels
+
+    @property
+    def ntarget_boxes(self):
+        return len(self.target_boxes)
+
+    @property
+    def ntarget_or_target_parent_boxes(self):
+        return len(self.target_or_target_parent_boxes)
+
     # {{{ "close" list merging -> "unified list 1"
 
-    def merge_close_lists(self, queue, debug=False):
+    def merge_close_lists(self, actx, debug=False):
         """Return a new :class:`FMMTraversalInfo` instance with the contents of
         :attr:`from_sep_close_smaller_starts` and
         :attr:`from_sep_close_bigger_starts` merged into
@@ -1568,11 +1605,11 @@ def merge_close_lists(self, queue, debug=False):
         *None*.
         """
 
-        list_merger = _ListMerger(queue.context, self.tree.box_id_dtype)
+        list_merger = _ListMerger(actx, self.tree.box_id_dtype)
 
         result, evt = (
                 list_merger(
-                    queue,
+                    actx,
                     # starts
                     (self.neighbor_source_boxes_starts,
                      self.from_sep_close_smaller_starts,
@@ -1591,11 +1628,13 @@ def merge_close_lists(self, queue, debug=False):
                     self.tree.nboxes,
                     debug))
 
+        import pyopencl as cl
         cl.wait_for_events([evt])
 
-        return self.copy(
-                neighbor_source_boxes_starts=result["starts"].with_queue(None),
-                neighbor_source_boxes_lists=result["lists"].with_queue(None),
+        from dataclasses import replace
+        return replace(self,
+                neighbor_source_boxes_starts=actx.freeze(result["starts"]),
+                neighbor_source_boxes_lists=actx.freeze(result["lists"]),
                 from_sep_close_smaller_starts=None,
                 from_sep_close_smaller_lists=None,
                 from_sep_close_bigger_starts=None,
@@ -1606,34 +1645,25 @@ def merge_close_lists(self, queue, debug=False):
     # {{{ debugging aids
 
     def get_box_list(self, what, index):
-        starts = getattr(self, what+"_starts")
-        lists = getattr(self, what+"_lists")
+        starts = getattr(self, f"{what}_starts")
+        lists = getattr(self, f"{what}_lists")
         start, stop = starts[index:index+2]
         return lists[start:stop]
 
     # }}}
 
-    @property
-    def nboxes(self):
-        return self.tree.nboxes
-
-    @property
-    def nlevels(self):
-        return self.tree.nlevels
-
-    @property
-    def ntarget_boxes(self):
-        return len(self.target_boxes)
-
-    @property
-    def ntarget_or_target_parent_boxes(self):
-        return len(self.target_or_target_parent_boxes)
-
 # }}}
 
 
-class _KernelInfo(Record):
-    pass
+@dataclass(frozen=True)
+class _KernelInfo:
+    sources_parents_and_targets_builder: ListOfListsBuilder
+    level_start_box_nrs_extractor: ElementwiseKernel
+    same_level_non_well_sep_boxes_builder: ListOfListsBuilder
+    neighbor_source_boxes_builder: ListOfListsBuilder
+    from_sep_siblings_builder: ListOfListsBuilder
+    from_sep_smaller_builder: ListOfListsBuilder
+    from_sep_bigger_builder: ListOfListsBuilder
 
 
 class FMMTraversalBuilder:
@@ -1641,7 +1671,9 @@ class FMMTraversalBuilder:
     .. automethod:: __init__
     """
 
-    def __init__(self, context, well_sep_is_n_away=1, from_sep_smaller_crit=None):
+    def __init__(self, array_context: PyOpenCLArrayContext, *,
+            well_sep_is_n_away=1,
+            from_sep_smaller_crit=None) -> None:
         """
         :arg well_sep_is_n_away: Either An integer 1 or greater.
             (Only 1 and 2 are tested.)
@@ -1657,10 +1689,14 @@ def __init__(self, context, well_sep_is_n_away=1, from_sep_smaller_crit=None):
             including their radii), or ``"static_l2"`` (use the circumcircle of
             the box, possibly enlarged by :attr:`boxtree.Tree.stick_out_factor`).
         """
-        self.context = context
+        self._setup_actx = array_context
         self.well_sep_is_n_away = well_sep_is_n_away
         self.from_sep_smaller_crit = from_sep_smaller_crit
 
+    @property
+    def context(self):
+        return self._setup_actx.queue.context
+
     # {{{ kernel builder
 
     @memoize_method
@@ -1738,6 +1774,7 @@ def get_kernel_info(self, *, dimensions, particle_id_dtype, box_id_dtype,
                 "source_boxes_has_mask": source_boxes_has_mask,
                 "source_parent_boxes_has_mask": source_parent_boxes_has_mask,
                 }
+
         from pyopencl.algorithm import ListOfListsBuilder
 
         from boxtree.tools import ScalarArg, VectorArg
@@ -1873,13 +1910,12 @@ def get_kernel_info(self, *, dimensions, particle_id_dtype, box_id_dtype,
 
     # {{{ driver
 
-    def __call__(self, queue, tree, wait_for=None, debug=False,
+    def __call__(self, actx: PyOpenCLArrayContext, tree: Tree,
+                 wait_for=None, debug=False,
                  _from_sep_smaller_min_nsources_cumul=None,
                  source_boxes_mask=None,
                  source_parent_boxes_mask=None):
         """
-        :arg queue: A :class:`pyopencl.CommandQueue` instance.
-        :arg tree: A :class:`boxtree.Tree` instance.
         :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
             instances for whose completion this command waits before starting
             execution.
@@ -1888,7 +1924,7 @@ def __call__(self, queue, tree, wait_for=None, debug=False,
         :arg source_parent_boxes_mask: Only boxes passing this mask will be
             considered for `source_parent_boxes`. Used by the distributed
             implementation.
-        :return: A tuple *(trav, event)*, where *trav* is a new instance of
+        :return: A :class:`tuple` *(trav, event)*, where *trav* is a new instance of
             :class:`FMMTraversalInfo` and *event* is a :class:`pyopencl.Event`
             for dependency management.
         """
@@ -1908,16 +1944,17 @@ def __call__(self, queue, tree, wait_for=None, debug=False,
                     "traversal generation")
 
         # FIXME: missing on TreeOfBoxes
+        nlevels = actx.to_numpy(tree.nlevels)
         sources_are_targets = getattr(tree, "sources_are_targets", True)
 
         # Generated code shouldn't depend on the *exact* number of tree levels.
         # So round up to the next multiple of 5.
         from pytools import div_ceil
-        max_levels = div_ceil(tree.nlevels, 5) * 5
+        max_levels = div_ceil(nlevels, 5) * 5
 
         level_start_box_nrs = (
                 None if tree.level_start_box_nrs is None else
-                cl.array.to_device(queue, tree.level_start_box_nrs))
+                tree.level_start_box_nrs)
 
         knl_info = self.get_kernel_info(
                 dimensions=tree.dimensions,
@@ -1933,9 +1970,9 @@ def __call__(self, queue, tree, wait_for=None, debug=False,
                 source_boxes_has_mask=source_boxes_mask is not None,
                 source_parent_boxes_has_mask=source_parent_boxes_mask is not None)
 
-        def fin_debug(s):
+        def debug_with_finish(s):
             if debug:
-                queue.finish()
+                actx.queue.finish()
 
             logger.debug(s)
 
@@ -1943,7 +1980,8 @@ def fin_debug(s):
 
         # {{{ source boxes, their parents, and target boxes
 
-        fin_debug("building list of source boxes, their parents, and target boxes")
+        debug_with_finish(
+            "building list of source boxes, their parents, and target boxes")
 
         extra_args = []
         if source_boxes_mask is not None:
@@ -1952,7 +1990,7 @@ def fin_debug(s):
             extra_args.append(source_parent_boxes_mask)
 
         result, evt = knl_info.sources_parents_and_targets_builder(
-            queue, tree.nboxes, tree.box_flags, *extra_args, wait_for=wait_for
+            actx.queue, tree.nboxes, tree.box_flags, *extra_args, wait_for=wait_for
         )
 
         wait_for = [evt]
@@ -1974,43 +2012,44 @@ def extract_level_start_box_nrs(box_list, wait_for):
             if level_start_box_nrs is None:
                 return None, []
 
-            result = cl.array.empty(queue,
-                    tree.nlevels+1, tree.box_id_dtype) \
-                            .fill(len(box_list))
+            result = actx.np.zeros(
+                nlevels + 1, tree.box_id_dtype).fill(len(box_list))
+
             evt = knl_info.level_start_box_nrs_extractor(
                     level_start_box_nrs,
                     tree.box_levels,
                     box_list,
                     result,
                     range=slice(0, len(box_list)),
-                    queue=queue, wait_for=wait_for)
+                    queue=actx.queue, wait_for=wait_for)
 
-            result = result.get()
+            result = actx.to_numpy(result)
 
             # Postprocess result for unoccupied levels
             prev_start = len(box_list)
-            for ilev in range(tree.nlevels-1, -1, -1):
+            for ilev in range(nlevels - 1, -1, -1):
                 result[ilev] = prev_start = \
                         min(result[ilev], prev_start)
 
             return result, [evt]
 
-        fin_debug("finding level starts in source boxes array")
+        debug_with_finish("finding level starts in source boxes array")
         level_start_source_box_nrs, evt_s = \
                 extract_level_start_box_nrs(
                         source_boxes, wait_for=wait_for)
 
-        fin_debug("finding level starts in source parent boxes array")
+        debug_with_finish("finding level starts in source parent boxes array")
         level_start_source_parent_box_nrs, evt_sp = \
                 extract_level_start_box_nrs(
                         source_parent_boxes, wait_for=wait_for)
 
-        fin_debug("finding level starts in target boxes array")
+        debug_with_finish("finding level starts in target boxes array")
         level_start_target_box_nrs, evt_t = \
                 extract_level_start_box_nrs(
                         target_boxes, wait_for=wait_for)
 
-        fin_debug("finding level starts in target or target parent boxes array")
+        debug_with_finish(
+            "finding level starts in target or target parent boxes array")
         level_start_target_or_target_parent_box_nrs, evt_tp = \
                 extract_level_start_box_nrs(
                         target_or_target_parent_boxes, wait_for=wait_for)
@@ -2024,10 +2063,10 @@ def extract_level_start_box_nrs(box_list, wait_for):
         # If well_sep_is_n_away is 1, this agrees with the definition of
         # 'colleagues' from the classical FMM literature.
 
-        fin_debug("finding same-level near-field boxes")
+        debug_with_finish("finding same-level near-field boxes")
 
         result, evt = knl_info.same_level_non_well_sep_boxes_builder(
-                queue, tree.nboxes,
+                actx.queue, tree.nboxes,
                 tree.box_centers.data, tree.root_extent, tree.box_levels,
                 tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags,
                 wait_for=wait_for)
@@ -2038,10 +2077,10 @@ def extract_level_start_box_nrs(box_list, wait_for):
 
         # {{{ neighbor source boxes ("list 1")
 
-        fin_debug("finding neighbor source boxes ('list 1')")
+        debug_with_finish("finding neighbor source boxes ('list 1')")
 
         result, evt = knl_info.neighbor_source_boxes_builder(
-                queue, len(target_boxes),
+                actx.queue, len(target_boxes),
                 tree.box_centers.data, tree.root_extent, tree.box_levels,
                 tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags,
                 target_boxes, wait_for=wait_for)
@@ -2053,10 +2092,10 @@ def extract_level_start_box_nrs(box_list, wait_for):
 
         # {{{ well-separated siblings ("list 2")
 
-        fin_debug("finding well-separated siblings ('list 2')")
+        debug_with_finish("finding well-separated siblings ('list 2')")
 
         result, evt = knl_info.from_sep_siblings_builder(
-                queue, len(target_or_target_parent_boxes),
+                actx.queue, len(target_or_target_parent_boxes),
                 tree.box_centers.data, tree.root_extent, tree.box_levels,
                 tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags,
                 target_or_target_parent_boxes, tree.box_parent_ids.data,
@@ -2072,10 +2111,10 @@ def extract_level_start_box_nrs(box_list, wait_for):
 
         # {{{ separated smaller ("list 3")
 
-        fin_debug("finding separated smaller ('list 3')")
+        debug_with_finish("finding separated smaller ('list 3')")
 
         from_sep_smaller_base_args = (
-                queue, len(target_boxes),
+                actx.queue, len(target_boxes),
                 # base_args
                 tree.box_centers.data, tree.root_extent, tree.box_levels,
                 tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags,
@@ -2094,8 +2133,8 @@ def extract_level_start_box_nrs(box_list, wait_for):
         from_sep_smaller_by_level = []
         target_boxes_sep_smaller_by_source_level = []
 
-        for ilevel in range(tree.nlevels):
-            fin_debug(f"finding separated smaller ('list 3 level {ilevel}')")
+        for ilevel in range(nlevels):
+            debug_with_finish(f"finding separated smaller ('list 3 level {ilevel}')")
 
             result, evt = knl_info.from_sep_smaller_builder(
                     *from_sep_smaller_base_args, ilevel,
@@ -2110,7 +2149,7 @@ def extract_level_start_box_nrs(box_list, wait_for):
             from_sep_smaller_wait_for.append(evt)
 
         if with_extent:
-            fin_debug("finding separated smaller close ('list 3 close')")
+            debug_with_finish("finding separated smaller close ('list 3 close')")
             result, evt = knl_info.from_sep_smaller_builder(
                     *from_sep_smaller_base_args,
                      -1,
@@ -2131,10 +2170,10 @@ def extract_level_start_box_nrs(box_list, wait_for):
 
         # {{{ separated bigger ("list 4")
 
-        fin_debug("finding separated bigger ('list 4')")
+        debug_with_finish("finding separated bigger ('list 4')")
 
         result, evt = knl_info.from_sep_bigger_builder(
-                queue, len(target_or_target_parent_boxes),
+                actx.queue, len(target_or_target_parent_boxes),
                 tree.box_centers.data, tree.root_extent, tree.box_levels,
                 tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags,
                 tree.stick_out_factor, target_or_target_parent_boxes,
@@ -2152,9 +2191,9 @@ def extract_level_start_box_nrs(box_list, wait_for):
             from_sep_close_bigger_starts_raw = result["from_sep_close_bigger"].starts
             from_sep_close_bigger_lists_raw = result["from_sep_close_bigger"].lists
 
-            list_merger = _ListMerger(queue.context, tree.box_id_dtype)
+            list_merger = _ListMerger(actx, tree.box_id_dtype)
             result, evt = list_merger(
-                    queue,
+                    actx,
                     # starts
                     (from_sep_close_bigger_starts_raw,),
                     # lists
@@ -2183,43 +2222,35 @@ def extract_level_start_box_nrs(box_list, wait_for):
 
         # }}}
 
-        if self.well_sep_is_n_away == 1:
-            colleagues_starts = same_level_non_well_sep_boxes.starts
-            colleagues_lists = same_level_non_well_sep_boxes.lists
-        else:
-            colleagues_starts = None
-            colleagues_lists = None
-
         evt, = wait_for
-
         traversal_plog.done(
                 "from_sep_smaller_crit: %s",
                 self.from_sep_smaller_crit)
 
-        return FMMTraversalInfo(
+        info = FMMTraversalInfo(
                 tree=tree,
                 well_sep_is_n_away=self.well_sep_is_n_away,
 
                 source_boxes=source_boxes,
                 target_boxes=target_boxes,
 
-                level_start_source_box_nrs=level_start_source_box_nrs,
-                level_start_target_box_nrs=level_start_target_box_nrs,
+                level_start_source_box_nrs=actx.from_numpy(
+                    level_start_source_box_nrs),
+                level_start_target_box_nrs=actx.from_numpy(
+                    level_start_target_box_nrs),
 
                 source_parent_boxes=source_parent_boxes,
-                level_start_source_parent_box_nrs=level_start_source_parent_box_nrs,
+                level_start_source_parent_box_nrs=actx.from_numpy(
+                    level_start_source_parent_box_nrs),
 
                 target_or_target_parent_boxes=target_or_target_parent_boxes,
-                level_start_target_or_target_parent_box_nrs=(
+                level_start_target_or_target_parent_box_nrs=actx.from_numpy(
                     level_start_target_or_target_parent_box_nrs),
 
                 same_level_non_well_sep_boxes_starts=(
                     same_level_non_well_sep_boxes.starts),
                 same_level_non_well_sep_boxes_lists=(
                     same_level_non_well_sep_boxes.lists),
-                # Deprecated, but we'll keep these alive for the time being.
-                colleagues_starts=colleagues_starts,
-                colleagues_lists=colleagues_lists,
 
                 neighbor_source_boxes_starts=neighbor_source_boxes.starts,
                 neighbor_source_boxes_lists=neighbor_source_boxes.lists,
@@ -2227,8 +2258,9 @@ def extract_level_start_box_nrs(box_list, wait_for):
                 from_sep_siblings_starts=from_sep_siblings.starts,
                 from_sep_siblings_lists=from_sep_siblings.lists,
 
-                from_sep_smaller_by_level=from_sep_smaller_by_level,
-                target_boxes_sep_smaller_by_source_level=(
+                from_sep_smaller_by_level=make_obj_array(
+                    from_sep_smaller_by_level),
+                target_boxes_sep_smaller_by_source_level=make_obj_array(
                     target_boxes_sep_smaller_by_source_level),
 
                 from_sep_close_smaller_starts=from_sep_close_smaller_starts,
@@ -2239,7 +2271,9 @@ def extract_level_start_box_nrs(box_list, wait_for):
 
                 from_sep_close_bigger_starts=from_sep_close_bigger_starts,
                 from_sep_close_bigger_lists=from_sep_close_bigger_lists,
-                ).with_queue(None), evt
+                )
+
+        return actx.freeze(info), evt
 
     # }}}
 

From f2140cabba248d19ff13ba4cf6dabce0f9756ef6 Mon Sep 17 00:00:00 2001
From: Alexandru Fikl <alexfikl@gmail.com>
Date: Tue, 21 Jun 2022 21:13:31 +0300
Subject: [PATCH 08/28] port tree to arraycontext

---
 boxtree/tree.py            | 324 ++++++++++++++++++-------------------
 test/test_tree_of_boxes.py |   5 +-
 2 files changed, 163 insertions(+), 166 deletions(-)

diff --git a/boxtree/tree.py b/boxtree/tree.py
index ade9d965..857c04cc 100644
--- a/boxtree/tree.py
+++ b/boxtree/tree.py
@@ -81,16 +81,16 @@
 """
 
 import logging
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from functools import cached_property
 
 import numpy as np
+from arraycontext import Array
 
-import pyopencl as cl
 from cgen import Enum
 from pytools import memoize_method
 
-from boxtree.tools import DeviceDataRecord
+from boxtree.array_context import PyOpenCLArrayContext, dataclass_array_container
 
 
 logger = logging.getLogger(__name__)
@@ -141,7 +141,8 @@ class box_flags_enum(Enum):  # noqa
 
 # {{{ tree of boxes
 
-@dataclass
+@dataclass_array_container
+@dataclass(frozen=True)
 class TreeOfBoxes:
     """A quad/octree tree of pure boxes, excluding their contents (e.g.
     particles).  It is a lightweight tree handled with :mod:`numpy`, intended
@@ -215,15 +216,15 @@ class TreeOfBoxes:
     .. automethod:: __init__
     """
 
-    root_extent: np.ndarray
-    box_centers: np.ndarray
+    root_extent: Array
+    box_centers: Array
 
-    box_parent_ids: np.ndarray
-    box_child_ids: np.ndarray
-    box_levels: np.ndarray
+    box_parent_ids: Array
+    box_child_ids: Array
+    box_levels: Array
 
-    box_flags: np.ndarray | None
-    level_start_box_nrs: np.ndarray | None
+    box_flags: Array | None
+    level_start_box_nrs: Array | None
 
     # FIXME: these should be properties and take values from box_parent_ids, etc
     box_id_dtype: np.dtype
@@ -251,11 +252,7 @@ def aligned_nboxes(self):
 
     @property
     def nlevels(self):
-        # level starts from 0
-        if isinstance(self.box_levels, cl.array.Array):
-            return int(max(self.box_levels).get()) + 1
-        else:
-            return max(self.box_levels) + 1
+        return max(self.box_levels) + 1
 
     @property
     def leaf_boxes(self):
@@ -288,7 +285,9 @@ def get_box_extent(self, ibox):
 
 # {{{ tree with particles
 
-class Tree(DeviceDataRecord, TreeOfBoxes):
+@dataclass_array_container
+@dataclass(frozen=True)
+class Tree(TreeOfBoxes):
     r"""A quad/octree consisting of particles sorted into a hierarchy of boxes.
 
     Optionally, particles may be designated 'sources' and 'targets'. They
@@ -298,9 +297,6 @@ class Tree(DeviceDataRecord, TreeOfBoxes):
     Instances of this class are not constructed directly. They are returned
     by :meth:`TreeBuilder.__call__`.
 
-    Unless otherwise indicated, all bulk data in this data structure is stored
-    in a :class:`pyopencl.array.Array`. See also :meth:`get`.
-
     Inherits from :class:`TreeOfBoxes`.
 
     .. rubric:: Flags
@@ -379,13 +375,6 @@ class Tree(DeviceDataRecord, TreeOfBoxes):
         in each level, access the start of the next level. This array is
         built so that this works even for the last level.
 
-    .. attribute:: level_start_box_nrs_dev
-
-        ``particle_id_t [nlevels+1]``
-
-        The same array as :attr:`level_start_box_nrs`
-        as a :class:`pyopencl.array.Array`.
-
     .. ------------------------------------------------------------------------
     .. rubric:: Per-particle arrays
     .. ------------------------------------------------------------------------
@@ -554,12 +543,43 @@ class Tree(DeviceDataRecord, TreeOfBoxes):
     .. attribute:: box_target_bounding_box_max
 
         ``coordt_t [dimensions, aligned_nboxes]``
-
-    .. rubric:: Methods
-
-    .. automethod:: get
     """
 
+    # flags
+    sources_are_targets: bool
+
+    # data types
+    particle_id_dtype: np.dtype
+
+    # per-particle arrays
+    sources: Array
+    source_radii: Array
+    targets: Array
+    target_radii: Array
+
+    # FIXME: this needs to be init=True to override the cached property in
+    # the base class. The base-class version fails because `x[:, 0] - c` does
+    # arithmetic on a non-contiguous array, which pyopencl does not support.
+    bounding_box: tuple[Array, Array] = field(init=True)
+
+    # tree / user order indices
+    user_source_ids: Array
+    sorted_target_ids: Array
+
+    # box properties
+    box_source_starts: Array
+    box_source_counts_nonchild: Array
+    box_source_counts_cumul: Array
+    box_target_starts: Array
+    box_target_counts_nonchild: Array
+    box_target_counts_cumul: Array
+
+    # particle-adaptive box extents
+    box_source_bounding_box_min: Array
+    box_source_bounding_box_max: Array
+    box_target_bounding_box_min: Array
+    box_target_bounding_box_max: Array
+
     @property
     def dimensions(self):
         return len(self.sources)
@@ -582,6 +602,8 @@ def ntargets(self):
     def nlevels(self):
         return len(self.level_start_box_nrs) - 1
 
+    # {{{ dummy interface for TreePlotter
+
     def plot(self, **kwargs):
         from boxtree.visualization import TreePlotter
         plotter = TreePlotter(self)
@@ -595,9 +617,11 @@ def get_box_extent(self, ibox):
         extent_high = extent_low + box_size
         return extent_low, extent_high
 
+    # }}}
+
     # {{{ debugging aids
 
-    # these assume numpy arrays (i.e. post-.get()), for now
+    # these assume numpy arrays for now
 
     def _reverse_index_lookup(self, ary, new_key_size):
         result = np.empty(new_key_size, ary.dtype)
@@ -642,26 +666,13 @@ def find_box_nr_for_source(self, isource):
 
     # }}}
 
-    def to_device(self, queue, exclude_fields=frozenset()):
-        # level_start_box_nrs should remain in host memory
-        exclude_fields = set(exclude_fields)
-        exclude_fields.add("level_start_box_nrs")
-
-        return super().to_device(queue, frozenset(exclude_fields))
-
-    def to_host_device_array(self, queue, exclude_fields=frozenset()):
-        # level_start_box_nrs should remain in host memory
-        exclude_fields = set(exclude_fields)
-        exclude_fields.add("level_start_box_nrs")
-
-        return super().to_host_device_array(
-            queue, frozenset(exclude_fields))
-
 # }}}
 
 
 # {{{ tree with linked point sources
 
+@dataclass_array_container
+@dataclass(frozen=True)
 class TreeWithLinkedPointSources(Tree):
     """In this :class:`boxtree.Tree` subclass, the sources of the original tree are
     linked with extent are expanded into point sources which are linked to the
@@ -724,20 +735,26 @@ class TreeWithLinkedPointSources(Tree):
 
         This constructor is not intended to be called by users directly.
         Call :func:`link_point_sources` instead.
-
-    .. rubric:: Methods
-
-    .. automethod:: get
     """
 
+    npoint_sources: int
+    point_source_starts: Array
+    point_source_counts: Array
+    point_sources: Array
+    user_point_source_ids: Array
+    box_point_source_starts: Array
+    box_point_source_counts_nonchild: Array
+    box_point_source_counts_cumul: Array
+
 
-def link_point_sources(queue, tree, point_source_starts, point_sources,
-        debug=False):
+def link_point_sources(
+        actx: PyOpenCLArrayContext, tree: Tree,
+        point_source_starts: Array, point_sources: Array, *,
+        debug: bool = False):
     r"""
     *Construction:* Requires that :attr:`boxtree.Tree.sources_have_extent` is *True*
     on *tree*.
 
-    :arg queue: a :class:`pyopencl.CommandQueue` instance
     :arg point_source_starts: ``point_source_starts[isrc]`` and
         ``point_source_starts[isrc+1]`` together indicate a ranges of point
         particle indices in *point_sources* which will be linked to the
@@ -759,21 +776,21 @@ def link_point_sources(queue, tree, point_source_starts, point_sources,
     if not tree.sources_have_extent:
         raise ValueError("only allowed on trees whose sources have extent")
 
-    npoint_sources_dev = cl.array.empty(queue, (), tree.particle_id_dtype)
+    npoint_sources_dev = actx.np.zeros((), tree.particle_id_dtype)
 
     # {{{ compute tree_order_point_source_{starts, counts}
 
     # Scan over lengths of point source lists in tree order to determine
     # indices of point source starts for each source.
 
-    tree_order_point_source_starts = cl.array.empty(
-            queue, tree.nsources, tree.particle_id_dtype)
-    tree_order_point_source_counts = cl.array.empty(
-            queue, tree.nsources, tree.particle_id_dtype)
+    tree_order_point_source_starts = actx.np.zeros(
+            tree.nsources, tree.particle_id_dtype)
+    tree_order_point_source_counts = actx.np.zeros(
+            tree.nsources, tree.particle_id_dtype)
 
     from boxtree.tree_build_kernels import POINT_SOURCE_LINKING_SOURCE_SCAN_TPL
     knl = POINT_SOURCE_LINKING_SOURCE_SCAN_TPL.build(
-        queue.context,
+        actx.queue.context,
         type_aliases=(
             ("scan_t", tree.particle_id_dtype),
             ("index_t", tree.particle_id_dtype),
@@ -785,39 +802,40 @@ def link_point_sources(queue, tree, point_source_starts, point_sources,
 
     knl(point_source_starts, tree.user_source_ids,
             tree_order_point_source_starts, tree_order_point_source_counts,
-            npoint_sources_dev, size=tree.nsources, queue=queue)
+            npoint_sources_dev, size=tree.nsources, queue=actx.queue)
 
     # }}}
 
-    npoint_sources = int(npoint_sources_dev.get())
+    npoint_sources = int(actx.to_numpy(npoint_sources_dev))
 
     # {{{ compute user_point_source_ids
 
     # A list of point source starts, indexed in tree order,
     # but giving point source indices in user order.
-    tree_order_index_user_point_source_starts = cl.array.take(
-            point_source_starts, tree.user_source_ids,
-            queue=queue)
+    tree_order_index_user_point_source_starts = (
+            point_source_starts[tree.user_source_ids])
 
-    user_point_source_ids = cl.array.empty(
-            queue, npoint_sources, tree.particle_id_dtype)
+    user_point_source_ids = actx.np.zeros(npoint_sources, tree.particle_id_dtype)
     user_point_source_ids.fill(1)
-    cl.array.multi_put([tree_order_index_user_point_source_starts],
+
+    import pyopencl.array as cl_array
+    cl_array.multi_put(
+            [tree_order_index_user_point_source_starts],
             dest_indices=tree_order_point_source_starts,
             out=[user_point_source_ids])
 
     if debug:
-        ups_host = user_point_source_ids.get()
-        assert (ups_host >= 0).all()
-        assert (ups_host < npoint_sources).all()
+        ups_host = actx.to_numpy(user_point_source_ids)
+        assert np.all(ups_host >= 0)
+        assert np.all(ups_host < npoint_sources)
 
-    source_boundaries = cl.array.zeros(queue, npoint_sources, np.int8)
+    source_boundaries = actx.np.zeros(npoint_sources, np.int8)
 
     # FIXME: Should be a scalar, in principle.
-    ones = cl.array.empty(queue, tree.nsources, np.int8)
+    ones = actx.np.zeros(tree.nsources, np.int8)
     ones.fill(1)
 
-    cl.array.multi_put(
+    cl_array.multi_put(
             [ones],
             dest_indices=tree_order_point_source_starts,
             out=[source_boundaries])
@@ -829,7 +847,7 @@ def link_point_sources(queue, tree, point_source_starts, point_sources,
     logger.debug("point source linking: point source id scan")
 
     knl = POINT_SOURCE_LINKING_USER_POINT_SOURCE_ID_SCAN_TPL.build(
-        queue.context,
+        actx.queue.context,
         type_aliases=(
             ("scan_t", tree.particle_id_dtype),
             ("index_t", tree.particle_id_dtype),
@@ -837,19 +855,18 @@ def link_point_sources(queue, tree, point_source_starts, point_sources,
             ),
         )
     knl(source_boundaries, user_point_source_ids,
-            size=npoint_sources, queue=queue)
+            size=npoint_sources, queue=actx.queue)
 
     if debug:
-        ups_host = user_point_source_ids.get()
-        assert (ups_host >= 0).all()
-        assert (ups_host < npoint_sources).all()
+        ups_host = actx.to_numpy(user_point_source_ids)
+        assert np.all(ups_host >= 0)
+        assert np.all(ups_host < npoint_sources)
 
     # }}}
 
     from pytools.obj_array import make_obj_array
     tree_order_point_sources = make_obj_array([
-        cl.array.take(point_sources[i], user_point_source_ids,
-            queue=queue)
+        cl_array.take(point_sources[i], user_point_source_ids, queue=actx.queue)
         for i in range(tree.dimensions)
         ])
 
@@ -858,7 +875,7 @@ def link_point_sources(queue, tree, point_source_starts, point_sources,
     from boxtree.tree_build_kernels import POINT_SOURCE_LINKING_BOX_POINT_SOURCES
 
     knl = POINT_SOURCE_LINKING_BOX_POINT_SOURCES.build(
-        queue.context,
+        actx.queue.context,
         type_aliases=(
             ("particle_id_t", tree.particle_id_dtype),
             ("box_id_t", tree.box_id_dtype),
@@ -867,12 +884,10 @@ def link_point_sources(queue, tree, point_source_starts, point_sources,
 
     logger.debug("point source linking: box point sources")
 
-    box_point_source_starts = cl.array.empty(
-            queue, tree.nboxes, tree.particle_id_dtype)
-    box_point_source_counts_nonchild = cl.array.empty(
-            queue, tree.nboxes, tree.particle_id_dtype)
-    box_point_source_counts_cumul = cl.array.empty(
-            queue, tree.nboxes, tree.particle_id_dtype)
+    box_point_source_starts = actx.np.zeros(tree.nboxes, tree.particle_id_dtype)
+    box_point_source_counts_cumul = actx.np.zeros(tree.nboxes, tree.particle_id_dtype)
+    box_point_source_counts_nonchild = actx.np.zeros(
+            tree.nboxes, tree.particle_id_dtype)
 
     knl(
             box_point_source_starts, box_point_source_counts_nonchild,
@@ -883,20 +898,21 @@ def link_point_sources(queue, tree, point_source_starts, point_sources,
 
             tree_order_point_source_starts,
             tree_order_point_source_counts,
-            range=slice(tree.nboxes), queue=queue)
+            range=slice(tree.nboxes), queue=actx.queue)
 
     # }}}
 
     logger.info("point source linking: complete")
 
+    from dataclasses import fields
     tree_attrs = {}
-    for attr_name in tree.__class__.fields:
+    for f in fields(tree):
         try:  # noqa: SIM105
-            tree_attrs[attr_name] = getattr(tree, attr_name)
+            tree_attrs[f.name] = getattr(tree, f.name)
         except AttributeError:
             pass
 
-    return TreeWithLinkedPointSources(
+    tree_with_point_sources = TreeWithLinkedPointSources(
             npoint_sources=npoint_sources,
             point_source_starts=tree_order_point_source_starts,
             point_source_counts=tree_order_point_source_counts,
@@ -906,7 +922,9 @@ def link_point_sources(queue, tree, point_source_starts, point_sources,
             box_point_source_counts_nonchild=box_point_source_counts_nonchild,
             box_point_source_counts_cumul=box_point_source_counts_cumul,
 
-            **tree_attrs).with_queue(None)
+            **tree_attrs)
+
+    return actx.freeze(tree_with_point_sources)
 
 
 # }}}
@@ -914,7 +932,9 @@ def link_point_sources(queue, tree, point_source_starts, point_sources,
 
 # {{{ particle list filter
 
-class FilteredTargetListsInUserOrder(DeviceDataRecord):
+@dataclass_array_container
+@dataclass(frozen=True)
+class FilteredTargetListsInUserOrder:
     """Use :meth:`ParticleListFilter.filter_target_lists_in_user_order` to create
     instances of this class.
 
@@ -947,14 +967,16 @@ class FilteredTargetListsInUserOrder(DeviceDataRecord):
         child boxes).  Use together with :attr:`target_starts`.
 
         Target numbers are stored in user order, as the class name suggests.
-
-    .. rubric:: Methods
-
-    .. automethod:: get
     """
 
+    nfiltered_targets: int
+    target_starts: Array
+    target_lists: Array
+
 
-class FilteredTargetListsInTreeOrder(DeviceDataRecord):
+@dataclass_array_container
+@dataclass(frozen=True)
+class FilteredTargetListsInTreeOrder:
     """Use :meth:`ParticleListFilter.filter_target_lists_in_tree_order` to create
     instances of this class.
 
@@ -999,12 +1021,14 @@ class FilteredTargetListsInTreeOrder(DeviceDataRecord):
         Storing *to* these indices will reorder the targets
         from *filtered* tree target order into 'regular'
         :ref:`tree target order <particle-orderings>`.
-
-    .. rubric:: Methods
-
-    .. automethod:: get
     """
 
+    nfiltered_targets: int
+    box_target_starts: Array
+    box_target_counts_nonchild: Array
+    targets: Array
+    unfiltered_from_filtered_target_indices: Array
+
 
 class ParticleListFilter:
     """
@@ -1012,8 +1036,12 @@ class ParticleListFilter:
     .. automethod:: filter_target_lists_in_user_order
     """
 
-    def __init__(self, context):
-        self.context = context
+    def __init__(self, array_context: PyOpenCLArrayContext):
+        self._setup_actx = array_context
+
+    @property
+    def context(self):
+        return self._setup_actx.queue.context
 
     @memoize_method
     def get_filter_target_lists_in_user_order_kernel(self, particle_id_dtype,
@@ -1055,7 +1083,7 @@ def get_filter_target_lists_in_user_order_kernel(self, particle_id_dtype,
 
         return builder
 
-    def filter_target_lists_in_user_order(self, queue, tree, flags):
+    def filter_target_lists_in_user_order(self, actx, tree, flags):
         """
         :arg flags: an array of length :attr:`boxtree.Tree.ntargets` of
             :class:`numpy.int8` objects, which indicate by being zero that the
@@ -1067,25 +1095,27 @@ def filter_target_lists_in_user_order(self, queue, tree, flags):
         user_order_flags = flags
         del flags
 
-        user_target_ids = cl.array.empty(queue, tree.ntargets,
-                tree.sorted_target_ids.dtype)
-        user_target_ids[tree.sorted_target_ids] = cl.array.arange(
-                queue, tree.ntargets, user_target_ids.dtype)
+        user_target_ids = actx.np.zeros(tree.ntargets, tree.sorted_target_ids.dtype)
+        user_target_ids[tree.sorted_target_ids] = actx.from_numpy(
+                np.arange(tree.ntargets, dtype=user_target_ids.dtype)
+                )
 
         kernel = self.get_filter_target_lists_in_user_order_kernel(
                 tree.particle_id_dtype, user_order_flags.dtype)
 
-        result, _evt = kernel(queue, tree.nboxes,
+        result, _evt = kernel(actx.queue, tree.nboxes,
                 user_order_flags,
                 user_target_ids,
                 tree.box_target_starts,
                 tree.box_target_counts_nonchild)
 
-        return FilteredTargetListsInUserOrder(
+        target_lists = FilteredTargetListsInUserOrder(
                 nfiltered_targets=result["filt_tgt_list"].count,
                 target_starts=result["filt_tgt_list"].starts,
                 target_lists=result["filt_tgt_list"].lists,
-                ).with_queue(None)
+                )
+
+        return actx.freeze(target_lists)
 
     @memoize_method
     def get_filter_target_lists_in_tree_order_kernels(self, particle_id_dtype):
@@ -1111,7 +1141,7 @@ def get_filter_target_lists_in_tree_order_kernels(self, particle_id_dtype):
 
         return scan_knl, index_knl
 
-    def filter_target_lists_in_tree_order(self, queue, tree, flags):
+    def filter_target_lists_in_tree_order(self, actx, tree, flags):
         """
         :arg flags: an array of length :attr:`boxtree.Tree.ntargets` of
             :class:`numpy.int8` objects, which indicate by being zero that the
@@ -1120,15 +1150,15 @@ def filter_target_lists_in_tree_order(self, queue, tree, flags):
         :returns: A :class:`FilteredTargetListsInTreeOrder`
         """
 
-        tree_order_flags = cl.array.empty(queue, tree.ntargets, np.int8)
+        tree_order_flags = actx.np.zeros(tree.ntargets, np.int8)
         tree_order_flags[tree.sorted_target_ids] = flags
 
-        filtered_from_unfiltered_target_indices = cl.array.empty(
-                queue, tree.ntargets, tree.particle_id_dtype)
-        unfiltered_from_filtered_target_indices = cl.array.empty(
-                queue, tree.ntargets, tree.particle_id_dtype)
+        filtered_from_unfiltered_target_indices = actx.np.zeros(
+                tree.ntargets, tree.particle_id_dtype)
+        unfiltered_from_filtered_target_indices = actx.np.zeros(
+                tree.ntargets, tree.particle_id_dtype)
 
-        nfiltered_targets = cl.array.empty(queue, 1, tree.particle_id_dtype)
+        nfiltered_targets = actx.np.zeros(1, tree.particle_id_dtype)
 
         scan_knl, index_knl = self.get_filter_target_lists_in_tree_order_kernels(
                 tree.particle_id_dtype)
@@ -1137,23 +1167,21 @@ def filter_target_lists_in_tree_order(self, queue, tree, flags):
                 filtered_from_unfiltered_target_indices,
                 unfiltered_from_filtered_target_indices,
                 nfiltered_targets,
-                queue=queue)
-
-        nfiltered_targets = int(nfiltered_targets.get().item())
+                queue=actx.queue)
 
+        nfiltered_targets = int(actx.to_numpy(nfiltered_targets).item())
         unfiltered_from_filtered_target_indices = \
                 unfiltered_from_filtered_target_indices[:nfiltered_targets]
 
         from pytools.obj_array import make_obj_array
         filtered_targets = make_obj_array([
-            targets_i.with_queue(queue)[unfiltered_from_filtered_target_indices]
+            actx.thaw(targets_i)[unfiltered_from_filtered_target_indices]
             for targets_i in tree.targets
             ])
 
-        box_target_starts_filtered = \
-                cl.array.empty_like(tree.box_target_starts)
-        box_target_counts_nonchild_filtered = \
-                cl.array.empty_like(tree.box_target_counts_nonchild)
+        box_target_starts_filtered = actx.np.zeros_like(tree.box_target_starts)
+        box_target_counts_nonchild_filtered = (
+                actx.np.zeros_like(tree.box_target_counts_nonchild))
 
         index_knl(
                 # input
@@ -1167,51 +1195,19 @@ def filter_target_lists_in_tree_order(self, queue, tree, flags):
                 box_target_starts_filtered,
                 box_target_counts_nonchild_filtered,
 
-                queue=queue)
+                queue=actx.queue)
 
-        return FilteredTargetListsInTreeOrder(
+        target_lists = FilteredTargetListsInTreeOrder(
                 nfiltered_targets=nfiltered_targets,
                 box_target_starts=box_target_starts_filtered,
                 box_target_counts_nonchild=box_target_counts_nonchild_filtered,
                 unfiltered_from_filtered_target_indices=(
                     unfiltered_from_filtered_target_indices),
                 targets=filtered_targets,
-                ).with_queue(None)
-
-# }}}
-
-
-# {{{ filter_target_lists_in_*_order
-
-def filter_target_lists_in_user_order(queue, tree, flags):
-    """
-    Deprecated. See :meth:`ParticleListFilter.filter_target_lists_in_user_order`.
-    """
-
-    from warnings import warn
-    warn(
-            "filter_target_lists_in_user_order() is deprecated and will go "
-            "away in a future release. Use "
-            "ParticleListFilter.filter_target_lists_in_user_order() instead.",
-            DeprecationWarning, stacklevel=2)
-
-    return (ParticleListFilter(queue.context)
-            .filter_target_lists_in_user_order(queue, tree, flags))
+                )
 
+        return actx.freeze(target_lists)
 
-def filter_target_lists_in_tree_order(queue, tree, flags):
-    """
-    Deprecated. See :meth:`ParticleListFilter.filter_target_lists_in_tree_order`.
-    """
-    from warnings import warn
-    warn(
-            "filter_target_lists_in_tree_order() is deprecated and will go "
-            "away in a future release. Use "
-            "ParticleListFilter.filter_target_lists_in_tree_order() instead.",
-            DeprecationWarning, stacklevel=2)
-
-    return (ParticleListFilter(queue.context)
-            .filter_target_lists_in_tree_order(queue, tree, flags))
 # }}}
 
 # vim: filetype=pyopencl:fdm=marker
diff --git a/test/test_tree_of_boxes.py b/test/test_tree_of_boxes.py
index 33bdefd5..b7e798a3 100644
--- a/test/test_tree_of_boxes.py
+++ b/test/test_tree_of_boxes.py
@@ -228,11 +228,12 @@ def test_traversal_from_tob(actx_factory):
         box_child_ids=actx.from_numpy(tob.box_child_ids),
         box_levels=actx.from_numpy(tob.box_levels),
         box_flags=actx.from_numpy(tob.box_flags),
+        level_start_box_nrs=actx.from_numpy(tob.level_start_box_nrs),
         )
 
     from boxtree.traversal import FMMTraversalBuilder
-    tg = FMMTraversalBuilder(actx.context)
-    _trav, _ = tg(actx.queue, tob)
+    tg = FMMTraversalBuilder(actx)
+    _trav, _ = tg(actx, tob)
 
 # }}}
 

From e6b8c812a77f5822d08659ae0448b3d280033e71 Mon Sep 17 00:00:00 2001
From: Alexandru Fikl <alexfikl@gmail.com>
Date: Wed, 22 Jun 2022 14:28:51 +0300
Subject: [PATCH 09/28] port tools to arraycontext

---
 boxtree/tools.py | 112 +++++++++++++++++++++++++----------------------
 1 file changed, 60 insertions(+), 52 deletions(-)

diff --git a/boxtree/tools.py b/boxtree/tools.py
index 0af213d6..ab4240c6 100644
--- a/boxtree/tools.py
+++ b/boxtree/tools.py
@@ -33,6 +33,8 @@
 from pytools import Record, memoize_method
 from pytools.obj_array import make_obj_array
 
+from boxtree.array_context import PyOpenCLArrayContext
+
 
 # Use offsets in VectorArg by default.
 VectorArg = partial(_VectorArg, with_offset=True)
@@ -46,26 +48,25 @@ def padded_bin(i, nbits):
 
 
 # NOTE: Order of positional args should match GappyCopyAndMapKernel.__call__()
-def realloc_array(queue, allocator, new_shape, ary, zero_fill=False, wait_for=None):
+def realloc_array(actx, new_shape, ary, zero_fill=False, wait_for=None):
     if wait_for is None:
         wait_for = []
 
-    if zero_fill:  # noqa: SIM108
-        array_maker = cl.array.zeros
-    else:
-        array_maker = cl.array.empty
-
-    new_ary = array_maker(queue, shape=new_shape, dtype=ary.dtype,
-                          allocator=allocator)
+    if not zero_fill:
+        from warnings import warn
+        warn("Setting 'zero_fill=False' has no effect and will become an error "
+             "in 2025. Always use 'zero_fill=True'",
+             DeprecationWarning, stacklevel=2)
 
-    evt = cl.enqueue_copy(queue, new_ary.data, ary.data, byte_count=ary.nbytes,
-                          wait_for=wait_for + new_ary.events)
+    new_ary = actx.np.zeros(shape=new_shape, dtype=ary.dtype)
+    evt = cl.enqueue_copy(actx.queue, new_ary.data, ary.data,
+        byte_count=ary.nbytes,
+        wait_for=wait_for + new_ary.events)
 
     return new_ary, evt
 
 
-def reverse_index_array(indices, target_size=None, result_fill_value=None,
-        queue=None):
+def reverse_index_array(actx, indices, target_size=None, result_fill_value=None):
     """For an array of *indices*, return a new array *result* that satisfies
     ``result[indices] == arange(len(indices))``
 
@@ -75,38 +76,34 @@ def reverse_index_array(indices, target_size=None, result_fill_value=None,
         prior to storing reversed indices.
     """
 
-    queue = queue or indices.queue
-
     if target_size is None:
         target_size = len(indices)
 
-    result = cl.array.empty(queue, target_size, indices.dtype)
+    result = actx.np.zeros(target_size, indices.dtype)
 
     if result_fill_value is not None:
         result.fill(result_fill_value)
 
     cl.array.multi_put(
-            [cl.array.arange(queue, len(indices), dtype=indices.dtype,
-                allocator=indices.allocator)],
+            [actx.from_numpy(np.arange(len(indices), dtype=indices.dtype))],
             indices,
             out=[result],
-            queue=queue)
+            queue=actx.queue)
 
     return result
 
 
 # {{{ particle distribution generators
 
-def make_normal_particle_array(queue, nparticles, dims, dtype, seed=15):
-    from pyopencl.clrandom import PhiloxGenerator
-    rng = PhiloxGenerator(queue.context, seed=seed)
-
+def make_normal_particle_array(actx, nparticles, dims, dtype, seed=15):
+    rng = np.random.default_rng(seed)
     return make_obj_array([
-        rng.normal(queue, nparticles, dtype=dtype)
-        for i in range(dims)])
+        actx.from_numpy(rng.standard_normal(nparticles, dtype=dtype))
+        for i in range(dims)
+        ])
 
 
-def make_surface_particle_array(queue, nparticles, dims, dtype, seed=15):
+def make_surface_particle_array(actx, nparticles, dims, dtype, seed=15):
     import loopy as lp
 
     if dims == 2:
@@ -132,7 +129,7 @@ def get_2d_knl(dtype):
 
             return knl.executor(queue.context)
 
-        _evt, result = get_2d_knl(dtype)(queue, n=nparticles)
+        _evt, result = get_2d_knl(dtype)(actx.queue, n=nparticles)
 
         result = [x.ravel() for x in result]
 
@@ -166,7 +163,7 @@ def get_3d_knl(dtype):
 
             return knl.executor(queue.context)
 
-        _evt, result = get_3d_knl(dtype)(queue, n=n)
+        _evt, result = get_3d_knl(dtype)(actx.queue, n=n)
 
         result = [x.ravel() for x in result]
 
@@ -175,7 +172,7 @@ def get_3d_knl(dtype):
         raise NotImplementedError
 
 
-def make_uniform_particle_array(queue, nparticles, dims, dtype, seed=15):
+def make_uniform_particle_array(actx, nparticles, dims, dtype, seed=15):
     import loopy as lp
 
     if dims == 2:
@@ -209,7 +206,7 @@ def get_2d_knl(dtype):
 
             return knl.executor(queue.context)
 
-        _evt, result = get_2d_knl(dtype)(queue, n=n)
+        _evt, result = get_2d_knl(dtype)(actx.queue, n=n)
 
         result = [x.ravel() for x in result]
 
@@ -257,7 +254,7 @@ def get_3d_knl(dtype):
 
             return knl.executor(queue.context)
 
-        _evt, result = get_3d_knl(dtype)(queue, n=n)
+        _evt, result = get_3d_knl(dtype)(actx.queue, n=n)
 
         result = [x.ravel() for x in result]
 
@@ -266,14 +263,14 @@ def get_3d_knl(dtype):
         raise NotImplementedError
 
 
-def make_rotated_uniform_particle_array(queue, nparticles, dims, dtype, seed=15):
+def make_rotated_uniform_particle_array(actx, nparticles, dims, dtype, seed=15):
     raise NotImplementedError
 
 # }}}
 
 
-def particle_array_to_host(parray):
-    return np.array([x.get() for x in parray], order="F").T
+def particle_array_to_host(actx, particles):
+    return np.array([actx.to_numpy(x) for x in particles], order="F").T
 
 
 # {{{ host/device data storage
@@ -458,8 +455,12 @@ def get_type_moniker(dtype):
 
 
 class GappyCopyAndMapKernel:
-    def __init__(self, context):
-        self.context = context
+    def __init__(self, array_context: PyOpenCLArrayContext):
+        self._setup_actx = array_context
+
+    @property
+    def context(self):
+        return self._setup_actx.queue.context
 
     @memoize_method
     def _get_kernel(self, dtype, src_index_dtype, dst_index_dtype,
@@ -497,7 +498,7 @@ def _get_kernel(self, dtype, src_index_dtype, dst_index_dtype,
                 name="gappy_copy_and_map")
 
     # NOTE: Order of positional args should match realloc_array()
-    def __call__(self, queue, allocator, new_shape, ary, src_indices=None,
+    def __call__(self, actx, new_shape, ary, src_indices=None,
                  dst_indices=None, map_values=None, zero_fill=False,
                  wait_for=None, range=None, debug=False):
         """Compresses box info arrays after empty leaf pruning and, optionally,
@@ -519,19 +520,19 @@ def __call__(self, queue, allocator, new_shape, ary, src_indices=None,
             elif have_src_indices:
                 range = slice(src_indices.shape[0])
                 if debug:
-                    assert int(cl.array.max(src_indices).get()) < len(ary)
+                    assert int(actx.to_numpy(actx.np.amax(src_indices))) < len(ary)
             elif have_dst_indices:
                 range = slice(dst_indices.shape[0])
                 if debug:
-                    assert int(cl.array.max(dst_indices).get()) < new_shape
-
-        if zero_fill:  # noqa: SIM108
-            array_maker = cl.array.zeros
-        else:
-            array_maker = cl.array.empty
+                    assert int(actx.to_numpy(actx.np.amax(dst_indices))) < new_shape
 
-        result = array_maker(queue, new_shape, ary.dtype, allocator=allocator)
+        if not zero_fill:
+            from warnings import warn
+            warn("Setting 'zero_fill=False' has no effect and will become an error "
+                "in 2025. Always use 'zero_fill=True'",
+                DeprecationWarning, stacklevel=2)
 
+        result = actx.np.zeros(new_shape, ary.dtype)
         kernel = self._get_kernel(ary.dtype,
                                   src_indices.dtype if have_src_indices else None,
                                   dst_indices.dtype if have_dst_indices else None,
@@ -544,7 +545,7 @@ def __call__(self, queue, allocator, new_shape, ary, src_indices=None,
         args += (dst_indices,) if have_dst_indices else ()
         args += (map_values,) if have_map_values else ()
 
-        evt = kernel(*args, queue=queue, range=range, wait_for=wait_for)
+        evt = kernel(*args, queue=actx.queue, range=range, wait_for=wait_for)
 
         return result, evt
 
@@ -569,9 +570,12 @@ def __call__(self, queue, allocator, new_shape, ary, src_indices=None,
 
 
 class MapValuesKernel:
+    def __init__(self, array_context: PyOpenCLArrayContext):
+        self._setup_actx = array_context
 
-    def __init__(self, context):
-        self.context = context
+    @property
+    def context(self):
+        return self._setup_actx.queue.context
 
     @memoize_method
     def _get_kernel(self, dst_dtype, src_dtype):
@@ -685,8 +689,12 @@ class MaskCompressorKernel:
     """
     .. automethod:: __call__
     """
-    def __init__(self, context):
-        self.context = context
+    def __init__(self, array_context: PyOpenCLArrayContext):
+        self._setup_actx = array_context
+
+    @property
+    def context(self):
+        return self._setup_actx.context
 
     @memoize_method
     def get_list_compressor_kernel(self, mask_dtype, list_dtype):
@@ -717,7 +725,7 @@ def get_matrix_compressor_kernel(self, mask_dtype, list_dtype):
                 ],
                 name_prefix="compress_matrix")
 
-    def __call__(self, queue, mask, list_dtype=None):
+    def __call__(self, actx, mask, list_dtype=None):
         """Convert a mask to a list in :ref:`csr` format.
 
         :arg mask: Either a 1D or 2D array.
@@ -739,7 +747,7 @@ def __call__(self, queue, mask, list_dtype=None):
 
         if len(mask.shape) == 1:
             knl = self.get_list_compressor_kernel(mask.dtype, list_dtype)
-            result, evt = knl(queue, mask.shape[0], mask.data)
+            result, evt = knl(actx.queue, mask.shape[0], mask.data)
             return (result["output"].lists, evt)
         elif len(mask.shape) == 2:
             # FIXME: This is efficient for small column sizes but may not be
@@ -747,7 +755,7 @@ def __call__(self, queue, mask, list_dtype=None):
             knl = self.get_matrix_compressor_kernel(mask.dtype, list_dtype)
             size = mask.dtype.itemsize
             assert size > 0
-            result, evt = knl(queue, mask.shape[0], mask.shape[1],
+            result, evt = knl(actx.queue, mask.shape[0], mask.shape[1],
                               mask.strides[0] // size, mask.strides[1] // size,
                               mask.data)
             return (result["output"].starts, result["output"].lists, evt)

From c0d02d403c796506c3d9238cecd5075431c7728c Mon Sep 17 00:00:00 2001
From: Alexandru Fikl <alexfikl@gmail.com>
Date: Wed, 22 Jun 2022 14:29:05 +0300
Subject: [PATCH 10/28] port tree_build to arraycontext

---
 boxtree/tree_build.py | 392 ++++++++++++++++++++++--------------------
 1 file changed, 207 insertions(+), 185 deletions(-)

diff --git a/boxtree/tree_build.py b/boxtree/tree_build.py
index f16c1221..d91f1e89 100644
--- a/boxtree/tree_build.py
+++ b/boxtree/tree_build.py
@@ -42,17 +42,15 @@
 THE SOFTWARE.
 """
 
-
 import logging
 from functools import partial
 from itertools import pairwise
 
 import numpy as np
 
-import pyopencl as cl
-import pyopencl.array
 from pytools import DebugProcessLogger, ProcessLogger, memoize_method
 
+from boxtree.array_context import PyOpenCLArrayContext
 from boxtree.tree import Tree
 
 
@@ -71,26 +69,26 @@ class TreeBuilder:
     .. automethod:: __call__
     """
 
-    def __init__(self, context):
-        """
-        :arg context: A :class:`pyopencl.Context`.
-        """
+    morton_nr_dtype = np.dtype(np.int8)
+    box_level_dtype = np.dtype(np.uint8)
+    ROOT_EXTENT_STRETCH_FACTOR = 1e-4
 
-        self.context = context
+    def __init__(self, array_context: PyOpenCLArrayContext) -> None:
+        self._setup_actx = array_context
 
         from boxtree.bounding_box import BoundingBoxFinder
-        self.bbox_finder = BoundingBoxFinder(self.context)
+        self.bbox_finder = BoundingBoxFinder(array_context)
 
         # This is used to map box IDs and compress box lists in empty leaf
         # pruning.
 
         from boxtree.tools import GappyCopyAndMapKernel, MapValuesKernel
-        self.gappy_copy_and_map = GappyCopyAndMapKernel(self.context)
-        self.map_values_kernel = MapValuesKernel(self.context)
+        self.gappy_copy_and_map = GappyCopyAndMapKernel(array_context)
+        self.map_values_kernel = MapValuesKernel(array_context)
 
-    morton_nr_dtype = np.dtype(np.int8)
-    box_level_dtype = np.dtype(np.uint8)
-    ROOT_EXTENT_STRETCH_FACTOR = 1e-4
+    @property
+    def context(self):
+        return self._setup_actx.queue.context
 
     @memoize_method
     def get_kernel_info(self, dimensions, coord_dtype,
@@ -107,7 +105,7 @@ def get_kernel_info(self, dimensions, coord_dtype,
 
     # {{{ run control
 
-    def __call__(self, queue, particles, kind="adaptive",
+    def __call__(self, actx: PyOpenCLArrayContext, particles, kind="adaptive",
             max_particles_in_box=None, allocator=None, debug=False,
             targets=None, source_radii=None, target_radii=None,
             stick_out_factor=None, refine_weights=None,
@@ -115,7 +113,6 @@ def __call__(self, queue, particles, kind="adaptive",
             extent_norm=None, bbox=None,
             **kwargs):
         """
-        :arg queue: a :class:`pyopencl.CommandQueue` instance
         :arg particles: an object array of (XYZ) point coordinate arrays.
         :arg kind: One of the following strings:
 
@@ -129,15 +126,14 @@ def __call__(self, queue, particles, kind="adaptive",
         :arg targets: an object array of (XYZ) point coordinate arrays or ``None``.
             If ``None``, *particles* act as targets, too.
             Must have the same (inner) dtype as *particles*.
-        :arg source_radii: If not *None*, a :class:`pyopencl.array.Array` of the
-            same dtype as *particles*.
+        :arg source_radii: If not *None*, an array of the same dtype as *particles*.
 
             If this is given, *targets* must also be given, i.e. sources and
             targets must be separate. See :ref:`extent`.
 
         :arg target_radii: Like *source_radii*, but for targets.
         :arg stick_out_factor: See :attr:`Tree.stick_out_factor` and :ref:`extent`.
-        :arg refine_weights: If not *None*, a :class:`pyopencl.array.Array` of the
+        :arg refine_weights: If not *None*, an array of the
             type :class:`numpy.int32`. A box will be split if it has a cumulative
             refine_weight greater than *max_leaf_refine_weight*. If this is given,
             *max_leaf_refine_weight* must also be given and *max_particles_in_box*
@@ -170,6 +166,12 @@ def __call__(self, queue, particles, kind="adaptive",
             management.
         """
 
+        if allocator is not None:
+            from warnings import warn
+            warn("Passing in 'allocator' is deprecated. The allocator of the "
+                "array context 'actx' is used throughout.",
+                DeprecationWarning, stacklevel=2)
+
         # {{{ input processing
 
         if kind not in ["adaptive", "adaptive-level-restricted", "non-adaptive"]:
@@ -241,19 +243,21 @@ def __call__(self, queue, particles, kind="adaptive",
 
         # }}}
 
-        empty = partial(cl.array.empty, queue, allocator=allocator)
-
         def zeros(shape, dtype):
-            result = cl.array.zeros(queue, shape, dtype, allocator=allocator)
+            result = actx.np.zeros(shape, dtype)
+
             if result.events:
                 event, = result.events
             else:
                 from numbers import Number
                 if isinstance(shape, Number):
                     shape = (shape,)
+
                 from pytools import product
                 assert product(shape) == 0
-                event = cl.enqueue_marker(queue)
+
+                from pyopencl import enqueue_marker
+                event = enqueue_marker(actx.queue)
 
             return result, event
 
@@ -277,7 +281,7 @@ def zeros(shape, dtype):
             else:
                 from pytools.obj_array import make_obj_array
                 srcntgts = make_obj_array([
-                    p.with_queue(queue).copy() for p in particles
+                    actx.np.copy(actx.thaw(p)) for p in particles
                     ])
 
             assert source_radii is None
@@ -301,7 +305,7 @@ def zeros(shape, dtype):
             def combine_srcntgt_arrays(ary1, ary2=None):
                 dtype = ary1.dtype if ary2 is None else ary2.dtype
 
-                result = empty(nsrcntgts, dtype)
+                result = actx.np.zeros(nsrcntgts, dtype)
                 if (ary1 is None) or (ary2 is None):
                     result.fill(0)
 
@@ -329,8 +333,9 @@ def combine_srcntgt_arrays(ary1, ary2=None):
 
         del particles
 
-        user_srcntgt_ids = cl.array.arange(queue, nsrcntgts, dtype=particle_id_dtype,
-                allocator=allocator)
+        user_srcntgt_ids = actx.from_numpy(
+            np.arange(nsrcntgts, dtype=particle_id_dtype)
+            )
 
         evt, = user_srcntgt_ids.events
         wait_for.append(evt)
@@ -353,28 +358,31 @@ def combine_srcntgt_arrays(ary1, ary2=None):
             raise ValueError("must specify either max_particles_in_box or "
                     "refine_weights/max_leaf_refine_weight")
         elif specified_max_particles_in_box:
-            refine_weights = (
-                cl.array.empty(
-                    queue, nsrcntgts, refine_weight_dtype, allocator=allocator)
-                .fill(1))
-            event, = refine_weights.events
-            prep_events.append(event)
+            refine_weights = actx.np.zeros(nsrcntgts, refine_weight_dtype)
+            refine_weights.fill(1)
+
+            prep_events.extend(refine_weights.events)
             max_leaf_refine_weight = max_particles_in_box
         elif specified_refine_weights:  # noqa: SIM102
             if refine_weights.dtype != refine_weight_dtype:
                 raise TypeError(
                         f"refine_weights must have dtype '{refine_weight_dtype}'")
 
-        if max_leaf_refine_weight < cl.array.max(refine_weights).get():
+        if max_leaf_refine_weight <= 0:
+            raise ValueError("max_leaf_refine_weight must be positive")
+
+        max_refine_weights = actx.to_numpy(actx.np.amax(refine_weights))
+        if max_leaf_refine_weight < max_refine_weights:
             raise ValueError(
                     "entries of refine_weights cannot exceed max_leaf_refine_weight")
-        if cl.array.min(refine_weights).get() < 0:
+
+        min_refine_weights = actx.to_numpy(actx.np.amin(refine_weights))
+        if min_refine_weights < 0:
             raise ValueError("all entries of refine_weights must be nonnegative")
-        if max_leaf_refine_weight <= 0:
-            raise ValueError("max_leaf_refine_weight must be positive")
 
-        total_refine_weight = cl.array.sum(
-                refine_weights, dtype=np.dtype(np.int64)).get()
+        total_refine_weight = actx.to_numpy(
+            actx.np.sum(refine_weights, dtype=np.dtype(np.int64))
+            )
 
         del max_particles_in_box
         del specified_max_particles_in_box
@@ -384,10 +392,12 @@ def combine_srcntgt_arrays(ary1, ary2=None):
 
         # {{{ find and process bounding box
 
-        if bbox is None:
-            bbox, _ = self.bbox_finder(srcntgts, srcntgt_radii, wait_for=wait_for)
-            bbox = bbox.get()
+        bbox_auto, _ = self.bbox_finder(
+                actx, srcntgts, srcntgt_radii, wait_for=wait_for)
+        bbox_auto = actx.to_numpy(bbox_auto)
 
+        if bbox is None:
+            bbox = bbox_auto
             root_extent = max(
                 bbox["max_"+ax] - bbox["min_"+ax]
                 for ax in axis_names) * (1+TreeBuilder.ROOT_EXTENT_STRETCH_FACTOR)
@@ -403,10 +413,6 @@ def combine_srcntgt_arrays(ary1, ary2=None):
                 bbox["max_"+ax] = bbox_max[i]
         else:
             # Validate that bbox is a superset of particle-derived bbox
-            bbox_auto, _ = self.bbox_finder(
-                    srcntgts, srcntgt_radii, wait_for=wait_for)
-            bbox_auto = bbox_auto.get()
-
             # Convert unstructured numpy array to bbox_type
             if isinstance(bbox, np.ndarray):
                 if len(bbox) == dimensions:
@@ -447,11 +453,12 @@ def combine_srcntgt_arrays(ary1, ary2=None):
 
         # box-local morton bin counts for each particle at the current level
         # only valid from scan -> split'n'sort
-        morton_bin_counts = empty(nsrcntgts, dtype=knl_info.morton_bin_count_dtype)
+        morton_bin_counts = actx.np.zeros(
+            nsrcntgts, dtype=knl_info.morton_bin_count_dtype)
 
         # (local) morton nrs for each particle at the current level
         # only valid from scan -> split'n'sort
-        morton_nrs = empty(nsrcntgts, dtype=self.morton_nr_dtype)
+        morton_nrs = actx.np.zeros(nsrcntgts, dtype=self.morton_nr_dtype)
 
         # 0/1 segment flags
         # invariant to sorting once set
@@ -528,8 +535,7 @@ def combine_srcntgt_arrays(ary1, ary2=None):
         prep_events.append(evt)
 
         # Initialize box 0 to contain all particles
-        box_srcntgt_counts_cumul[0].fill(
-                nsrcntgts, queue=queue, wait_for=[evt])
+        box_srcntgt_counts_cumul[0].fill(nsrcntgts, queue=actx.queue, wait_for=[evt])
 
         # box -> whether the box has a child. FIXME: use smaller integer type
         box_has_children, evt = zeros(nboxes_guess, dtype=np.dtype(np.int32))
@@ -543,8 +549,10 @@ def combine_srcntgt_arrays(ary1, ary2=None):
         prep_events.append(evt)
 
         # set parent of root box to itself
-        evt = cl.enqueue_copy(
-                queue, box_parent_ids.data, np.zeros((), dtype=box_parent_ids.dtype))
+        from pyopencl import enqueue_copy
+        evt = enqueue_copy(
+                actx.queue, box_parent_ids.data,
+                np.zeros((), dtype=box_parent_ids.dtype))
         prep_events.append(evt)
 
         # 2*(num bits in the significand)
@@ -562,9 +570,9 @@ def combine_srcntgt_arrays(ary1, ary2=None):
 
         # }}}
 
-        def fin_debug(s):
+        def debug_with_finish(s):
             if debug:
-                queue.finish()
+                actx.queue.finish()
 
             logger.debug(s)
 
@@ -625,6 +633,7 @@ def fin_debug(s):
         # regarding this). This flag is set to True when that happens.
         final_level_restrict_iteration = False
 
+        from pyopencl import wait_for_events
         while level:
             if debug:
                 # More invariants:
@@ -652,7 +661,7 @@ def fin_debug(s):
                     + ((srcntgt_radii,) if srcntgts_have_extent else ())
                     )
 
-            fin_debug("morton count scan")
+            debug_with_finish("morton count scan")
 
             morton_count_args = common_args
             if srcntgts_have_extent:
@@ -660,11 +669,11 @@ def fin_debug(s):
 
             # writes: box_morton_bin_counts
             evt = knl_info.morton_count_scan(
-                    *morton_count_args, queue=queue, size=nsrcntgts,
+                    *morton_count_args, queue=actx.queue, size=nsrcntgts,
                     wait_for=wait_for)
             wait_for = [evt]
 
-            fin_debug("split box id scan")
+            debug_with_finish("split box id scan")
 
             # writes: box_has_children, split_box_ids
             evt = knl_info.split_box_id_scan(
@@ -684,7 +693,7 @@ def fin_debug(s):
                     split_box_ids,
                     have_oversize_split_box,
 
-                    queue=queue,
+                    queue=actx.queue,
                     size=level_start_box_nrs[level],
                     wait_for=wait_for)
             wait_for = [evt]
@@ -698,7 +707,7 @@ def fin_debug(s):
                 last_box_on_prev_level = level_start_box_id - 1
                 new_level_used_box_counts.append(
                     # FIXME: Get this all at once.
-                    int(split_box_ids[last_box_on_prev_level].get())
+                    int(actx.to_numpy(split_box_ids[last_box_on_prev_level]))
                     - level_start_box_id)
 
             # New leaf count =
@@ -743,7 +752,7 @@ def fin_debug(s):
             # have_oversize_split_box = 0), then we do not need to allocate any
             # extra space, since no new leaves can be created at the bottom
             # level.
-            if knl_info.level_restrict and have_oversize_split_box.get():
+            if knl_info.level_restrict and actx.to_numpy(have_oversize_split_box):
                 # Currently undocumented.
                 lr_lookbehind_levels = kwargs.get("lr_lookbehind", 1)
                 minimal_new_level_length += sum(
@@ -792,18 +801,17 @@ def fin_debug(s):
 
                 old_box_count = level_start_box_nrs[-1]
                 # Where should I put this box?
-                dst_box_id = cl.array.empty(queue,
-                        shape=old_box_count, dtype=box_id_dtype)
+                dst_box_id = actx.np.zeros(shape=old_box_count, dtype=box_id_dtype)
 
                 for level_start, new_level_start, level_len in zip(
                         level_start_box_nrs[:-1],
                         new_level_start_box_nrs[:-1],
                         curr_upper_level_lengths, strict=True):
-                    dst_box_id[level_start:level_start + level_len] = \
-                            cl.array.arange(queue,
-                                            new_level_start,
-                                            new_level_start + level_len,
-                                            dtype=box_id_dtype)
+                    dst_box_id[level_start:level_start+level_len] = actx.from_numpy(
+                        np.arange(new_level_start,
+                                  new_level_start + level_len,
+                                  dtype=box_id_dtype)
+                        )
 
                 wait_for.extend(dst_box_id.events)
 
@@ -843,28 +851,27 @@ def fin_debug(s):
             # {{{ reallocate and/or renumber boxes if necessary
 
             if level_start_box_nrs_updated or nboxes_new > nboxes_guess:
-                fin_debug("starting nboxes_guess increase")
+                debug_with_finish("starting nboxes_guess increase")
 
                 while nboxes_guess < nboxes_new:
                     nboxes_guess *= 2
 
                 def my_realloc_nocopy(ary, shape=nboxes_guess):
-                    return cl.array.empty(queue, allocator=allocator,
-                            shape=shape, dtype=ary.dtype)
+                    return actx.np.zeros(shape=shape, dtype=ary.dtype)
 
                 def my_realloc_zeros_nocopy(ary, shape=nboxes_guess):
-                    result = cl.array.zeros(queue, allocator=allocator,
-                            shape=shape, dtype=ary.dtype)
+                    result = actx.np.zeros(shape=shape, dtype=ary.dtype)
                     return result, result.events[0]
 
-                my_realloc = partial(realloc_array,
-                        queue, allocator, nboxes_guess, wait_for=wait_for)
-                my_realloc_zeros = partial(realloc_array,
-                        queue, allocator, nboxes_guess, zero_fill=True,
-                        wait_for=wait_for)
-                my_realloc_zeros_and_renumber = partial(realloc_and_renumber_array,
-                        queue, allocator, nboxes_guess, zero_fill=True,
-                        wait_for=wait_for)
+                my_realloc = partial(
+                    realloc_array,
+                    actx, nboxes_guess, wait_for=wait_for)
+                my_realloc_zeros = partial(
+                    realloc_array,
+                    actx, nboxes_guess, zero_fill=True, wait_for=wait_for)
+                my_realloc_zeros_and_renumber = partial(
+                    realloc_and_renumber_array,
+                    actx, nboxes_guess, zero_fill=True, wait_for=wait_for)
 
                 resize_events = []
 
@@ -875,8 +882,7 @@ def my_realloc_zeros_nocopy(ary, shape=nboxes_guess):
                 # only the box morton bin counts of boxes on the level
                 # currently being processed are written-but we need to
                 # retain the box morton bin counts from the higher levels.
-                box_morton_bin_counts, evt = my_realloc_zeros(
-                        box_morton_bin_counts)
+                box_morton_bin_counts, evt = my_realloc_zeros(box_morton_bin_counts)
                 resize_events.append(evt)
 
                 # force_split_box is unused unless level restriction is enabled.
@@ -911,7 +917,7 @@ def my_realloc_zeros_nocopy(ary, shape=nboxes_guess):
                     resize_events.append(evt)
                 else:
                     box_levels, evt = my_realloc_zeros_nocopy(box_levels)
-                    cl.wait_for_events([evt])
+                    wait_for_events([evt])
                     for box_level, (level_start, level_end) in enumerate(
                             pairwise(level_start_box_nrs)):
                         box_levels[level_start:level_end].fill(box_level)
@@ -977,9 +983,11 @@ def my_realloc_zeros_nocopy(ary, shape=nboxes_guess):
                     if level_nboxes == 0:
                         assert leaf_count == 0
                         continue
-                    nleaves_actual = level_nboxes - int(
-                        cl.array.sum(box_has_children[
-                            level_start:level_start + level_nboxes]).get())
+                    nleaves_actual = level_nboxes - int(actx.to_numpy(
+                        actx.np.sum(
+                            box_has_children[level_start:level_start + level_nboxes]
+                            )
+                        ))
                     assert leaf_count == nleaves_actual
 
             # Can't del in Py2.7 - see note below
@@ -1006,7 +1014,7 @@ def my_realloc_zeros_nocopy(ary, shape=nboxes_guess):
 
             wait_for = [evt]
 
-            fin_debug("box splitter")
+            debug_with_finish("box splitter")
 
             # Mark the levels of boxes added for padding (these were not updated
             # by the box splitter kernel).
@@ -1017,20 +1025,20 @@ def my_realloc_zeros_nocopy(ary, shape=nboxes_guess):
 
             if debug:
                 box_levels.finish()
-                level_bl_chunk = box_levels.get()[
+                level_bl_chunk = actx.to_numpy(box_levels)[
                         level_start_box_nrs[-2]:level_start_box_nrs[-1]]
-                assert (level_bl_chunk == level).all()
+                assert np.all(level_bl_chunk == level)
                 del level_bl_chunk
 
             if debug:
-                assert (box_srcntgt_starts.get() < nsrcntgts).all()
+                assert np.all(actx.to_numpy(box_srcntgt_starts) < nsrcntgts)
 
             # }}}
 
             # {{{ renumber particles within split boxes
 
-            new_user_srcntgt_ids = cl.array.empty_like(user_srcntgt_ids)
-            new_srcntgt_box_ids = cl.array.empty_like(srcntgt_box_ids)
+            new_user_srcntgt_ids = actx.np.zeros_like(user_srcntgt_ids)
+            new_srcntgt_box_ids = actx.np.zeros_like(srcntgt_box_ids)
 
             particle_renumberer_args = (
                 *common_args,
@@ -1044,7 +1052,7 @@ def my_realloc_zeros_nocopy(ary, shape=nboxes_guess):
 
             wait_for = [evt]
 
-            fin_debug("particle renumbering")
+            debug_with_finish("particle renumbering")
 
             user_srcntgt_ids = new_user_srcntgt_ids
             del new_user_srcntgt_ids
@@ -1066,7 +1074,7 @@ def my_realloc_zeros_nocopy(ary, shape=nboxes_guess):
                 # reallocation code. In order to fix this issue, the box
                 # numbering and reallocation code needs to be accessible after
                 # the final level restriction is done.
-                assert int(have_oversize_split_box.get()) == 0
+                assert int(actx.to_numpy(have_oversize_split_box)) == 0
                 assert level_used_box_counts[-1] == 0
                 del level_used_box_counts[-1]
                 del level_start_box_nrs[-1]
@@ -1123,10 +1131,11 @@ def my_realloc_zeros_nocopy(ary, shape=nboxes_guess):
 
                     if debug:
                         force_split_box.finish()
-                        boxes_split.append(int(cl.array.sum(
-                            force_split_box[upper_level_slice]).get()))
+                        boxes_split.append(int(actx.to_numpy(
+                            actx.np.sum(force_split_box[upper_level_slice])
+                            )))
 
-                    if int(have_upper_level_split_box.get()) == 0:
+                    if int(actx.to_numpy(have_upper_level_split_box)) == 0:
                         break
 
                     did_upper_level_split = True
@@ -1141,7 +1150,8 @@ def my_realloc_zeros_nocopy(ary, shape=nboxes_guess):
                         logger.debug("level %d: %d boxes split", level_, nboxes_split)
                     del boxes_split
 
-                if int(have_oversize_split_box.get()) == 0 and did_upper_level_split:
+                if (int(actx.to_numpy(have_oversize_split_box)) == 0
+                        and did_upper_level_split):
                     # We are in the situation where there are boxes left to
                     # split on upper levels, and the level loop is done creating
                     # lower levels.
@@ -1154,7 +1164,7 @@ def my_realloc_zeros_nocopy(ary, shape=nboxes_guess):
 
             # }}}
 
-            if not int(have_oversize_split_box.get()):
+            if not int(actx.to_numpy(have_oversize_split_box)):
                 logger.debug("no boxes left to split")
                 break
 
@@ -1164,9 +1174,11 @@ def my_realloc_zeros_nocopy(ary, shape=nboxes_guess):
             # {{{ check that nonchild part of box_morton_bin_counts is consistent
 
             if debug and 0:
-                h_box_morton_bin_counts = box_morton_bin_counts.get()
-                h_box_srcntgt_counts_cumul = box_srcntgt_counts_cumul.get()
-                h_box_child_ids = tuple(bci.get() for bci in box_child_ids)
+                h_box_morton_bin_counts = actx.to_numpy(box_morton_bin_counts)
+                h_box_srcntgt_counts_cumul = actx.to_numpy(box_srcntgt_counts_cumul)
+                h_box_child_ids = tuple(
+                    actx.to_numpy(bci) for bci in box_child_ids
+                    )
 
                 has_mismatch = False
                 for ibox in range(level_start_box_nrs[-1]):
@@ -1213,8 +1225,8 @@ def my_realloc_zeros_nocopy(ary, shape=nboxes_guess):
         # {{{ extract number of non-child srcntgts from box morton counts
 
         if srcntgts_have_extent:
-            box_srcntgt_counts_nonchild = empty(nboxes, particle_id_dtype)
-            fin_debug("extract non-child srcntgt count")
+            box_srcntgt_counts_nonchild = actx.np.zeros(nboxes, particle_id_dtype)
+            debug_with_finish("extract non-child srcntgt count")
 
             assert len(level_start_box_nrs) >= 2
             highest_possibly_split_box_nr = level_start_box_nrs[-2]
@@ -1234,11 +1246,13 @@ def my_realloc_zeros_nocopy(ary, shape=nboxes_guess):
             del highest_possibly_split_box_nr
 
             if debug:
-                h_box_srcntgt_counts_nonchild = box_srcntgt_counts_nonchild.get()
-                h_box_srcntgt_counts_cumul = box_srcntgt_counts_cumul.get()
+                h_box_srcntgt_counts_nonchild = (
+                    actx.to_numpy(box_srcntgt_counts_nonchild))
+                h_box_srcntgt_counts_cumul = actx.to_numpy(box_srcntgt_counts_cumul)
 
-                assert (h_box_srcntgt_counts_nonchild
-                        <= h_box_srcntgt_counts_cumul[:nboxes]).all()
+                assert np.all(
+                    h_box_srcntgt_counts_nonchild
+                    <= h_box_srcntgt_counts_cumul[:nboxes])
 
                 del h_box_srcntgt_counts_nonchild
 
@@ -1256,7 +1270,7 @@ def my_realloc_zeros_nocopy(ary, shape=nboxes_guess):
 
         if prune_empty_leaves:
             # What is the original index of this box?
-            src_box_id = empty(nboxes, box_id_dtype)
+            src_box_id = actx.np.zeros(nboxes, box_id_dtype)
 
             # Where should I put this box?
             #
@@ -1265,37 +1279,39 @@ def my_realloc_zeros_nocopy(ary, shape=nboxes_guess):
             dst_box_id, evt = zeros(nboxes, box_id_dtype)
             wait_for.append(evt)
 
-            fin_debug("find prune indices")
+            debug_with_finish("find prune indices")
 
-            nboxes_post_prune_dev = empty((), dtype=box_id_dtype)
+            nboxes_post_prune_dev = actx.np.zeros((), dtype=box_id_dtype)
             evt = knl_info.find_prune_indices_kernel(
                     box_srcntgt_counts_cumul,
                     src_box_id, dst_box_id, nboxes_post_prune_dev,
                     size=nboxes, wait_for=wait_for)
             wait_for = [evt]
-            nboxes_post_prune = int(nboxes_post_prune_dev.get())
+            nboxes_post_prune = int(actx.to_numpy(nboxes_post_prune_dev))
             logger.debug("%d boxes after pruning "
-                        "(%d empty leaves and/or unused boxes removed)",
-                        nboxes_post_prune, nboxes - nboxes_post_prune)
+                         "(%d empty leaves and/or unused boxes removed)",
+                         nboxes_post_prune, nboxes - nboxes_post_prune)
             should_prune = True
         elif knl_info.level_restrict:
             # Remove unused boxes from the tree.
-            src_box_id = empty(nboxes, box_id_dtype)
-            dst_box_id = empty(nboxes, box_id_dtype)
+            src_box_id = actx.np.zeros(nboxes, box_id_dtype)
+            dst_box_id = actx.np.zeros(nboxes, box_id_dtype)
 
-            new_level_start_box_nrs = np.empty_like(level_start_box_nrs)
+            new_level_start_box_nrs = np.zeros_like(level_start_box_nrs)
             new_level_start_box_nrs[0] = 0
             new_level_start_box_nrs[1:] = np.cumsum(level_used_box_counts)
             for level_start, new_level_start, level_used_box_count in zip(
                     level_start_box_nrs[:-1],
                     new_level_start_box_nrs[:-1],
                     level_used_box_counts, strict=True):
+
                 def make_slice(start, offset=level_used_box_count):
                     return slice(start, start + offset)
 
                 def make_arange(start, offset=level_used_box_count):
-                    return cl.array.arange(
-                            queue, start, start + offset, dtype=box_id_dtype)
+                    return actx.from_numpy(
+                        np.arange(start, start + offset, dtype=box_id_dtype)
+                        )
 
                 src_box_id[make_slice(new_level_start)] = make_arange(level_start)
                 dst_box_id[make_slice(level_start)] = make_arange(new_level_start)
@@ -1313,7 +1329,7 @@ def make_arange(start, offset=level_used_box_count):
             prune_events = []
 
             prune_empty = partial(self.gappy_copy_and_map,
-                    queue, allocator, nboxes_post_prune,
+                    actx, nboxes_post_prune,
                     src_indices=src_box_id,
                     range=slice(nboxes_post_prune), debug=debug)
 
@@ -1324,7 +1340,7 @@ def make_arange(start, offset=level_used_box_count):
             prune_events.append(evt)
 
             if debug and prune_empty_leaves:
-                assert (box_srcntgt_counts_cumul.get() > 0).all()
+                assert np.all(actx.to_numpy(box_srcntgt_counts_cumul) > 0)
 
             srcntgt_box_ids, evt = self.map_values_kernel(
                     dst_box_id, srcntgt_box_ids)
@@ -1358,10 +1374,11 @@ def make_arange(start, offset=level_used_box_count):
 
             evt = knl_info.find_level_box_counts_kernel(
                 box_levels, level_used_box_counts_dev)
-            cl.wait_for_events([evt])
+            wait_for_events([evt])
 
             nlevels = len(level_used_box_counts)
-            level_used_box_counts = level_used_box_counts_dev[:nlevels].get()
+            level_used_box_counts = (
+                actx.to_numpy(level_used_box_counts_dev[:nlevels]))
 
             level_start_box_nrs = [0]
             level_start_box_nrs.extend(np.cumsum(level_used_box_counts))
@@ -1386,7 +1403,7 @@ def make_arange(start, offset=level_used_box_count):
         if targets is None:
             from boxtree.tools import reverse_index_array
             user_source_ids = user_srcntgt_ids
-            sorted_target_ids = reverse_index_array(user_srcntgt_ids)
+            sorted_target_ids = reverse_index_array(actx, user_srcntgt_ids)
 
             box_source_starts = box_target_starts = box_srcntgt_starts
             box_source_counts_cumul = box_target_counts_cumul = \
@@ -1395,18 +1412,18 @@ def make_arange(start, offset=level_used_box_count):
                 box_source_counts_nonchild = box_target_counts_nonchild = \
                         box_srcntgt_counts_nonchild
         else:
-            source_numbers = empty(nsrcntgts, particle_id_dtype)
+            source_numbers = actx.np.zeros(nsrcntgts, particle_id_dtype)
 
-            fin_debug("source counter")
+            debug_with_finish("source counter")
             evt = knl_info.source_counter(user_srcntgt_ids, nsources,
-                    source_numbers, queue=queue, allocator=allocator,
+                    source_numbers, queue=actx.queue, allocator=actx.allocator,
                     wait_for=wait_for)
             wait_for = [evt]
 
-            user_source_ids = empty(nsources, particle_id_dtype)
+            user_source_ids = actx.np.zeros(nsources, particle_id_dtype)
             # srcntgt_target_ids is temporary until particle permutation is done
-            srcntgt_target_ids = empty(ntargets, particle_id_dtype)
-            sorted_target_ids = empty(ntargets, particle_id_dtype)
+            srcntgt_target_ids = actx.np.zeros(ntargets, particle_id_dtype)
+            sorted_target_ids = actx.np.zeros(ntargets, particle_id_dtype)
 
             # need to use zeros because parent boxes won't be initialized
             box_source_starts, evt = zeros(nboxes_post_prune, particle_id_dtype)
@@ -1429,7 +1446,7 @@ def make_arange(start, offset=level_used_box_count):
                         nboxes_post_prune, particle_id_dtype)
                 wait_for.append(evt)
 
-            fin_debug("source and target index finder")
+            debug_with_finish("source and target index finder")
             evt = knl_info.source_and_target_index_finder(*(
                 # input:
                 (
@@ -1453,31 +1470,32 @@ def make_arange(start, offset=level_used_box_count):
                     box_target_counts_nonchild,  # pylint: disable=possibly-used-before-assignment
                     ) if srcntgts_have_extent else ())
                 ),
-                queue=queue, range=slice(nsrcntgts),
+                queue=actx.queue, range=slice(nsrcntgts),
                 wait_for=wait_for)
             wait_for = [evt]
 
             if srcntgts_have_extent:  # noqa: SIM102
                 if debug:
-                    assert (
-                            box_srcntgt_counts_nonchild.get()
-                            == (box_source_counts_nonchild
-                                + box_target_counts_nonchild).get()).all()
+                    assert np.all(actx.to_numpy(
+                        box_srcntgt_counts_nonchild
+                        == (box_source_counts_nonchild + box_target_counts_nonchild)
+                        ))
 
             if debug:
-                usi_host = user_source_ids.get()
-                assert (usi_host < nsources).all()
-                assert (usi_host >= 0).all()
+                usi_host = actx.to_numpy(user_source_ids)
+                assert np.all(usi_host < nsources)
+                assert np.all(usi_host >= 0)
                 del usi_host
 
-                sti_host = srcntgt_target_ids.get()
-                assert (sti_host < nsources+ntargets).all()
-                assert (nsources <= sti_host).all()
+                sti_host = actx.to_numpy(srcntgt_target_ids)
+                assert np.all(sti_host < nsources+ntargets)
+                assert np.all(nsources <= sti_host)
                 del sti_host
 
-                assert (box_source_counts_cumul.get()
-                        + box_target_counts_cumul.get()
-                        == box_srcntgt_counts_cumul.get()).all()
+                assert np.all(actx.to_numpy(
+                    box_source_counts_cumul + box_target_counts_cumul
+                    == box_srcntgt_counts_cumul
+                    ))
 
             del source_numbers
 
@@ -1490,10 +1508,9 @@ def make_arange(start, offset=level_used_box_count):
         # {{{ permute and source/target-split (if necessary) particle array
 
         if targets is None:
-            sources = targets = make_obj_array([
-                cl.array.empty_like(pt) for pt in srcntgts])
+            sources = targets = actx.np.zeros_like(srcntgts)
 
-            fin_debug("srcntgt permuter (particles)")
+            debug_with_finish("srcntgt permuter (particles)")
             evt = knl_info.srcntgt_permuter(
                     user_srcntgt_ids,
                     *(tuple(srcntgts) + tuple(sources)),
@@ -1504,34 +1521,37 @@ def make_arange(start, offset=level_used_box_count):
 
         else:
             sources = make_obj_array([
-                empty(nsources, coord_dtype) for i in range(dimensions)])
-            fin_debug("srcntgt permuter (sources)")
+                actx.np.zeros(nsources, coord_dtype) for i in range(dimensions)
+                ])
+            debug_with_finish("srcntgt permuter (sources)")
             evt = knl_info.srcntgt_permuter(
                     user_source_ids,
                     *(tuple(srcntgts) + tuple(sources)),
-                    queue=queue, range=slice(nsources),
+                    queue=actx.queue, range=slice(nsources),
                     wait_for=wait_for)
             wait_for = [evt]
 
             targets = make_obj_array([
-                empty(ntargets, coord_dtype) for i in range(dimensions)])
-            fin_debug("srcntgt permuter (targets)")
+                actx.np.zeros(ntargets, coord_dtype) for i in range(dimensions)
+                ])
+            debug_with_finish("srcntgt permuter (targets)")
             evt = knl_info.srcntgt_permuter(
                     srcntgt_target_ids,
                     *(tuple(srcntgts) + tuple(targets)),
-                    queue=queue, range=slice(ntargets),
+                    queue=actx.queue, range=slice(ntargets),
                     wait_for=wait_for)
             wait_for = [evt]
 
             if srcntgt_radii is not None:
-                fin_debug("srcntgt permuter (source radii)")
-                source_radii = cl.array.take(
-                        srcntgt_radii, user_source_ids, queue=queue,
+                import pyopencl.array as cl_array
+                debug_with_finish("srcntgt permuter (source radii)")
+                source_radii = cl_array.take(
+                        srcntgt_radii, user_source_ids, queue=actx.queue,
                         wait_for=wait_for)
 
-                fin_debug("srcntgt permuter (target radii)")
-                target_radii = cl.array.take(
-                        srcntgt_radii, srcntgt_target_ids, queue=queue,
+                debug_with_finish("srcntgt permuter (target radii)")
+                target_radii = cl_array.take(
+                        srcntgt_radii, srcntgt_target_ids, queue=actx.queue,
                         wait_for=wait_for)
 
                 wait_for = source_radii.events + target_radii.events
@@ -1549,7 +1569,7 @@ def make_arange(start, offset=level_used_box_count):
         assert nlevels == len(level_used_box_counts)
         assert level + 1 == nlevels, (level+1, nlevels)
         if debug:
-            max_level = np.max(box_levels.get())
+            max_level = np.max(actx.to_numpy(box_levels))
             assert max_level + 1 == nlevels
 
         # {{{ gather box child ids, box centers
@@ -1561,7 +1581,7 @@ def make_arange(start, offset=level_used_box_count):
 
         box_child_ids_new, evt = zeros((2**dimensions, aligned_nboxes), box_id_dtype)
         wait_for.append(evt)
-        box_centers_new = empty((dimensions, aligned_nboxes), coord_dtype)
+        box_centers_new = actx.np.zeros((dimensions, aligned_nboxes), coord_dtype)
 
         for mnr, child_row in enumerate(box_child_ids):
             box_child_ids_new[mnr, :nboxes_post_prune] = \
@@ -1572,7 +1592,7 @@ def make_arange(start, offset=level_used_box_count):
             box_centers_new[dim, :nboxes_post_prune] = center_row[:nboxes_post_prune]
         wait_for.extend(box_centers_new.events)
 
-        cl.wait_for_events(wait_for)
+        wait_for_events(wait_for)
 
         box_centers = box_centers_new
         box_child_ids = box_child_ids_new
@@ -1585,7 +1605,7 @@ def make_arange(start, offset=level_used_box_count):
         # {{{ compute box flags
 
         from boxtree.tree import box_flags_enum
-        box_flags = empty(nboxes_post_prune, box_flags_enum.dtype)
+        box_flags = actx.np.zeros(nboxes_post_prune, box_flags_enum.dtype)
 
         if not srcntgts_have_extent:
             # If srcntgts_have_extent, then non-child counts have already been
@@ -1624,7 +1644,7 @@ def make_arange(start, offset=level_used_box_count):
                         nboxes_post_prune, particle_id_dtype)
                 wait_for.append(evt)
 
-        fin_debug("compute box info")
+        debug_with_finish("compute box info")
         evt = knl_info.box_info_kernel(
                 *(
                     # input:
@@ -1648,27 +1668,23 @@ def make_arange(start, offset=level_used_box_count):
 
         # {{{ compute box bounding box
 
-        fin_debug("finding box extents")
+        debug_with_finish("finding box extents")
 
-        box_source_bounding_box_min = cl.array.empty(
-                queue, (dimensions, aligned_nboxes),
-                dtype=coord_dtype)
-        box_source_bounding_box_max = cl.array.empty(
-                queue, (dimensions, aligned_nboxes),
-                dtype=coord_dtype)
+        box_source_bounding_box_min = actx.np.zeros(
+            (dimensions, aligned_nboxes), dtype=coord_dtype)
+        box_source_bounding_box_max = actx.np.zeros(
+            (dimensions, aligned_nboxes), dtype=coord_dtype)
 
         if sources_are_targets:
             box_target_bounding_box_min = box_source_bounding_box_min
             box_target_bounding_box_max = box_source_bounding_box_max
         else:
-            box_target_bounding_box_min = cl.array.empty(
-                    queue, (dimensions, aligned_nboxes),
-                    dtype=coord_dtype)
-            box_target_bounding_box_max = cl.array.empty(
-                    queue, (dimensions, aligned_nboxes),
-                    dtype=coord_dtype)
+            box_target_bounding_box_min = actx.np.zeros(
+                    (dimensions, aligned_nboxes), dtype=coord_dtype)
+            box_target_bounding_box_max = actx.np.zeros(
+                    (dimensions, aligned_nboxes), dtype=coord_dtype)
 
-        bogus_radii_array = cl.array.empty(queue, 1, dtype=coord_dtype)
+        bogus_radii_array = actx.np.zeros(1, dtype=coord_dtype)
 
         # nlevels-1 is the highest valid level index
         for level in range(nlevels-1, -1, -1):
@@ -1720,7 +1736,7 @@ def make_arange(start, offset=level_used_box_count):
                         *args,
 
                         range=slice(start, stop),
-                        queue=queue, wait_for=wait_for)
+                        queue=actx.queue, wait_for=wait_for)
 
             wait_for = [evt]
 
@@ -1734,8 +1750,13 @@ def make_arange(start, offset=level_used_box_count):
 
         if sources_have_extent:
             extra_tree_attrs.update(source_radii=source_radii)
+        else:
+            extra_tree_attrs.update(source_radii=None)
+
         if targets_have_extent:
             extra_tree_attrs.update(target_radii=target_radii)
+        else:
+            extra_tree_attrs.update(target_radii=None)
 
         tree_build_proc.done(
                 "%d levels, %d boxes, %d particles, box extent norm: %s, "
@@ -1743,7 +1764,7 @@ def make_arange(start, offset=level_used_box_count):
                 nlevels, len(box_parent_ids), nsrcntgts, srcntgts_extent_norm,
                 max_leaf_refine_weight)
 
-        return Tree(
+        tree = Tree(
                 # If you change this, also change the documentation
                 # of what's in the tree, above.
                 sources_are_targets=sources_are_targets,
@@ -1755,13 +1776,12 @@ def make_arange(start, offset=level_used_box_count):
                 coord_dtype=coord_dtype,
                 box_level_dtype=self.box_level_dtype,
 
+                bounding_box=(bbox_min, bbox_max),
                 root_extent=root_extent,
                 stick_out_factor=stick_out_factor,
                 extent_norm=srcntgts_extent_norm,
 
-                bounding_box=(bbox_min, bbox_max),
-                level_start_box_nrs=level_start_box_nrs,
-                level_start_box_nrs_dev=level_start_box_nrs_dev,
+                level_start_box_nrs=actx.from_numpy(level_start_box_nrs),
 
                 sources=sources,
                 targets=targets,
@@ -1790,7 +1810,9 @@ def make_arange(start, offset=level_used_box_count):
                 _is_pruned=prune_empty_leaves,
 
                 **extra_tree_attrs
-                ).with_queue(None), evt
+                )
+
+        return actx.freeze(tree), evt
 
         # }}}
 

From 5d7027d337b0ef7b4b818efd0879957a9ac1b142 Mon Sep 17 00:00:00 2001
From: Alexandru Fikl <alexfikl@gmail.com>
Date: Wed, 22 Jun 2022 15:20:55 +0300
Subject: [PATCH 11/28] port tree_build_kernels to arraycontext

---
 boxtree/tree_build_kernels.py | 47 +++++++++++++++++++++--------------
 1 file changed, 29 insertions(+), 18 deletions(-)

diff --git a/boxtree/tree_build_kernels.py b/boxtree/tree_build_kernels.py
index 3477fa32..5624c690 100644
--- a/boxtree/tree_build_kernels.py
+++ b/boxtree/tree_build_kernels.py
@@ -21,20 +21,24 @@
 """
 
 import logging
+from dataclasses import dataclass
 from functools import partial
 
 import numpy as np
 from mako.template import Template
 
-from pyopencl.elementwise import ElementwiseTemplate
-from pyopencl.scan import ScanTemplate
-from pytools import Record, log_process, memoize
+from pyopencl.elementwise import ElementwiseKernel, ElementwiseTemplate
+from pyopencl.scan import GenericScanKernel, ScanTemplate
+from pytools import log_process, memoize
 
 from boxtree.tools import (
+    ScalarArg,
+    VectorArg,
     coord_vec_subscript_code,
     get_coord_vec_dtype,
     get_type_moniker,
 )
+from boxtree.traversal import HELPER_FUNCTION_TEMPLATE, TRAVERSAL_PREAMBLE_MAKO_DEFS
 
 
 logger = logging.getLogger(__name__)
@@ -121,8 +125,27 @@
 # -----------------------------------------------------------------------------
 
 
-class _KernelInfo(Record):
-    pass
+@dataclass(frozen=True)
+class _KernelInfo:
+    particle_id_dtype: np.dtype
+    box_id_dtype: np.dtype
+    morton_bin_count_dtype: np.dtype
+
+    morton_count_scan: GenericScanKernel
+    split_box_id_scan: GenericScanKernel
+    box_splitter_kernel: ElementwiseKernel
+    particle_renumberer_kernel: ElementwiseKernel
+    level_restrict: bool
+    level_restrict_kernel_builder: ElementwiseKernel | None
+
+    extract_nonchild_srcntgt_count_kernel: ElementwiseKernel | None
+    find_prune_indices_kernel: GenericScanKernel
+    find_level_box_counts_kernel: GenericScanKernel
+    srcntgt_permuter: ElementwiseKernel
+    source_counter: GenericScanKernel
+    source_and_target_index_finder: ElementwiseKernel | None
+    box_info_kernel: ElementwiseKernel
+    box_extents_finder_kernel: ElementwiseKernel
 
 
 # {{{ data types
@@ -797,9 +820,6 @@ def get_count_for_branch(known_bits):
 
 # {{{ level restrict kernel
 
-from boxtree.traversal import TRAVERSAL_PREAMBLE_MAKO_DEFS
-
-
 LEVEL_RESTRICT_TPL = Template(
     TRAVERSAL_PREAMBLE_MAKO_DEFS + r"""//CL:mako//
     <%def name="my_load_center(name, box_id)">
@@ -933,8 +953,6 @@ def build_level_restrict_kernel(context, preamble_with_dtype_decls,
 
     from pyopencl.elementwise import ElementwiseKernel
 
-    from boxtree.traversal import HELPER_FUNCTION_TEMPLATE
-
     return ElementwiseKernel(
             context,
             arguments=arguments,
@@ -1400,7 +1418,7 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,
     if np.iinfo(box_id_dtype).min == 0:
         from warnings import warn
         warn("Careful with unsigned types for box_id_dtype. Some CL implementations "
-                "(notably Intel 2012) mis-implemnet unsigned operations, leading to "
+                "(notably Intel 2012) mis-implement unsigned operations, leading to "
                 "incorrect results.", stacklevel=4)
 
     from pyopencl.tools import dtype_to_c_struct, dtype_to_ctype
@@ -1471,7 +1489,6 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,
             + str(MORTON_NR_SCAN_PREAMBLE_TPL.render(**codegen_args))
             )
 
-    from boxtree.tools import ScalarArg, VectorArg
     common_arguments = (
             [
                 # box-local morton bin counts for each particle at the current level
@@ -1533,7 +1550,6 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,
             (ScalarArg(coord_dtype, "stick_out_factor"))
         ]
 
-    from pyopencl.scan import GenericScanKernel
     morton_count_scan = GenericScanKernel(
             context, morton_bin_count_dtype,
             arguments=morton_count_scan_arguments,
@@ -1557,7 +1573,6 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,
 
     # {{{ split_box_id scan
 
-    from pyopencl.scan import GenericScanKernel
     split_box_id_scan = SPLIT_BOX_ID_SCAN_TPL.build(
             context,
             type_aliases=(
@@ -1592,7 +1607,6 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,
 
     box_splitter_kernel_source = BOX_SPLITTER_KERNEL_TPL.render(**box_s_codegen_args)
 
-    from pyopencl.elementwise import ElementwiseKernel
     box_splitter_kernel = ElementwiseKernel(
             context,
             common_arguments
@@ -1627,7 +1641,6 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,
     particle_renumberer_kernel_source = \
             PARTICLE_RENUMBERER_KERNEL_TPL.render(**codegen_args)
 
-    from pyopencl.elementwise import ElementwiseKernel
     particle_renumberer_kernel = ElementwiseKernel(
             context,
             [*common_arguments,
@@ -1679,7 +1692,6 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,
 
     # FIXME: Turn me into a scan template
 
-    from boxtree.tools import VectorArg
     find_prune_indices_kernel = GenericScanKernel(
             context, box_id_dtype,
             arguments=[
@@ -1753,7 +1765,6 @@ def get_tree_build_kernel_info(context, dimensions, coord_dtype,
     # really a loss.
 
     # FIXME: make me a scan template
-    from pyopencl.scan import GenericScanKernel
     source_counter = GenericScanKernel(
             context, box_id_dtype,
             arguments=[

From c8bd1a2c7543ea3471c7e41373cc8043d72b4bf0 Mon Sep 17 00:00:00 2001
From: Alexandru Fikl <alexfikl@gmail.com>
Date: Thu, 23 Jun 2022 17:18:30 +0300
Subject: [PATCH 12/28] port pyfmmlib_integration to arraycontext

---
 boxtree/pyfmmlib_integration.py | 38 ++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/boxtree/pyfmmlib_integration.py b/boxtree/pyfmmlib_integration.py
index 07a54b74..869805b6 100644
--- a/boxtree/pyfmmlib_integration.py
+++ b/boxtree/pyfmmlib_integration.py
@@ -35,21 +35,21 @@
 THE SOFTWARE.
 """
 
-
-import logging
-
-
-logger = logging.getLogger(__name__)
 import enum
+import logging
 
 import numpy as np
 
 from pytools import log_process, memoize_method
 
+from boxtree.array_context import PyOpenCLArrayContext
 from boxtree.fmm import ExpansionWranglerInterface, TreeIndependentDataForWrangler
 from boxtree.timing import return_timing_data
 
 
+logger = logging.getLogger(__name__)
+
+
 # {{{ rotation data interface
 
 class FMMLibRotationDataInterface:
@@ -80,8 +80,8 @@ class FMMLibRotationData(FMMLibRotationDataInterface):
     .. automethod:: __init__
     """
 
-    def __init__(self, queue, trav):
-        self.queue = queue
+    def __init__(self, array_context: PyOpenCLArrayContext, trav):
+        self._setup_actx = array_context
         self.trav = trav
         self.tree = trav.tree
 
@@ -89,27 +89,27 @@ def __init__(self, queue, trav):
     @memoize_method
     def rotation_classes_builder(self):
         from boxtree.rotation_classes import RotationClassesBuilder
-        return RotationClassesBuilder(self.queue.context)
+        return RotationClassesBuilder(self._setup_actx)
 
     @memoize_method
     def build_rotation_classes_lists(self):
-        trav = self.trav.to_device(self.queue)
-        tree = self.tree.to_device(self.queue)
-        return self.rotation_classes_builder(self.queue, trav, tree)[0]
+        trav = self._setup_actx.from_numpy(self.trav)
+        tree = self._setup_actx.from_numpy(self.tree)
+        return self.rotation_classes_builder(self._setup_actx, trav, tree)[0]
 
     @memoize_method
     def m2l_rotation_lists(self):
-        return (self
-                .build_rotation_classes_lists()
-                .from_sep_siblings_rotation_classes
-                .get(self.queue))
+        return self._setup_actx.to_numpy(
+            self.build_rotation_classes_lists()
+                .from_sep_siblings_rotation_classes,
+            )
 
     @memoize_method
     def m2l_rotation_angles(self):
-        return (self
-                .build_rotation_classes_lists()
-                .from_sep_siblings_rotation_class_to_angle
-                .get(self.queue))
+        return self._setup_actx.to_numpy(
+            self.build_rotation_classes_lists()
+                .from_sep_siblings_rotation_class_to_angle,
+            )
 
 
 class FMMLibRotationDataNotSuppliedWarning(UserWarning):

From 34cd81c2c81b110540cf425af6ecf0f27bd880fc Mon Sep 17 00:00:00 2001
From: Alexandru Fikl <alexfikl@gmail.com>
Date: Thu, 23 Jun 2022 21:29:17 +0300
Subject: [PATCH 13/28] port cost to arraycontext

---
 boxtree/cost.py | 473 +++++++++++++++++++++++-------------------------
 1 file changed, 228 insertions(+), 245 deletions(-)

diff --git a/boxtree/cost.py b/boxtree/cost.py
index 7d2cc952..3a991a12 100644
--- a/boxtree/cost.py
+++ b/boxtree/cost.py
@@ -58,6 +58,7 @@
 .. autoclass:: FMMCostModel
 """
 
+from abc import ABC, abstractmethod
 from collections.abc import Mapping
 from functools import partial
 from typing import ClassVar
@@ -65,17 +66,15 @@
 import numpy as np
 from mako.template import Template
 
-import pyopencl as cl
-import pyopencl.array
 from pymbolic import evaluate, var
 from pyopencl.elementwise import ElementwiseKernel
 from pyopencl.tools import dtype_to_ctype
-from pytools import memoize_method
+from pytools import keyed_memoize_method
 
+from boxtree.array_context import PyOpenCLArrayContext
 
-Template = partial(Template, strict_undefined=True)
 
-from abc import ABC, abstractmethod
+Template = partial(Template, strict_undefined=True)
 
 
 # {{{ FMMTranslationCostModel
@@ -218,6 +217,7 @@ class AbstractFMMCostModel(ABC):
 
     .. automethod:: get_ndirect_sources_per_target_box
     """
+
     def __init__(
             self,
             translation_cost_model_factory=make_pde_aware_translation_cost_model):
@@ -229,28 +229,27 @@ def __init__(
         self.translation_cost_model_factory = translation_cost_model_factory
 
     @abstractmethod
-    def process_form_multipoles(self, queue, traversal, p2m_cost):
+    def process_form_multipoles(self, actx: PyOpenCLArrayContext,
+                                traversal, p2m_cost):
         """Cost for forming multipole expansions of each box.
 
-        :arg queue: a :class:`pyopencl.CommandQueue` object.
         :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object.
-        :arg p2m_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array`
-            of shape (nlevels,) representing the cost of forming the multipole
-            expansion of one source at each level.
-        :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape
-            (nsource_boxes,), with each entry represents the cost of the box.
+        :arg p2m_cost: an array of shape (nlevels,) representing the cost of
+            forming the multipole expansion of one source at each level.
+        :return: an array of shape (nsource_boxes,), with each entry
+            representing the cost of the box.
         """
         pass
 
     @abstractmethod
-    def process_coarsen_multipoles(self, queue, traversal, m2m_cost):
+    def process_coarsen_multipoles(self, actx: PyOpenCLArrayContext,
+                                   traversal, m2m_cost):
         """Cost for upward propagation.
 
-        :arg queue: a :class:`pyopencl.CommandQueue` object.
         :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object.
-        :arg m2m_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array`
-            of shape (nlevels-1,), where the ith entry represents the
-            multipole-to-multipole cost from source level i+1 to target level i.
+        :arg m2m_cost: an array of shape (nlevels-1,), where the ith entry
+            represents the multipole-to-multipole cost from source level i+1
+            to target level i.
         :return: a :class:`float`, the overall cost of upward propagation.
 
         .. note:: This method returns a number instead of an array, because it is not
@@ -260,118 +259,106 @@ def process_coarsen_multipoles(self, queue, traversal, m2m_cost):
         pass
 
     @abstractmethod
-    def get_ndirect_sources_per_target_box(self, queue, traversal):
+    def get_ndirect_sources_per_target_box(self, actx: PyOpenCLArrayContext,
+                                           traversal):
         """Collect the number of direct evaluation sources (list 1, list 3 close and
         list 4 close) for each target box.
 
-        :arg queue: a :class:`pyopencl.CommandQueue` object.
         :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object.
-        :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape
-            (ntarget_boxes,), with each entry representing the number of direct
-            evaluation sources for that target box.
+        :return: an array of shape (ntarget_boxes,), with each entry representing
+            the number of direct evaluation sources for that target box.
         """
         pass
 
     @abstractmethod
-    def process_direct(self, queue, traversal, ndirect_sources_by_itgt_box, p2p_cost,
+    def process_direct(self, actx: PyOpenCLArrayContext,
+                       traversal, ndirect_sources_by_itgt_box, p2p_cost,
                        box_target_counts_nonchild=None):
         """Direct evaluation cost of each target box of *traversal*.
 
-        :arg queue: a :class:`pyopencl.CommandQueue` object.
         :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object.
-        :arg ndirect_sources_by_itgt_box: a :class:`numpy.ndarray` or
-            :class:`pyopencl.array.Array` of shape (ntarget_boxes,), with each entry
-            representing the number of direct evaluation sources for that target box.
+        :arg ndirect_sources_by_itgt_box: an array of shape (ntarget_boxes,),
+            with each entry representing the number of direct evaluation sources
+            for that target box.
         :arg p2p_cost: a constant representing the cost of one point-to-point
             evaluation.
-        :arg box_target_counts_nonchild: a :class:`numpy.ndarray` or
-            :class:`pyopencl.array.Array` of shape (nboxes,), the number of targets
-            using direct evaluation in this box. For example, this is useful in QBX
-            by specifying the number of non-QBX targets. If None, all targets in
-            boxes are considered.
-        :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape
-            (ntarget_boxes,), with each entry represents the cost of the box.
+        :arg box_target_counts_nonchild: an array of shape (nboxes,), the
+            number of targets using direct evaluation in this box. For example,
+            this is useful in QBX by specifying the number of non-QBX targets.
+            If None, all targets in boxes are considered.
+        :return: an array of shape (ntarget_boxes,), with each entry
+            representing the cost of the box.
         """
         pass
 
     @abstractmethod
-    def process_list2(self, queue, traversal, m2l_cost):
+    def process_list2(self, actx: PyOpenCLArrayContext, traversal, m2l_cost):
         """
-        :arg queue: a :class:`pyopencl.CommandQueue` object.
         :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object.
-        :arg m2l_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array`
-            of shape (nlevels,) representing the translation cost of each level.
-        :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape
-            (ntarget_or_target_parent_boxes,), with each entry representing the cost
-            of multipole-to-local translations to this box.
+        :arg m2l_cost: an array of shape (nlevels,) representing the
+            translation cost of each level.
+        :return: an array of shape (ntarget_or_target_parent_boxes,), with
+            each entry representing the cost of multipole-to-local
+            translations to this box.
         """
         pass
 
     @abstractmethod
-    def process_list3(self, queue, traversal, m2p_cost,
+    def process_list3(self, actx: PyOpenCLArrayContext, traversal, m2p_cost,
                       box_target_counts_nonchild=None):
         """
-        :arg queue: a :class:`pyopencl.CommandQueue` object.
         :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object.
-        :arg m2p_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array`
-            of shape (nlevels,) where the ith entry represents the evaluation cost
-            from multipole expansion at level i to a point.
-        :arg box_target_counts_nonchild: a :class:`numpy.ndarray` or
-            :class:`pyopencl.array.Array` of shape (nboxes,), the number of targets
-            using multiple-to-point translations in this box. For example, this is
-            useful in QBX by specifying the number of non-QBX targets. If None, all
-            targets in boxes are considered.
-        :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape
-            (nboxes,), with each entry representing the cost of evaluating all
-            targets inside this box from multipole expansions of list-3 boxes.
+        :arg m2p_cost: an array of shape (nlevels,) where the ith entry
+            represents the evaluation cost from multipole expansion at level i
+            to a point.
+        :arg box_target_counts_nonchild: an array of shape (nboxes,), the
+            number of targets using multipole-to-point translations in this box.
+            For example, this is useful in QBX by specifying the number of
+            non-QBX targets. If None, all targets in boxes are considered.
+        :return: an array of shape (nboxes,), with each entry representing the
+            cost of evaluating all targets inside this box from multipole
+            expansions of list-3 boxes.
         """
         pass
 
     @abstractmethod
-    def process_list4(self, queue, traversal, p2l_cost):
+    def process_list4(self, actx: PyOpenCLArrayContext, traversal, p2l_cost):
         """
-        :arg queue: a :class:`pyopencl.CommandQueue` object.
         :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object.
-        :arg p2l_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array`
-            of shape (nlevels,) where the ith entry represents the translation cost
-            from a point to the local expansion at level i.
-        :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape
-            (ntarget_or_target_parent_boxes,), with each entry representing the cost
-            of point-to-local translations to this box.
+        :arg p2l_cost: an array of shape (nlevels,) where the ith entry
+            represents the translation cost from a point to the local expansion
+            at level i.
+        :return: an array of shape (ntarget_or_target_parent_boxes,), with
+            each entry representing the cost of point-to-local translations to
+            this box.
         """
         pass
 
     @abstractmethod
-    def process_eval_locals(self, queue, traversal, l2p_cost,
+    def process_eval_locals(self, actx: PyOpenCLArrayContext, traversal, l2p_cost,
                             box_target_counts_nonchild=None):
         """
-        :arg queue: a :class:`pyopencl.CommandQueue` object.
         :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object.
-        :arg l2p_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array`
-            of shape (nlevels,) where the ith entry represents the cost of evaluating
-            the potential of a target in a box of level i using the box's local
-            expansion.
-        :arg box_target_counts_nonchild: a :class:`numpy.ndarray` or
-            :class:`pyopencl.array.Array` of shape (nboxes,), the number of targets
-            which need evaluation. For example, this is useful in QBX by specifying
-            the number of non-QBX targets. If None, use
+        :arg l2p_cost: an array of shape (nlevels,) where the ith entry
+            represents the cost of evaluating the potential of a target in a
+            box of level i using the box's local expansion.
+        :arg box_target_counts_nonchild: an array of shape (nboxes,), the number
+            of targets which need evaluation. For example, this is useful in
+            QBX by specifying the number of non-QBX targets. If None, use
             traversal.tree.box_target_counts_nonchild.
-        :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape
-            (ntarget_boxes,), the cost of evaluating the potentials of all targets
-            inside this box from its local expansion.
+        :return: an array of shape (ntarget_boxes,), the cost of evaluating the
+            potentials of all targets inside this box from its local expansion.
         """
         pass
 
     @abstractmethod
-    def process_refine_locals(self, queue, traversal, l2l_cost):
+    def process_refine_locals(self, actx: PyOpenCLArrayContext, traversal, l2l_cost):
         """Cost of downward propagation.
 
-        :arg queue: a :class:`pyopencl.CommandQueue` object.
         :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object.
-        :arg l2l_cost: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array`
-            of shape ``(nlevels-1,)``, where the :math:`i`th entry represents
-            the cost of translating local expansion from level :math:`i` to
-            level :math:`i+1`.
+        :arg l2l_cost: an array of shape ``(nlevels-1,)``, where the :math:`i`-th
+            entry represents the cost of translating local expansion from level
+            :math:`i` to level :math:`i+1`.
         :return: a :class:`float`, the overall cost of downward propagation.
 
         .. note:: This method returns a number instead of an array, because it is not
@@ -381,36 +368,34 @@ def process_refine_locals(self, queue, traversal, l2l_cost):
         pass
 
     @abstractmethod
-    def aggregate_over_boxes(self, per_box_result):
+    def aggregate_over_boxes(self, actx: PyOpenCLArrayContext, per_box_result):
         """Sum all entries of *per_box_result* into a number.
 
-        :arg per_box_result: an object of :class:`numpy.ndarray` or
-            :class:`pyopencl.array.Array`, the result to be sumed.
+        :arg per_box_result: an array to be summed.
         :return: a :class:`float`, the result of the sum.
         """
         pass
 
     @staticmethod
-    def cost_factors_to_dev(cost_factors, queue):
+    def cost_factors_to_dev(cost_factors, actx: PyOpenCLArrayContext | None):
         cost_factors_dev = {}
 
         for name in cost_factors:
             if not isinstance(cost_factors[name], np.ndarray):
                 cost_factors_dev[name] = cost_factors[name]
                 continue
-            cost_factors_dev[name] = cl.array.to_device(
-                queue, cost_factors[name]
-            ).with_queue(None)
+
+            cost_factors_dev[name] = actx.freeze(actx.from_numpy(cost_factors[name]))
 
         return cost_factors_dev
 
     def fmm_cost_factors_for_kernels_from_model(
-            self, queue, nlevels, xlat_cost, context):
+            self, actx: PyOpenCLArrayContext | None, nlevels, xlat_cost, context):
         """Evaluate translation cost factors from symbolic model. The result of this
         function can be used for process_* methods in this class.
 
-        :arg queue: If not None, the cost factor arrays will be transferred to device
-            using this queue.
+        :arg actx: If not None, the cost factor arrays will be converted to
+            the array context's array type.
         :arg nlevels: the number of tree levels.
         :arg xlat_cost: a :class:`FMMTranslationCostModel`.
         :arg context: a :class:`dict` of parameters passed as context when
@@ -449,29 +434,26 @@ def fmm_cost_factors_for_kernels_from_model(
             ], dtype=np.float64)
         }
 
-        if queue:
-            cost_factors = self.cost_factors_to_dev(cost_factors, queue)
+        if actx:
+            cost_factors = self.cost_factors_to_dev(cost_factors, actx)
 
         return cost_factors
 
     @abstractmethod
-    def zero_cost_per_box(self, queue, nboxes):
+    def zero_cost_per_box(self, actx: PyOpenCLArrayContext, nboxes):
         """Helper function for returning the per-box cost filled with 0.
 
-        :arg queue: a :class:`pyopencl.CommandQueue` object.
         :param nboxes: the number of boxes
-        :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape
-            (*nboxes*,), representing the zero per-box cost.
+        :return: an array of shape (*nboxes*,), representing the zero per-box cost.
         """
         pass
 
-    def cost_per_box(self, queue, traversal, level_to_order,
+    def cost_per_box(self, actx: PyOpenCLArrayContext, traversal, level_to_order,
                      calibration_params,
                      ndirect_sources_per_target_box=None,
                      box_target_counts_nonchild=None):
         """Predict the per-box costs of a new traversal object.
 
-        :arg queue: a :class:`pyopencl.CommandQueue` object.
         :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object.
         :arg level_to_order: a :class:`numpy.ndarray` of shape
             (traversal.tree.nlevels,) representing the expansion orders
@@ -479,24 +461,21 @@ def cost_per_box(self, queue, traversal, level_to_order,
         :arg calibration_params: a :class:`dict` of calibration parameters. These
             parameters can be obtained via :meth:`estimate_calibration_params`
             or :meth:`get_unit_calibration_params`.
-        :arg ndirect_sources_per_target_box: a :class:`numpy.ndarray` or
-            :class:`pyopencl.array.Array` of shape (ntarget_boxes,), the number of
-            direct evaluation sources (list 1, list 3 close, list 4 close) for each
-            target box. You may find :meth:`get_ndirect_sources_per_target_box`
-            helpful. This argument is useful because the same result can be reused
-            for p2p, p2qbxl and tsqbx.
-        :arg box_target_counts_nonchild: a :class:`numpy.ndarray` or
-            :class:`pyopencl.array.Array` of shape (nboxes,), the number of targets
-            which need evaluation. For example, this is useful in QBX by specifying
-            the number of non-QBX targets. If None, all targets are considered,
-            namely traversal.tree.box_target_counts_nonchild.
-        :return: a :class:`numpy.ndarray` or :class:`pyopencl.array.Array` of shape
-            (nboxes,), where the ith entry represents the cost of all stages for box
-            i.
+        :arg ndirect_sources_per_target_box: an array of shape (ntarget_boxes,),
+            the number of direct evaluation sources (list 1, list 3 close, list
+            4 close) for each target box. You may find
+            :meth:`get_ndirect_sources_per_target_box` helpful. This argument is
+            useful because the same result can be reused for p2p, p2qbxl and tsqbx.
+        :arg box_target_counts_nonchild: an array of shape (nboxes,), the number
+            of targets which need evaluation. For example, this is useful in
+            QBX by specifying the number of non-QBX targets. If None, all
+            targets are considered, namely traversal.tree.box_target_counts_nonchild.
+        :return: an array of shape (nboxes,), where the ith entry represents
+            the cost of all stages for box i.
         """
         if ndirect_sources_per_target_box is None:
             ndirect_sources_per_target_box = (
-                self.get_ndirect_sources_per_target_box(queue, traversal)
+                self.get_ndirect_sources_per_target_box(actx, traversal)
             )
 
         tree = traversal.tree
@@ -505,7 +484,7 @@ def cost_per_box(self, queue, traversal, level_to_order,
         target_boxes = traversal.target_boxes
         target_or_target_parent_boxes = traversal.target_or_target_parent_boxes
 
-        result = self.zero_cost_per_box(queue, nboxes)
+        result = self.zero_cost_per_box(actx, nboxes)
 
         for ilevel in range(tree.nlevels):
             calibration_params[f"p_fmm_lev{ilevel}"] = level_to_order[ilevel]
@@ -515,49 +494,48 @@ def cost_per_box(self, queue, traversal, level_to_order,
         )
 
         translation_cost = self.fmm_cost_factors_for_kernels_from_model(
-            queue, tree.nlevels, xlat_cost, calibration_params
+            actx, tree.nlevels, xlat_cost, calibration_params
         )
 
         if box_target_counts_nonchild is None:
             box_target_counts_nonchild = traversal.tree.box_target_counts_nonchild
 
         result[source_boxes] += self.process_form_multipoles(
-            queue, traversal, translation_cost["p2m_cost"]
+            actx, traversal, translation_cost["p2m_cost"]
         )
 
         result[target_boxes] += self.process_direct(
-            queue, traversal, ndirect_sources_per_target_box,
+            actx, traversal, ndirect_sources_per_target_box,
             translation_cost["c_p2p"],
             box_target_counts_nonchild=box_target_counts_nonchild
         )
 
         result[target_or_target_parent_boxes] += self.process_list2(
-            queue, traversal, translation_cost["m2l_cost"]
+            actx, traversal, translation_cost["m2l_cost"]
         )
 
         result += self.process_list3(
-            queue, traversal, translation_cost["m2p_cost"],
+            actx, traversal, translation_cost["m2p_cost"],
             box_target_counts_nonchild=box_target_counts_nonchild
         )
 
         result[target_or_target_parent_boxes] += self.process_list4(
-            queue, traversal, translation_cost["p2l_cost"]
+            actx, traversal, translation_cost["p2l_cost"]
         )
 
         result[target_boxes] += self.process_eval_locals(
-            queue, traversal, translation_cost["l2p_cost"],
+            actx, traversal, translation_cost["l2p_cost"],
             box_target_counts_nonchild=box_target_counts_nonchild
         )
 
         return result
 
-    def cost_per_stage(self, queue, traversal, level_to_order,
+    def cost_per_stage(self, actx: PyOpenCLArrayContext, traversal, level_to_order,
                        calibration_params,
                        ndirect_sources_per_target_box=None,
                        box_target_counts_nonchild=None):
         """Predict the per-stage costs of a new traversal object.
 
-        :arg queue: a :class:`pyopencl.CommandQueue` object.
         :arg traversal: a :class:`boxtree.traversal.FMMTraversalInfo` object.
         :arg level_to_order: a :class:`numpy.ndarray` of shape
             (traversal.tree.nlevels,) representing the expansion orders
@@ -565,22 +543,21 @@ def cost_per_stage(self, queue, traversal, level_to_order,
         :arg calibration_params: a :class:`dict` of calibration parameters. These
             parameters can be obtained via :meth:`estimate_calibration_params`
             or :meth:`get_unit_calibration_params`.
-        :arg ndirect_sources_per_target_box: a :class:`numpy.ndarray` or
-            :class:`pyopencl.array.Array` of shape (ntarget_boxes,), the number of
-            direct evaluation sources (list 1, list 3 close, list 4 close) for each
-            target box. You may find :func:`get_ndirect_sources_per_target_box`
-            helpful. This argument is useful because the same result can be reused
-            for p2p, p2qbxl and tsqbx.
-        :arg box_target_counts_nonchild: a :class:`numpy.ndarray` or
-            :class:`pyopencl.array.Array` of shape (nboxes,), the number of targets
-            which need evaluation. For example, this is useful in QBX by specifying
-            the number of non-QBX targets. If None, all targets are considered,
-            namely traversal.tree.box_target_counts_nonchild.
+        :arg ndirect_sources_per_target_box: an array of shape (ntarget_boxes,),
+            the number of direct evaluation sources (list 1, list 3 close, list
+            4 close) for each target box. You may find
+            :meth:`get_ndirect_sources_per_target_box` helpful. This argument
+            is useful because the same result can be reused for p2p, p2qbxl and
+            tsqbx.
+        :arg box_target_counts_nonchild: an array of shape (nboxes,), the
+            number of targets which need evaluation. For example, this is useful
+            in QBX by specifying the number of non-QBX targets. If None, all
+            targets are considered, namely traversal.tree.box_target_counts_nonchild.
         :return: a :class:`dict`, mapping FMM stage names to cost numbers.
         """
         if ndirect_sources_per_target_box is None:
             ndirect_sources_per_target_box = (
-                self.get_ndirect_sources_per_target_box(queue, traversal)
+                self.get_ndirect_sources_per_target_box(actx, traversal)
             )
 
         tree = traversal.tree
@@ -594,52 +571,58 @@ def cost_per_stage(self, queue, traversal, level_to_order,
         )
 
         translation_cost = self.fmm_cost_factors_for_kernels_from_model(
-            queue, tree.nlevels, xlat_cost, calibration_params
+            actx, tree.nlevels, xlat_cost, calibration_params
         )
 
         if box_target_counts_nonchild is None:
             box_target_counts_nonchild = traversal.tree.box_target_counts_nonchild
 
         result["form_multipoles"] = self.aggregate_over_boxes(
+            actx,
             self.process_form_multipoles(
-                queue, traversal, translation_cost["p2m_cost"]
+                actx, traversal, translation_cost["p2m_cost"]
             )
         )
 
         result["coarsen_multipoles"] = self.process_coarsen_multipoles(
-            queue, traversal, translation_cost["m2m_cost"]
+            actx, traversal, translation_cost["m2m_cost"]
         )
 
         result["eval_direct"] = self.aggregate_over_boxes(
+            actx,
             self.process_direct(
-                queue, traversal, ndirect_sources_per_target_box,
+                actx, traversal, ndirect_sources_per_target_box,
                 translation_cost["c_p2p"],
                 box_target_counts_nonchild=box_target_counts_nonchild
             )
         )
 
         result["multipole_to_local"] = self.aggregate_over_boxes(
-            self.process_list2(queue, traversal, translation_cost["m2l_cost"])
+            actx,
+            self.process_list2(actx, traversal, translation_cost["m2l_cost"])
         )
 
         result["eval_multipoles"] = self.aggregate_over_boxes(
+            actx,
             self.process_list3(
-                queue, traversal, translation_cost["m2p_cost"],
+                actx, traversal, translation_cost["m2p_cost"],
                 box_target_counts_nonchild=box_target_counts_nonchild
             )
         )
 
         result["form_locals"] = self.aggregate_over_boxes(
-            self.process_list4(queue, traversal, translation_cost["p2l_cost"])
+            actx,
+            self.process_list4(actx, traversal, translation_cost["p2l_cost"])
         )
 
         result["refine_locals"] = self.process_refine_locals(
-            queue, traversal, translation_cost["l2l_cost"]
+            actx, traversal, translation_cost["l2l_cost"]
         )
 
         result["eval_locals"] = self.aggregate_over_boxes(
+            actx,
             self.process_eval_locals(
-                queue, traversal, translation_cost["l2p_cost"],
+                actx, traversal, translation_cost["l2p_cost"],
                 box_target_counts_nonchild=box_target_counts_nonchild
             )
         )
@@ -744,11 +727,12 @@ class FMMCostModel(AbstractFMMCostModel):
 
     # {{{ form multipoles
 
-    @memoize_method
-    def process_form_multipoles_knl(self, context, box_id_dtype, particle_id_dtype,
+    @keyed_memoize_method(key=lambda *args: (type(args[0]), args[1:]))
+    def process_form_multipoles_knl(self, actx: PyOpenCLArrayContext,
+                                    box_id_dtype, particle_id_dtype,
                                     box_level_dtype):
         return ElementwiseKernel(
-            context,
+            actx.context,
             Template(r"""
                 double *np2m,
                 ${box_id_t} *source_boxes,
@@ -773,13 +757,12 @@ def process_form_multipoles_knl(self, context, box_id_dtype, particle_id_dtype,
             name="process_form_multipoles"
         )
 
-    def process_form_multipoles(self, queue, traversal, p2m_cost):
+    def process_form_multipoles(self, actx, traversal, p2m_cost):
         tree = traversal.tree
-        np2m = cl.array.zeros(queue, len(traversal.source_boxes), dtype=np.float64)
+        np2m = actx.zeros(len(traversal.source_boxes), dtype=np.float64)
 
         process_form_multipoles_knl = self.process_form_multipoles_knl(
-            queue.context,
-            tree.box_id_dtype, tree.particle_id_dtype, tree.box_level_dtype
+            actx, tree.box_id_dtype, tree.particle_id_dtype, tree.box_level_dtype
         )
 
         process_form_multipoles_knl(
@@ -787,7 +770,8 @@ def process_form_multipoles(self, queue, traversal, p2m_cost):
             traversal.source_boxes,
             tree.box_source_counts_nonchild,
             tree.box_levels,
-            p2m_cost
+            p2m_cost,
+            queue=actx.queue,
         )
 
         return np2m
@@ -796,11 +780,12 @@ def process_form_multipoles(self, queue, traversal, p2m_cost):
 
     # {{{ propagate multipoles upward
 
-    @memoize_method
-    def process_coarsen_multipoles_knl(self, context, ndimensions, box_id_dtype,
+    @keyed_memoize_method(key=lambda *args: (type(args[0]), args[1:]))
+    def process_coarsen_multipoles_knl(self, actx: PyOpenCLArrayContext,
+                                       ndimensions, box_id_dtype,
                                        box_level_dtype, nlevels):
         return ElementwiseKernel(
-            context,
+            actx.context,
             Template(r"""
                 ${box_id_t} *source_parent_boxes,
                 ${box_level_t} *box_levels,
@@ -840,14 +825,13 @@ def process_coarsen_multipoles_knl(self, context, ndimensions, box_id_dtype,
             name="process_coarsen_multipoles"
         )
 
-    def process_coarsen_multipoles(self, queue, traversal, m2m_cost):
+    def process_coarsen_multipoles(self, actx: PyOpenCLArrayContext,
+                                   traversal, m2m_cost):
         tree = traversal.tree
-        nm2m = cl.array.zeros(
-            queue, len(traversal.source_parent_boxes), dtype=np.float64
-        )
+        nm2m = actx.zeros(len(traversal.source_parent_boxes), dtype=np.float64)
 
         process_coarsen_multipoles_knl = self.process_coarsen_multipoles_knl(
-            queue.context,
+            actx,
             tree.dimensions, tree.box_id_dtype, tree.box_level_dtype, tree.nlevels
         )
 
@@ -857,19 +841,20 @@ def process_coarsen_multipoles(self, queue, traversal, m2m_cost):
             m2m_cost,
             nm2m,
             *tree.box_child_ids,
-            queue=queue
+            queue=actx.queue
         )
 
-        return self.aggregate_over_boxes(nm2m)
+        return self.aggregate_over_boxes(actx, nm2m)
 
     # }}}
 
     # {{{ direct evaluation to point targets (lists 1, 3 close, 4 close)
 
-    @memoize_method
-    def _get_ndirect_sources_knl(self, context, particle_id_dtype, box_id_dtype):
+    @keyed_memoize_method(key=lambda *args: (type(args[0]), args[1:]))
+    def _get_ndirect_sources_knl(self, actx: PyOpenCLArrayContext,
+                                 particle_id_dtype, box_id_dtype):
         return ElementwiseKernel(
-            context,
+            actx.context,
             Template("""
                 ${particle_id_t} *ndirect_sources_by_itgt_box,
                 ${box_id_t} *source_boxes_starts,
@@ -902,18 +887,19 @@ def _get_ndirect_sources_knl(self, context, particle_id_dtype, box_id_dtype):
             name="get_ndirect_sources"
         )
 
-    def get_ndirect_sources_per_target_box(self, queue, traversal):
+    def get_ndirect_sources_per_target_box(self, actx: PyOpenCLArrayContext,
+                                           traversal):
         tree = traversal.tree
         ntarget_boxes = len(traversal.target_boxes)
         particle_id_dtype = tree.particle_id_dtype
         box_id_dtype = tree.box_id_dtype
 
         get_ndirect_sources_knl = self._get_ndirect_sources_knl(
-            queue.context, particle_id_dtype, box_id_dtype
+            actx, particle_id_dtype, box_id_dtype
         )
 
-        ndirect_sources_by_itgt_box = cl.array.zeros(
-            queue, ntarget_boxes, dtype=particle_id_dtype
+        ndirect_sources_by_itgt_box = actx.zeros(
+            ntarget_boxes, dtype=particle_id_dtype
         )
 
         # List 1
@@ -926,7 +912,7 @@ def get_ndirect_sources_per_target_box(self, queue, traversal):
 
         # List 3 close
         if traversal.from_sep_close_smaller_starts is not None:
-            queue.finish()
+            actx.queue.finish()
             get_ndirect_sources_knl(
                 ndirect_sources_by_itgt_box,
                 traversal.from_sep_close_smaller_starts,
@@ -936,7 +922,7 @@ def get_ndirect_sources_per_target_box(self, queue, traversal):
 
         # List 4 close
         if traversal.from_sep_close_bigger_starts is not None:
-            queue.finish()
+            actx.queue.finish()
             get_ndirect_sources_knl(
                 ndirect_sources_by_itgt_box,
                 traversal.from_sep_close_bigger_starts,
@@ -946,28 +932,26 @@ def get_ndirect_sources_per_target_box(self, queue, traversal):
 
         return ndirect_sources_by_itgt_box
 
-    def process_direct(self, queue, traversal, ndirect_sources_by_itgt_box, p2p_cost,
+    def process_direct(self, actx: PyOpenCLArrayContext,
+                       traversal, ndirect_sources_by_itgt_box, p2p_cost,
                        box_target_counts_nonchild=None):
         if box_target_counts_nonchild is None:
             box_target_counts_nonchild = traversal.tree.box_target_counts_nonchild
 
-        from pyopencl.array import take
-        ntargets_by_itgt_box = take(
-            box_target_counts_nonchild,
-            traversal.target_boxes,
-            queue=queue
-        )
-
+        ntargets_by_itgt_box = (
+            actx.thaw(box_target_counts_nonchild)[traversal.target_boxes]
+            )
         return ndirect_sources_by_itgt_box * ntargets_by_itgt_box * p2p_cost
 
     # }}}
 
     # {{{ translate separated siblings' ("list 2") mpoles to local
 
-    @memoize_method
-    def process_list2_knl(self, context, box_id_dtype, box_level_dtype):
+    @keyed_memoize_method(key=lambda *args: (type(args[0]), args[1:]))
+    def process_list2_knl(self, actx: PyOpenCLArrayContext,
+                          box_id_dtype, box_level_dtype):
         return ElementwiseKernel(
-            context,
+            actx.context,
             Template(r"""
                 double *nm2l,
                 ${box_id_t} *target_or_target_parent_boxes,
@@ -991,25 +975,24 @@ def process_list2_knl(self, context, box_id_dtype, box_level_dtype):
             name="process_list2"
         )
 
-    def process_list2(self, queue, traversal, m2l_cost):
+    def process_list2(self, actx, traversal, m2l_cost):
         tree = traversal.tree
         box_id_dtype = tree.box_id_dtype
         box_level_dtype = tree.box_level_dtype
 
         ntarget_or_target_parent_boxes = len(traversal.target_or_target_parent_boxes)
-        nm2l = cl.array.zeros(
-            queue, (ntarget_or_target_parent_boxes,), dtype=np.float64
-        )
+        nm2l = actx.zeros((ntarget_or_target_parent_boxes,), dtype=np.float64)
 
         process_list2_knl = self.process_list2_knl(
-            queue.context, box_id_dtype, box_level_dtype
+            actx, box_id_dtype, box_level_dtype
         )
         process_list2_knl(
             nm2l,
             traversal.target_or_target_parent_boxes,
             traversal.from_sep_siblings_starts,
             tree.box_levels,
-            m2l_cost
+            m2l_cost,
+            queue=actx.queue,
         )
 
         return nm2l
@@ -1018,10 +1001,11 @@ def process_list2(self, queue, traversal, m2l_cost):
 
     # {{{ evaluate sep. smaller mpoles ("list 3") at particles
 
-    @memoize_method
-    def process_list3_knl(self, context, box_id_dtype, particle_id_dtype):
+    @keyed_memoize_method(key=lambda *args: (type(args[0]), args[1:]))
+    def process_list3_knl(self, actx: PyOpenCLArrayContext,
+                          box_id_dtype, particle_id_dtype):
         return ElementwiseKernel(
-            context,
+            actx.context,
             Template(r"""
                 ${box_id_t} *target_boxes_sep_smaller,
                 ${box_id_t} *sep_smaller_start,
@@ -1047,16 +1031,16 @@ def process_list3_knl(self, context, box_id_dtype, particle_id_dtype):
             name="process_list3"
         )
 
-    def process_list3(self, queue, traversal, m2p_cost,
+    def process_list3(self, actx: PyOpenCLArrayContext, traversal, m2p_cost,
                       box_target_counts_nonchild=None):
         tree = traversal.tree
-        nm2p = cl.array.zeros(queue, tree.nboxes, dtype=np.float64)
+        nm2p = actx.zeros(tree.nboxes, dtype=np.float64)
 
         if box_target_counts_nonchild is None:
             box_target_counts_nonchild = tree.box_target_counts_nonchild
 
         process_list3_knl = self.process_list3_knl(
-            queue.context, tree.box_id_dtype, tree.particle_id_dtype
+            actx, tree.box_id_dtype, tree.particle_id_dtype
         )
 
         for ilevel, sep_smaller_list in enumerate(
@@ -1065,9 +1049,9 @@ def process_list3(self, queue, traversal, m2p_cost,
                 traversal.target_boxes_sep_smaller_by_source_level[ilevel],
                 sep_smaller_list.starts,
                 box_target_counts_nonchild,
-                m2p_cost[ilevel].get(queue=queue).reshape(-1)[0],
+                actx.to_numpy(m2p_cost[ilevel]).reshape(-1)[0],
                 nm2p,
-                queue=queue
+                queue=actx.queue
             )
 
         return nm2p
@@ -1076,11 +1060,11 @@ def process_list3(self, queue, traversal, m2p_cost,
 
     # {{{ form locals for separated bigger source boxes ("list 4")
 
-    @memoize_method
-    def process_list4_knl(self, context,
+    @keyed_memoize_method(key=lambda *args: (type(args[0]), args[1:]))
+    def process_list4_knl(self, actx: PyOpenCLArrayContext,
                           box_id_dtype, particle_id_dtype, box_level_dtype):
         return ElementwiseKernel(
-            context,
+            actx.context,
             Template(r"""
                 double *nm2p,
                 ${box_id_t} *from_sep_bigger_starts,
@@ -1110,15 +1094,13 @@ def process_list4_knl(self, context,
             name="process_list4"
         )
 
-    def process_list4(self, queue, traversal, p2l_cost):
+    def process_list4(self, actx, traversal, p2l_cost):
         tree = traversal.tree
         target_or_target_parent_boxes = traversal.target_or_target_parent_boxes
-        nm2p = cl.array.zeros(
-            queue, len(target_or_target_parent_boxes), dtype=np.float64
-        )
+        nm2p = actx.zeros(len(target_or_target_parent_boxes), dtype=np.float64)
 
         process_list4_knl = self.process_list4_knl(
-            queue.context,
+            actx,
             tree.box_id_dtype, tree.particle_id_dtype, tree.box_level_dtype
         )
 
@@ -1128,7 +1110,8 @@ def process_list4(self, queue, traversal, p2l_cost):
             traversal.from_sep_bigger_lists,
             tree.box_source_counts_nonchild,
             tree.box_levels,
-            p2l_cost
+            p2l_cost,
+            queue=actx.queue
         )
 
         return nm2p
@@ -1137,11 +1120,11 @@ def process_list4(self, queue, traversal, p2l_cost):
 
     # {{{ evaluate local expansions at targets
 
-    @memoize_method
-    def process_eval_locals_knl(self, context, box_id_dtype, particle_id_dtype,
-                                box_level_dtype):
+    @keyed_memoize_method(key=lambda *args: (type(args[0]), args[1:]))
+    def process_eval_locals_knl(self, actx: PyOpenCLArrayContext,
+                                box_id_dtype, particle_id_dtype, box_level_dtype):
         return ElementwiseKernel(
-            context,
+            actx.context,
             Template(r"""
                 double *neval_locals,
                 ${box_id_t} *target_boxes,
@@ -1166,18 +1149,17 @@ def process_eval_locals_knl(self, context, box_id_dtype, particle_id_dtype,
             name="process_eval_locals"
         )
 
-    def process_eval_locals(self, queue, traversal, l2p_cost,
+    def process_eval_locals(self, actx: PyOpenCLArrayContext, traversal, l2p_cost,
                             box_target_counts_nonchild=None):
         tree = traversal.tree
         ntarget_boxes = len(traversal.target_boxes)
-        neval_locals = cl.array.zeros(queue, ntarget_boxes, dtype=np.float64)
+        neval_locals = actx.zeros(ntarget_boxes, dtype=np.float64)
 
         if box_target_counts_nonchild is None:
             box_target_counts_nonchild = traversal.tree.box_target_counts_nonchild
 
         process_eval_locals_knl = self.process_eval_locals_knl(
-            queue.context,
-            tree.box_id_dtype, tree.particle_id_dtype, tree.box_level_dtype
+            actx, tree.box_id_dtype, tree.particle_id_dtype, tree.box_level_dtype
         )
 
         process_eval_locals_knl(
@@ -1194,11 +1176,11 @@ def process_eval_locals(self, queue, traversal, l2p_cost,
 
     # {{{ propagate locals downward
 
-    @memoize_method
-    def process_refine_locals_knl(self, context, box_id_dtype):
+    @keyed_memoize_method(key=lambda *args: (type(args[0]), args[1:]))
+    def process_refine_locals_knl(self, actx: PyOpenCLArrayContext, box_id_dtype):
         from pyopencl.reduction import ReductionKernel
         return ReductionKernel(
-            context,
+            actx.context,
             np.float64,
             neutral="0.0",
             reduce_expr="a+b",
@@ -1216,43 +1198,40 @@ def process_refine_locals_knl(self, context, box_id_dtype):
             name="process_refine_locals"
         )
 
-    def process_refine_locals(self, queue, traversal, l2l_cost):
+    def process_refine_locals(self, actx: PyOpenCLArrayContext,
+                              traversal, l2l_cost):
         tree = traversal.tree
         process_refine_locals_knl = self.process_refine_locals_knl(
-            queue.context, tree.box_id_dtype
+            actx, tree.box_id_dtype
         )
 
-        level_start_target_or_target_parent_box_nrs = cl.array.to_device(
-            queue, traversal.level_start_target_or_target_parent_box_nrs
+        level_start_target_or_target_parent_box_nrs = actx.thaw(
+            traversal.level_start_target_or_target_parent_box_nrs
         )
 
         cost = process_refine_locals_knl(
             level_start_target_or_target_parent_box_nrs,
             l2l_cost,
             range=slice(1, tree.nlevels)
-        ).get()
+        )
 
-        return cost.reshape(-1)[0]
+        return actx.to_numpy(cost).reshape(-1)[0]
 
     # }}}
 
-    def zero_cost_per_box(self, queue, nboxes):
-        return cl.array.zeros(queue, (nboxes,), dtype=np.float64)
+    def zero_cost_per_box(self, actx: PyOpenCLArrayContext, nboxes):
+        return actx.zeros((nboxes,), dtype=np.float64)
 
-    def aggregate_over_boxes(self, per_box_result):
+    def aggregate_over_boxes(self, actx: PyOpenCLArrayContext, per_box_result):
         if isinstance(per_box_result, float):
             return per_box_result
         else:
-            return cl.array.sum(per_box_result).get().reshape(-1)[0]
+            return actx.to_numpy(actx.np.sum(per_box_result)).item()
 
     def fmm_cost_factors_for_kernels_from_model(
-            self, queue, nlevels, xlat_cost, context):
-        if not isinstance(queue, cl.CommandQueue):
-            raise TypeError(
-                "An OpenCL command queue must be supplied for cost model")
-
+            self, actx: PyOpenCLArrayContext, nlevels, xlat_cost, context):
         return AbstractFMMCostModel.fmm_cost_factors_for_kernels_from_model(
-            self, queue, nlevels, xlat_cost, context
+            self, actx, nlevels, xlat_cost, context
         )
 
 # }}}
@@ -1261,7 +1240,8 @@ def fmm_cost_factors_for_kernels_from_model(
 # {{{ _PythonFMMCostModel (undocumented, only used for testing)
 
 class _PythonFMMCostModel(AbstractFMMCostModel):
-    def process_form_multipoles(self, queue, traversal, p2m_cost):
+    def process_form_multipoles(self, actx: PyOpenCLArrayContext,
+                                traversal, p2m_cost):
         tree = traversal.tree
         np2m = np.zeros(len(traversal.source_boxes), dtype=np.float64)
 
@@ -1274,7 +1254,8 @@ def process_form_multipoles(self, queue, traversal, p2m_cost):
 
         return np2m
 
-    def get_ndirect_sources_per_target_box(self, queue, traversal):
+    def get_ndirect_sources_per_target_box(self, actx: PyOpenCLArrayContext,
+                                           traversal):
         tree = traversal.tree
         ntarget_boxes = len(traversal.target_boxes)
 
@@ -1308,7 +1289,8 @@ def get_ndirect_sources_per_target_box(self, queue, traversal):
 
         return ndirect_sources_by_itgt_box
 
-    def process_direct(self, queue, traversal, ndirect_sources_by_itgt_box, p2p_cost,
+    def process_direct(self, actx: PyOpenCLArrayContext,
+                       traversal, ndirect_sources_by_itgt_box, p2p_cost,
                        box_target_counts_nonchild=None):
         if box_target_counts_nonchild is None:
             box_target_counts_nonchild = traversal.tree.box_target_counts_nonchild
@@ -1317,7 +1299,7 @@ def process_direct(self, queue, traversal, ndirect_sources_by_itgt_box, p2p_cost
 
         return ntargets_by_itgt_box * ndirect_sources_by_itgt_box * p2p_cost
 
-    def process_list2(self, queue, traversal, m2l_cost):
+    def process_list2(self, actx: PyOpenCLArrayContext, traversal, m2l_cost):
         tree = traversal.tree
         ntarget_or_target_parent_boxes = len(traversal.target_or_target_parent_boxes)
         nm2l = np.zeros(ntarget_or_target_parent_boxes, dtype=np.float64)
@@ -1330,7 +1312,7 @@ def process_list2(self, queue, traversal, m2l_cost):
 
         return nm2l
 
-    def process_list3(self, queue, traversal, m2p_cost,
+    def process_list3(self, actx: PyOpenCLArrayContext, traversal, m2p_cost,
                       box_target_counts_nonchild=None):
         tree = traversal.tree
         nm2p = np.zeros(tree.nboxes, dtype=np.float64)
@@ -1348,7 +1330,7 @@ def process_list3(self, queue, traversal, m2p_cost,
 
         return nm2p
 
-    def process_list4(self, queue, traversal, p2l_cost):
+    def process_list4(self, actx: PyOpenCLArrayContext, traversal, p2l_cost):
         tree = traversal.tree
         target_or_target_parent_boxes = traversal.target_or_target_parent_boxes
         nm2p = np.zeros(len(target_or_target_parent_boxes), dtype=np.float64)
@@ -1362,7 +1344,7 @@ def process_list4(self, queue, traversal, p2l_cost):
 
         return nm2p
 
-    def process_eval_locals(self, queue, traversal, l2p_cost,
+    def process_eval_locals(self, actx: PyOpenCLArrayContext, traversal, l2p_cost,
                             box_target_counts_nonchild=None):
         tree = traversal.tree
         ntarget_boxes = len(traversal.target_boxes)
@@ -1380,7 +1362,8 @@ def process_eval_locals(self, queue, traversal, l2p_cost,
 
         return neval_locals
 
-    def process_coarsen_multipoles(self, queue, traversal, m2m_cost):
+    def process_coarsen_multipoles(self, actx: PyOpenCLArrayContext,
+                                   traversal, m2m_cost):
         tree = traversal.tree
         result = 0.0
 
@@ -1406,7 +1389,7 @@ def process_coarsen_multipoles(self, queue, traversal, m2m_cost):
 
         return result
 
-    def process_refine_locals(self, queue, traversal, l2l_cost):
+    def process_refine_locals(self, actx: PyOpenCLArrayContext, traversal, l2l_cost):
         tree = traversal.tree
         result = 0.0
 
@@ -1418,17 +1401,17 @@ def process_refine_locals(self, queue, traversal, l2l_cost):
 
         return result
 
-    def zero_cost_per_box(self, queue, nboxes):
+    def zero_cost_per_box(self, actx: PyOpenCLArrayContext, nboxes):
         return np.zeros(nboxes, dtype=np.float64)
 
-    def aggregate_over_boxes(self, per_box_result):
+    def aggregate_over_boxes(self, actx, per_box_result):
         if isinstance(per_box_result, float):
             return per_box_result
         else:
             return np.sum(per_box_result)
 
     def fmm_cost_factors_for_kernels_from_model(
-            self, queue, nlevels, xlat_cost, context):
+            self, actx: PyOpenCLArrayContext, nlevels, xlat_cost, context):
         return AbstractFMMCostModel.fmm_cost_factors_for_kernels_from_model(
             self, None, nlevels, xlat_cost, context
         )

From 003d0e0e16c3dbf4af084dff178f0f2327a13c91 Mon Sep 17 00:00:00 2001
From: Alexandru Fikl <alexfikl@gmail.com>
Date: Sun, 26 Jun 2022 18:39:47 +0300
Subject: [PATCH 14/28] port distributed to arraycontext

---
 boxtree/distributed/__init__.py        |  48 +++--
 boxtree/distributed/calculation.py     | 247 +++++++++++++------------
 boxtree/distributed/local_traversal.py |  22 +--
 boxtree/distributed/local_tree.py      | 208 +++++++++++----------
 boxtree/distributed/partition.py       | 128 ++++++-------
 boxtree/fmm.py                         |   2 +-
 6 files changed, 336 insertions(+), 319 deletions(-)

diff --git a/boxtree/distributed/__init__.py b/boxtree/distributed/__init__.py
index 495a47a6..73881352 100644
--- a/boxtree/distributed/__init__.py
+++ b/boxtree/distributed/__init__.py
@@ -88,7 +88,7 @@
 Distributed Wrangler
 --------------------
 
-.. autoclass:: boxtree.distributed.calculation.DistributedExpansionWrangler
+.. autoclass:: boxtree.distributed.calculation.DistributedExpansionWranglerMixin
 
 .. _distributed-fmm-evaluation:
 
@@ -97,8 +97,8 @@
 
 The distributed version of the FMM evaluation shares the same interface as the
 shared-memory version. To evaluate FMM in a distributed manner, use a subclass
-of :class:`boxtree.distributed.calculation.DistributedExpansionWrangler` in
-:func:`boxtree.fmm.drive_fmm`.
+of :class:`boxtree.distributed.calculation.DistributedExpansionWranglerMixin`
+in :func:`boxtree.fmm.drive_fmm`.
 
 """
 
@@ -108,9 +108,7 @@
 import numpy as np
 from mpi4py import MPI
 
-import pyopencl as cl
-import pyopencl.array
-
+from boxtree.array_context import PyOpenCLArrayContext
 from boxtree.cost import FMMCostModel
 
 
@@ -128,9 +126,10 @@ class MPITags(enum.IntEnum):
 
 
 def dtype_to_mpi(dtype):
-    """ This function translates a numpy datatype into the corresponding type used in
+    """This function translates a numpy datatype into the corresponding type used in
     mpi4py.
     """
+
     if hasattr(MPI, "_typedict"):
         typedict = MPI._typedict
     elif hasattr(MPI, "__TypeDict__"):
@@ -151,7 +150,7 @@ def dtype_to_mpi(dtype):
 # {{{ DistributedFMMRunner
 
 def make_distributed_wrangler(
-        queue, global_tree, traversal_builder, wrangler_factory,
+        actx: PyOpenCLArrayContext, global_tree, traversal_builder, wrangler_factory,
         calibration_params, comm):
     """Helper function for constructing the distributed wrangler on each rank.
 
@@ -163,7 +162,6 @@ def make_distributed_wrangler(
         where the wrangler is constructed according to *wrangler_factory* and
         the indices are passed to :func:`boxtree.fmm.drive_fmm`.
     """
-
     mpi_rank = comm.Get_rank()
 
     # `tree_in_device_memory` is True if the global tree is in the device memory
@@ -174,7 +172,7 @@ def make_distributed_wrangler(
     # worker ranks.
     tree_in_device_memory = None
     if mpi_rank == 0:
-        tree_in_device_memory = isinstance(global_tree.targets[0], cl.array.Array)
+        tree_in_device_memory = isinstance(global_tree.targets[0], actx.array_types)
     tree_in_device_memory = comm.bcast(tree_in_device_memory, root=0)
 
     # {{{ Broadcast the global tree
@@ -182,7 +180,7 @@ def make_distributed_wrangler(
     global_tree_host = None
     if mpi_rank == 0:
         if tree_in_device_memory:
-            global_tree_host = global_tree.get(queue)
+            global_tree_host = actx.to_numpy(global_tree)
         else:
             global_tree_host = global_tree
 
@@ -192,11 +190,11 @@ def make_distributed_wrangler(
     if mpi_rank == 0 and tree_in_device_memory:
         global_tree_dev = global_tree
     else:
-        global_tree_dev = global_tree_host.to_device(queue)
-    global_tree_dev = global_tree_dev.with_queue(queue)
+        global_tree_dev = actx.from_numpy(global_tree_host)
+    global_tree_dev = actx.thaw(global_tree_dev)
 
-    global_trav_dev, _ = traversal_builder(queue, global_tree_dev)
-    global_trav_host = global_trav_dev.get(queue)
+    global_trav_dev, _ = traversal_builder(actx, global_tree_dev)
+    global_trav_host = actx.to_numpy(global_trav_dev)
     global_trav = global_trav_dev if tree_in_device_memory else global_trav_host
 
     # }}}
@@ -215,16 +213,16 @@ def make_distributed_wrangler(
             warnings.warn("Calibration parameters for the cost model are not "
                           "supplied. The default one will be used.",
                           stacklevel=2)
-            calibration_params = \
-                FMMCostModel.get_unit_calibration_params()
+            calibration_params = FMMCostModel.get_unit_calibration_params()
 
         # We need to construct a wrangler in order to access `level_orders`
         global_wrangler = wrangler_factory(global_trav, global_trav)
 
         cost_per_box = cost_model.cost_per_box(
-            queue, global_trav_dev, global_wrangler.level_orders,
+            actx, global_trav_dev, global_wrangler.level_orders,
             calibration_params
-        ).get()
+        )
+        cost_per_box = actx.to_numpy(cost_per_box)
 
     from boxtree.distributed.partition import partition_work
     responsible_boxes_list = partition_work(cost_per_box, global_trav_host, comm)
@@ -235,7 +233,7 @@ def make_distributed_wrangler(
 
     from boxtree.distributed.local_tree import generate_local_tree
     local_tree, src_idx, tgt_idx = generate_local_tree(
-        queue, global_trav_host, responsible_boxes_list, comm)
+        actx, global_trav_dev, actx.from_numpy(responsible_boxes_list), comm)
 
     # }}}
 
@@ -249,12 +247,12 @@ def make_distributed_wrangler(
     # {{{ Compute traversal object on each rank
 
     from boxtree.distributed.local_traversal import generate_local_travs
-    local_trav_dev = generate_local_travs(queue, local_tree, traversal_builder)
+    local_trav_dev = generate_local_travs(actx, local_tree, traversal_builder)
 
     if not tree_in_device_memory:
-        local_trav = local_trav_dev.get(queue=queue)
+        local_trav = actx.to_numpy(local_trav_dev)
     else:
-        local_trav = local_trav_dev.with_queue(None)
+        local_trav = actx.freeze(local_trav_dev)
 
     # }}}
 
@@ -269,7 +267,7 @@ class DistributedFMMRunner:
     .. automethod:: __init__
     .. automethod:: drive_dfmm
     """
-    def __init__(self, queue, global_tree,
+    def __init__(self, array_context: PyOpenCLArrayContext, global_tree,
                  traversal_builder,
                  wrangler_factory,
                  calibration_params=None, comm=MPI.COMM_WORLD):
@@ -292,7 +290,7 @@ def __init__(self, queue, global_tree,
         """
         self.wrangler, self.src_idx_all_ranks, self.tgt_idx_all_ranks = \
             make_distributed_wrangler(
-                queue, global_tree, traversal_builder, wrangler_factory,
+                array_context, global_tree, traversal_builder, wrangler_factory,
                 calibration_params, comm)
 
     def drive_dfmm(self, source_weights, timing_data=None):
diff --git a/boxtree/distributed/calculation.py b/boxtree/distributed/calculation.py
index 3bda2d3b..22ad296d 100644
--- a/boxtree/distributed/calculation.py
+++ b/boxtree/distributed/calculation.py
@@ -29,13 +29,12 @@
 from mako.template import Template
 from mpi4py import MPI
 
-import pyopencl as cl
 from pyopencl.elementwise import ElementwiseKernel
 from pyopencl.tools import dtype_to_ctype
 from pytools import memoize_method
 
+from boxtree.array_context import PyOpenCLArrayContext
 from boxtree.distributed import MPITags
-from boxtree.fmm import ExpansionWranglerInterface
 from boxtree.pyfmmlib_integration import FMMLibExpansionWrangler
 
 
@@ -44,35 +43,41 @@
 
 # {{{ Distributed FMM wrangler
 
-class DistributedExpansionWrangler(ExpansionWranglerInterface):
-    """Distributed expansion wrangler base class.
+class DistributedExpansionWranglerMixin:
+    """Distributed expansion wrangler helper class.
 
-    This is an abstract class and should not be directly instantiated. Instead, it is
-    expected that all distributed wranglers should be subclasses of this class.
+    This class is meant to aid in adding distributed capabilities to wranglers.
+    All distributed wranglers should inherit from this class.
+
+    .. attribute:: comm
+    .. attribute:: global_traversal
+    .. attribute:: communicate_mpoles_via_allreduce
 
-    .. automethod:: __init__
     .. automethod:: distribute_source_weights
     .. automethod:: gather_potential_results
     .. automethod:: communicate_mpoles
     """
-    def __init__(self, context, comm, global_traversal,
-                 traversal_in_device_memory,
-                 communicate_mpoles_via_allreduce=False):
-        self.context = context
-        self.comm = comm
-        self.global_traversal = global_traversal
-        self.traversal_in_device_memory = traversal_in_device_memory
-        self.communicate_mpoles_via_allreduce = communicate_mpoles_via_allreduce
 
-    def distribute_source_weights(self, src_weight_vecs, src_idx_all_ranks):
-        mpi_rank = self.comm.Get_rank()
-        mpi_size = self.comm.Get_size()
+    @property
+    @memoize_method
+    def mpi_rank(self):
+        return self.comm.Get_rank()
+
+    @property
+    @memoize_method
+    def mpi_size(self):
+        return self.comm.Get_size()
+
+    @property
+    def is_mpi_root(self):
+        return self.mpi_rank == 0
 
-        if mpi_rank == 0:
+    def distribute_source_weights(self, src_weight_vecs, src_idx_all_ranks):
+        if self.is_mpi_root:
             distribute_weight_req = []
-            local_src_weight_vecs = np.empty((mpi_size,), dtype=object)
+            local_src_weight_vecs = np.empty((self.mpi_size,), dtype=object)
 
-            for irank in range(mpi_size):
+            for irank in range(self.mpi_size):
                 local_src_weight_vecs[irank] = [
                     source_weights[src_idx_all_ranks[irank]]
                     for source_weights in src_weight_vecs]
@@ -91,22 +96,18 @@ def distribute_source_weights(self, src_weight_vecs, src_idx_all_ranks):
         return local_src_weight_vecs
 
     def gather_potential_results(self, potentials, tgt_idx_all_ranks):
-        mpi_rank = self.comm.Get_rank()
-        mpi_size = self.comm.Get_size()
-
         from boxtree.distributed import dtype_to_mpi
         potentials_mpi_type = dtype_to_mpi(potentials.dtype)
-
         gathered_potentials = None
 
-        if mpi_rank == 0:
+        if self.is_mpi_root:
             # The root rank received calculated potentials from all worker ranks
-            potentials_all_ranks = np.empty((mpi_size,), dtype=object)
+            potentials_all_ranks = np.empty((self.mpi_size,), dtype=object)
             potentials_all_ranks[0] = potentials
 
             recv_reqs = []
 
-            for irank in range(1, mpi_size):
+            for irank in range(1, self.mpi_size):
                 potentials_all_ranks[irank] = np.empty(
                     tgt_idx_all_ranks[irank].shape, dtype=potentials.dtype)
 
@@ -121,7 +122,7 @@ def gather_potential_results(self, potentials, tgt_idx_all_ranks):
             gathered_potentials = np.empty(
                 self.global_traversal.tree.ntargets, dtype=potentials.dtype)
 
-            for irank in range(mpi_size):
+            for irank in range(self.mpi_size):
                 gathered_potentials[tgt_idx_all_ranks[irank]] = (
                     potentials_all_ranks[irank])
         else:
@@ -135,8 +136,13 @@ def _slice_mpoles(self, mpoles, slice_indices):
         if len(slice_indices) == 0:
             return np.empty((0,), dtype=mpoles.dtype)
 
+        level_start_box_nrs = self.traversal.tree.level_start_box_nrs
+        if not isinstance(level_start_box_nrs, np.ndarray):
+            actx = self.tree_indep._setup_actx
+            level_start_box_nrs = actx.to_numpy(level_start_box_nrs)
+
         level_start_slice_indices = np.searchsorted(
-            slice_indices, self.traversal.tree.level_start_box_nrs)
+            slice_indices, level_start_box_nrs)
         mpoles_list = []
 
         for ilevel in range(self.traversal.tree.nlevels):
@@ -156,8 +162,13 @@ def _update_mpoles(self, mpoles, mpole_updates, slice_indices):
         if len(slice_indices) == 0:
             return
 
+        level_start_box_nrs = self.traversal.tree.level_start_box_nrs
+        if not isinstance(level_start_box_nrs, np.ndarray):
+            actx = self.tree_indep._setup_actx
+            level_start_box_nrs = actx.to_numpy(level_start_box_nrs)
+
         level_start_slice_indices = np.searchsorted(
-            slice_indices, self.traversal.tree.level_start_box_nrs)
+            slice_indices, level_start_box_nrs)
         mpole_updates_start = 0
 
         for ilevel in range(self.traversal.tree.nlevels):
@@ -178,60 +189,61 @@ def _update_mpoles(self, mpoles, mpole_updates, slice_indices):
 
                 mpole_updates_start = mpole_updates_end
 
-    @memoize_method
-    def find_boxes_used_by_subrange_kernel(self, box_id_dtype):
-        return ElementwiseKernel(
-            self.context,
-            Template(r"""
-                ${box_id_t} *contributing_boxes_list,
-                int subrange_start,
-                int subrange_end,
-                ${box_id_t} *box_to_user_rank_starts,
-                int *box_to_user_rank_lists,
-                char *box_in_subrange
-            """).render(
-                box_id_t=dtype_to_ctype(box_id_dtype),
-            ),
-            Template(r"""
-                ${box_id_t} ibox = contributing_boxes_list[i];
-                ${box_id_t} iuser_start = box_to_user_rank_starts[ibox];
-                ${box_id_t} iuser_end = box_to_user_rank_starts[ibox + 1];
-                for(${box_id_t} iuser = iuser_start; iuser < iuser_end; iuser++) {
-                    int useri = box_to_user_rank_lists[iuser];
-                    if(subrange_start <= useri && useri < subrange_end) {
-                        box_in_subrange[i] = 1;
+    def find_boxes_used_by_subrange_kernel(self, actx, box_id_dtype):
+        from pytools import memoize_in
+
+        @memoize_in(actx, (type(self), box_id_dtype))
+        def get_kernel():
+            return ElementwiseKernel(
+                actx.context,
+                Template(r"""
+                    ${box_id_t} *contributing_boxes_list,
+                    int subrange_start,
+                    int subrange_end,
+                    ${box_id_t} *box_to_user_rank_starts,
+                    int *box_to_user_rank_lists,
+                    char *box_in_subrange
+                """).render(
+                    box_id_t=dtype_to_ctype(box_id_dtype),
+                ),
+                Template(r"""
+                    ${box_id_t} ibox = contributing_boxes_list[i];
+                    ${box_id_t} iuser_start = box_to_user_rank_starts[ibox];
+                    ${box_id_t} iuser_end = box_to_user_rank_starts[ibox + 1];
+                    for(${box_id_t} iuser = iuser_start; iuser < iuser_end; iuser++) {
+                        int useri = box_to_user_rank_lists[iuser];
+                        if(subrange_start <= useri && useri < subrange_end) {
+                            box_in_subrange[i] = 1;
+                        }
                     }
-                }
-            """).render(
-                box_id_t=dtype_to_ctype(box_id_dtype)
-            ),
-            "find_boxes_used_by_subrange"
-        )
+                """).render(
+                    box_id_t=dtype_to_ctype(box_id_dtype)
+                ),
+                "find_boxes_used_by_subrange"
+            )
+
+        return get_kernel()
 
     def find_boxes_used_by_subrange(
-            self, subrange, box_to_user_rank_starts, box_to_user_rank_lists,
+            self, actx: PyOpenCLArrayContext,
+            subrange, box_to_user_rank_starts, box_to_user_rank_lists,
             contributing_boxes_list):
         """Test whether the multipole expansions of the contributing boxes are used
         by at least one box in a range.
 
         :arg subrange: the range is represented by ``(subrange[0], subrange[1])``.
-        :arg box_to_user_rank_starts: a :class:`pyopencl.array.Array` object
-            indicating the start and end index in *box_to_user_rank_lists* for each
+        :arg box_to_user_rank_starts: an array object indicating the start and
+            end index in *box_to_user_rank_lists* for each box in
+            *contributing_boxes_list*.
+        :arg box_to_user_rank_lists: an array object storing the users of each
             box in *contributing_boxes_list*.
-        :arg box_to_user_rank_lists: a :class:`pyopencl.array.Array` object storing
-            the users of each box in *contributing_boxes_list*.
-        :returns: a :class:`pyopencl.array.Array` object with the same shape as
-            *contributing_boxes_list*, where the i-th entry is 1 if
-            ``contributing_boxes_list[i]`` is used by at least on box in the
-            subrange specified.
+        :returns: an array object with the same shape as *contributing_boxes_list*,
+            where the i-th entry is 1 if ``contributing_boxes_list[i]`` is used
+            by at least one box in the subrange specified.
         """
-        box_in_subrange = cl.array.zeros(
-            contributing_boxes_list.queue,
-            contributing_boxes_list.shape[0],
-            dtype=np.int8
-        )
+        box_in_subrange = actx.zeros(contributing_boxes_list.shape[0], dtype=np.int8)
         knl = self.find_boxes_used_by_subrange_kernel(
-                self.traversal.tree.box_id_dtype)
+                actx, self.traversal.tree.box_id_dtype)
 
         knl(
             contributing_boxes_list,
@@ -244,7 +256,8 @@ def find_boxes_used_by_subrange(
 
         return box_in_subrange
 
-    def communicate_mpoles(self, mpole_exps, return_stats=False):
+    def communicate_mpoles(self, actx: PyOpenCLArrayContext,
+                           mpole_exps, return_stats=False):
         """Based on Algorithm 3: Reduce and Scatter in Lashuk et al. [1]_.
 
         The main idea is to mimic an allreduce as done on a hypercube network, but to
@@ -253,12 +266,11 @@ def communicate_mpoles(self, mpole_exps, return_stats=False):
 
         .. [1] Lashuk, Ilya, Aparna Chandramowlishwaran, Harper Langston,
             Tuan-Anh Nguyen, Rahul Sampath, Aashay Shringarpure, Richard Vuduc,
-            Lexing Ying, Denis Zorin, and George Biros. “A massively parallel
-            adaptive fast multipole method on heterogeneous architectures."
-            Communications of the ACM 55, no. 5 (2012): 101-109.
+            Lexing Ying, Denis Zorin, and George Biros. "A massively parallel
+            adaptive fast multipole method on heterogeneous architectures",
+            Communications of the ACM 55, no. 5 (2012): 101-109,
+            `DOI <https://doi.org/10.1145/1654059.1654118>`__.
         """
-        mpi_rank = self.comm.Get_rank()
-        mpi_size = self.comm.Get_size()
         tree = self.traversal.tree
 
         if self.communicate_mpoles_via_allreduce:
@@ -284,16 +296,15 @@ def communicate_mpoles(self, mpole_exps, return_stats=False):
         # Initially, this set consists of the boxes satisfying condition (a), which
         # are precisely the boxes owned by this process and their ancestors.
         if self.traversal_in_device_memory:
-            with cl.CommandQueue(self.context) as queue:
-                contributing_boxes = tree.ancestor_mask.get(queue=queue)
-                responsible_boxes_list = tree.responsible_boxes_list.get(queue=queue)
+            contributing_boxes = actx.to_numpy(tree.ancestor_mask)
+            responsible_boxes_list = actx.to_numpy(tree.responsible_boxes_list)
         else:
-            contributing_boxes = tree.ancestor_mask.copy()
+            contributing_boxes = np.copy(tree.ancestor_mask)
             responsible_boxes_list = tree.responsible_boxes_list
         contributing_boxes[responsible_boxes_list] = 1
 
         from boxtree.tools import AllReduceCommPattern
-        comm_pattern = AllReduceCommPattern(mpi_rank, mpi_size)
+        comm_pattern = AllReduceCommPattern(self.mpi_rank, self.mpi_size)
 
         # Temporary buffers for receiving data
         mpole_exps_buf = np.empty(mpole_exps.shape, dtype=mpole_exps.dtype)
@@ -303,15 +314,13 @@ def communicate_mpoles(self, mpole_exps, return_stats=False):
         stats["bytes_recvd_by_stage"] = []
 
         if self.traversal_in_device_memory:
-            box_to_user_rank_starts_dev = \
-                tree.box_to_user_rank_starts.with_queue(None)
-            box_to_user_rank_lists_dev = tree.box_to_user_rank_lists.with_queue(None)
+            box_to_user_rank_starts_dev = actx.freeze(tree.box_to_user_rank_starts)
+            box_to_user_rank_lists_dev = actx.freeze(tree.box_to_user_rank_lists)
         else:
-            with cl.CommandQueue(self.context) as queue:
-                box_to_user_rank_starts_dev = cl.array.to_device(
-                    queue, tree.box_to_user_rank_starts).with_queue(None)
-                box_to_user_rank_lists_dev = cl.array.to_device(
-                    queue, tree.box_to_user_rank_lists).with_queue(None)
+            box_to_user_rank_starts_dev = actx.freeze(
+                actx.from_numpy(tree.box_to_user_rank_starts))
+            box_to_user_rank_lists_dev = actx.freeze(
+                actx.from_numpy(tree.box_to_user_rank_lists))
 
         while not comm_pattern.done():
             send_requests = []
@@ -325,18 +334,15 @@ def communicate_mpoles(self, mpole_exps, return_stats=False):
                     tree.box_id_dtype
                 )
 
-                with cl.CommandQueue(self.context) as queue:
-                    contributing_boxes_list_dev = cl.array.to_device(
-                        queue, contributing_boxes_list)
-
-                    box_in_subrange = self.find_boxes_used_by_subrange(
-                        message_subrange,
-                        box_to_user_rank_starts_dev, box_to_user_rank_lists_dev,
-                        contributing_boxes_list_dev
-                    )
-
-                    box_in_subrange_host = box_in_subrange.get().astype(bool)
+                contributing_boxes_list_dev = actx.from_numpy(
+                    contributing_boxes_list)
+                box_in_subrange = self.find_boxes_used_by_subrange(
+                    actx, message_subrange,
+                    box_to_user_rank_starts_dev, box_to_user_rank_lists_dev,
+                    contributing_boxes_list_dev
+                )
 
+                box_in_subrange_host = actx.to_numpy(box_in_subrange).astype(bool)
                 relevant_boxes_list = contributing_boxes_list[
                     box_in_subrange_host
                 ].astype(tree.box_id_dtype)
@@ -385,7 +391,7 @@ def communicate_mpoles(self, mpole_exps, return_stats=False):
 
                 # Update data structures.
                 self._update_mpoles(
-                        mpole_exps, mpole_exps_buf, boxes_list_buf[:nboxes])
+                    mpole_exps, mpole_exps_buf, boxes_list_buf[:nboxes])
 
                 contributing_boxes[boxes_list_buf[:nboxes]] = 1
 
@@ -397,38 +403,43 @@ def communicate_mpoles(self, mpole_exps, return_stats=False):
         if return_stats:
             return stats
 
-    def finalize_potentials(self, potentials, template_ary):
-        if self.comm.Get_rank() == 0:
-            return super().finalize_potentials(potentials, template_ary)
-        else:
-            return None
-
 
 class DistributedFMMLibExpansionWrangler(
-        DistributedExpansionWrangler, FMMLibExpansionWrangler):
+            DistributedExpansionWranglerMixin,
+            FMMLibExpansionWrangler):
     def __init__(
-            self, context, comm, tree_indep, local_traversal, global_traversal,
+            self, comm, tree_indep, local_traversal, global_traversal,
             fmm_level_to_order=None,
             communicate_mpoles_via_allreduce=False,
             **kwargs):
-        DistributedExpansionWrangler.__init__(
-            self, context, comm, global_traversal, False,
-            communicate_mpoles_via_allreduce=communicate_mpoles_via_allreduce)
         FMMLibExpansionWrangler.__init__(
             self, tree_indep, local_traversal,
             fmm_level_to_order=fmm_level_to_order, **kwargs)
 
+        self.comm = comm
+        self.traversal_in_device_memory = False
+        self.global_traversal = global_traversal
+        self.communicate_mpoles_via_allreduce = communicate_mpoles_via_allreduce
+
     # TODO: use log_process like FMMLibExpansionWrangler?
     def reorder_sources(self, source_array):
-        if self.comm.Get_rank() == 0:
+        if self.is_mpi_root:
             return source_array[..., self.global_traversal.tree.user_source_ids]
         else:
             return None
 
     def reorder_potentials(self, potentials):
-        if self.comm.Get_rank() == 0:
+        if self.is_mpi_root:
             return potentials[self.global_traversal.tree.sorted_target_ids]
         else:
             return None
 
+    def finalize_potentials(self, potentials, template_ary):
+        if self.is_mpi_root:
+            return super().finalize_potentials(potentials, template_ary)
+        else:
+            return None
+
 # }}}
+
+# vim: fdm=marker
diff --git a/boxtree/distributed/local_traversal.py b/boxtree/distributed/local_traversal.py
index 6c6fbc4a..dff4ee7e 100644
--- a/boxtree/distributed/local_traversal.py
+++ b/boxtree/distributed/local_traversal.py
@@ -29,34 +29,30 @@
 
 
 def generate_local_travs(
-        queue, local_tree, traversal_builder, merge_close_lists=False):
+        actx, local_tree, traversal_builder, merge_close_lists=False):
     """Generate local traversal from local tree.
 
-    :arg queue: a :class:`pyopencl.CommandQueue` object.
-    :arg local_tree: the local tree of class
-        `boxtree.tools.ImmutableHostDeviceArray` on which the local traversal
-        object will be constructed.
-    :arg traversal_builder: a function, taken a :class:`pyopencl.CommandQueue` and
-        a tree, returns the traversal object based on the tree.
+    :arg local_tree: the local tree on which the local traversal object will
+        be constructed.
+    :arg traversal_builder: a callable taking a :class:`arraycontext.ArrayContext`
+        and a tree, and returning the traversal object based on the tree.
 
     :return: generated local traversal object in device memory
     """
     start_time = time.time()
 
-    local_tree.with_queue(queue)
-
     # We need `source_boxes_mask` and `source_parent_boxes_mask` here to restrict the
     # multipole formation and upward propagation within the rank's responsible boxes
     # region. Had there not been such restrictions, some sources might be distributed
     # to more than 1 rank and counted multiple times.
     local_trav, _ = traversal_builder(
-        queue, local_tree.to_device(queue),
-        source_boxes_mask=local_tree.responsible_boxes_mask.device,
-        source_parent_boxes_mask=local_tree.ancestor_mask.device
+        actx, local_tree,
+        source_boxes_mask=local_tree.responsible_boxes_mask,
+        source_parent_boxes_mask=local_tree.ancestor_mask
     )
 
     if merge_close_lists and local_tree.targets_have_extent:
-        local_trav = local_trav.merge_close_lists(queue)
+        local_trav = local_trav.merge_close_lists(actx)
 
     logger.info("Generate local traversal in %f sec.", time.time() - start_time)
 
diff --git a/boxtree/distributed/local_tree.py b/boxtree/distributed/local_tree.py
index 91eacae9..1a5bb1a6 100644
--- a/boxtree/distributed/local_tree.py
+++ b/boxtree/distributed/local_tree.py
@@ -26,13 +26,15 @@
 from dataclasses import dataclass
 
 import numpy as np
+from arraycontext import Array, ArrayOrContainer
 from mako.template import Template
 
-import pyopencl as cl
+from pyopencl.elementwise import ElementwiseKernel
 from pyopencl.tools import dtype_to_ctype
 from pytools import memoize_method
 
 from boxtree import Tree
+from boxtree.array_context import PyOpenCLArrayContext, dataclass_array_container
 
 
 logger = logging.getLogger(__name__)
@@ -48,16 +50,21 @@ class LocalTreeGeneratorCodeContainer:
     """Objects of this type serve as a place to keep the code needed for
     :func:`generate_local_tree`.
     """
-    def __init__(self, cl_context, dimensions, particle_id_dtype, coord_dtype):
-        self.cl_context = cl_context
+    def __init__(self, array_context: PyOpenCLArrayContext,
+                 dimensions, particle_id_dtype, coord_dtype):
+        self._setup_actx = array_context
         self.dimensions = dimensions
         self.particle_id_dtype = particle_id_dtype
         self.coord_dtype = coord_dtype
 
+    @property
+    def context(self):
+        return self._setup_actx.context
+
     @memoize_method
     def particle_mask_kernel(self):
-        return cl.elementwise.ElementwiseKernel(
-            self.cl_context,
+        return ElementwiseKernel(
+            self.context,
             arguments=Template("""
                 __global char *responsible_boxes,
                 __global ${particle_id_t} *box_particle_starts,
@@ -82,7 +89,7 @@ def particle_mask_kernel(self):
     def mask_scan_kernel(self):
         from pyopencl.scan import GenericScanKernel
         return GenericScanKernel(
-            self.cl_context, self.particle_id_dtype,
+            self.context, self.particle_id_dtype,
             arguments=Template("""
                 __global ${mask_t} *ary,
                 __global ${mask_t} *scan
@@ -123,8 +130,8 @@ def mask_scan_kernel(self):
 
     @memoize_method
     def fetch_local_particles_kernel(self, particles_have_extent):
-        return cl.elementwise.ElementwiseKernel(
-            self.cl_context,
+        return ElementwiseKernel(
+            self.context,
             self.fetch_local_particles_arguments.render(
                 mask_t=dtype_to_ctype(self.particle_id_dtype),
                 coord_t=dtype_to_ctype(self.coord_dtype),
@@ -141,15 +148,15 @@ def fetch_local_particles_kernel(self, particles_have_extent):
     @memoize_method
     def mask_compressor_kernel(self):
         from boxtree.tools import MaskCompressorKernel
-        return MaskCompressorKernel(self.cl_context)
+        return MaskCompressorKernel(self._setup_actx)
 
     @memoize_method
     def modify_target_flags_kernel(self):
         from boxtree import box_flags_enum
         box_flag_t = dtype_to_ctype(box_flags_enum.dtype)
 
-        return cl.elementwise.ElementwiseKernel(
-            self.cl_context,
+        return ElementwiseKernel(
+            self.context,
             Template("""
                 __global ${particle_id_t} *box_target_counts_nonchild,
                 __global ${particle_id_t} *box_target_counts_cumul,
@@ -173,18 +180,19 @@ def modify_target_flags_kernel(self):
         )
 
 
-@dataclass
+@dataclass(frozen=True)
 class LocalParticlesAndLists:
-    particles: np.ndarray
-    particle_radii: cl.array.Array | None
-    box_particle_starts: cl.array.Array
-    box_particle_counts_nonchild: cl.array.Array
-    box_particle_counts_cumul: cl.array.Array
+    particles: ArrayOrContainer
+    particle_radii: Array | None
+    box_particle_starts: Array
+    box_particle_counts_nonchild: Array
+    box_particle_counts_cumul: Array
     particle_idx: np.ndarray
 
 
 def construct_local_particles_and_lists(
-        queue, code, dimensions, num_boxes, num_global_particles,
+        actx: PyOpenCLArrayContext,
+        code, dimensions, num_boxes, num_global_particles,
         particle_id_dtype, coord_dtype, particles_have_extent,
         box_mask,
         global_particles, global_particle_radii,
@@ -195,18 +203,19 @@ def construct_local_particles_and_lists(
     """
     # {{{ calculate the particle mask
 
-    particle_mask = cl.array.zeros(
-        queue, num_global_particles, dtype=particle_id_dtype)
-
+    particle_mask = actx.zeros(num_global_particles, dtype=particle_id_dtype)
     code.particle_mask_kernel()(
-        box_mask, box_particle_starts, box_particle_counts_nonchild, particle_mask)
+        box_mask,
+        box_particle_starts,
+        box_particle_counts_nonchild,
+        particle_mask)
 
     # }}}
 
     # {{{ calculate the scan of the particle mask
 
-    global_to_local_particle_index = cl.array.empty(
-        queue, num_global_particles + 1, dtype=particle_id_dtype)
+    global_to_local_particle_index = actx.np.zeros(
+        num_global_particles + 1, dtype=particle_id_dtype)
 
     global_to_local_particle_index[0] = 0
     code.mask_scan_kernel()(particle_mask, global_to_local_particle_index)
@@ -215,19 +224,18 @@ def construct_local_particles_and_lists(
 
     # {{{ fetch the local particles
 
-    num_local_particles = global_to_local_particle_index[-1].get(queue).item()
-
-    local_particles = [
-        cl.array.empty(queue, num_local_particles, dtype=coord_dtype)
-        for _ in range(dimensions)]
+    from pytools.obj_array import make_obj_array
+    num_local_particles = actx.to_numpy(global_to_local_particle_index[-1]).item()
+    local_particles = make_obj_array([
+        actx.zeros(num_local_particles, coord_dtype)
+        for _ in range(dimensions)
+        ])
 
     from pytools.obj_array import make_obj_array
     local_particles = make_obj_array(local_particles)
 
-    local_particle_radii = None
     if particles_have_extent:
-        local_particle_radii = cl.array.empty(
-            queue, num_local_particles, dtype=coord_dtype)
+        local_particle_radii = actx.np.zeros(num_local_particles, dtype=coord_dtype)
 
         code.fetch_local_particles_kernel(True)(
             particle_mask, global_to_local_particle_index,
@@ -236,6 +244,7 @@ def construct_local_particles_and_lists(
             global_particle_radii,
             local_particle_radii)
     else:
+        local_particle_radii = None
         code.fetch_local_particles_kernel(False)(
             particle_mask, global_to_local_particle_index,
             *global_particles.tolist(),
@@ -245,9 +254,9 @@ def construct_local_particles_and_lists(
 
     local_box_particle_starts = global_to_local_particle_index[box_particle_starts]
 
-    box_counts_all_zeros = cl.array.zeros(queue, num_boxes, dtype=particle_id_dtype)
+    box_counts_all_zeros = actx.zeros(num_boxes, dtype=particle_id_dtype)
 
-    local_box_particle_counts_nonchild = cl.array.if_positive(
+    local_box_particle_counts_nonchild = actx.np.where(
         box_mask, box_particle_counts_nonchild, box_counts_all_zeros)
 
     box_particle_ends_cumul = box_particle_starts + box_particle_counts_cumul
@@ -258,18 +267,20 @@ def construct_local_particles_and_lists(
 
     # }}}
 
-    particle_mask = particle_mask.get(queue=queue).astype(bool)
+    particle_mask = actx.to_numpy(particle_mask).astype(bool)
     particle_idx = np.arange(num_global_particles)[particle_mask]
 
     return LocalParticlesAndLists(
-        local_particles,
-        local_particle_radii,
-        local_box_particle_starts,
-        local_box_particle_counts_nonchild,
-        local_box_particle_counts_cumul,
-        particle_idx)
+        particles=local_particles,
+        particle_radii=local_particle_radii,
+        box_particle_starts=local_box_particle_starts,
+        box_particle_counts_nonchild=local_box_particle_counts_nonchild,
+        box_particle_counts_cumul=local_box_particle_counts_cumul,
+        particle_idx=particle_idx)
 
 
+@dataclass_array_container
+@dataclass(frozen=True)
 class LocalTree(Tree):
     """
     Inherits from :class:`boxtree.Tree`.
@@ -288,13 +299,21 @@ class LocalTree(Tree):
         propagated from an ancestor) List 2.
     """
 
+    box_to_user_rank_starts: Array
+    box_to_user_rank_lists: Array
+
+    responsible_boxes_list: Array
+    responsible_boxes_mask: Array
+    ancestor_mask: Array
+
 
-def generate_local_tree(queue, global_traversal, responsible_boxes_list, comm):
+def generate_local_tree(
+        actx: PyOpenCLArrayContext,
+        global_traversal, responsible_boxes_list, comm):
     """Generate the local tree for the current rank.
 
     This is an MPI-collective routine on *comm*.
 
-    :arg queue: a :class:`pyopencl.CommandQueue` object.
     :arg global_traversal: Global :class:`boxtree.traversal.FMMTraversalInfo` object
         on host memory.
     :arg responsible_boxes_list: a :class:`numpy.ndarray` object containing the
@@ -307,9 +326,9 @@ def generate_local_tree(queue, global_traversal, responsible_boxes_list, comm):
         global tree. ``src_idx`` and ``tgt_idx`` are needed for distributing source
         weights from root rank and assembling calculated potentials on the root rank.
     """
-    global_tree = global_traversal.tree
+    global_tree = actx.thaw(global_traversal.tree)
     code = LocalTreeGeneratorCodeContainer(
-            queue.context, global_tree.dimensions,
+            actx, global_tree.dimensions,
             global_tree.particle_id_dtype, global_tree.coord_dtype)
 
     mpi_rank = comm.Get_rank()
@@ -318,33 +337,31 @@ def generate_local_tree(queue, global_traversal, responsible_boxes_list, comm):
     start_time = time.time()
 
     from boxtree.distributed.partition import get_box_masks
-    box_masks = get_box_masks(queue, global_traversal, responsible_boxes_list)
-
-    global_tree_dev = global_tree.to_device(queue).with_queue(queue)
+    box_masks = get_box_masks(actx, global_traversal, responsible_boxes_list)
 
     local_sources_and_lists = construct_local_particles_and_lists(
-        queue, code, global_tree.dimensions, global_tree.nboxes,
+        actx, code, global_tree.dimensions, global_tree.nboxes,
         global_tree.nsources,
         global_tree.particle_id_dtype, global_tree.coord_dtype,
         global_tree.sources_have_extent,
         box_masks.point_src_boxes,
-        global_tree_dev.sources,
-        global_tree_dev.sources_radii if global_tree.sources_have_extent else None,
-        global_tree_dev.box_source_starts,
-        global_tree_dev.box_source_counts_nonchild,
-        global_tree_dev.box_source_counts_cumul)
+        global_tree.sources,
+        global_tree.sources_radii if global_tree.sources_have_extent else None,
+        global_tree.box_source_starts,
+        global_tree.box_source_counts_nonchild,
+        global_tree.box_source_counts_cumul)
 
     local_targets_and_lists = construct_local_particles_and_lists(
-        queue, code, global_tree.dimensions, global_tree.nboxes,
+        actx, code, global_tree.dimensions, global_tree.nboxes,
         global_tree.ntargets,
         global_tree.particle_id_dtype, global_tree.coord_dtype,
         global_tree.targets_have_extent,
         box_masks.responsible_boxes,
-        global_tree_dev.targets,
-        global_tree_dev.target_radii if global_tree.targets_have_extent else None,
-        global_tree_dev.box_target_starts,
-        global_tree_dev.box_target_counts_nonchild,
-        global_tree_dev.box_target_counts_cumul)
+        global_tree.targets,
+        global_tree.target_radii if global_tree.targets_have_extent else None,
+        global_tree.box_target_starts,
+        global_tree.box_target_counts_nonchild,
+        global_tree.box_target_counts_cumul)
 
     # {{{ compute the users of multipole expansions of each box on the root rank
 
@@ -354,24 +371,26 @@ def generate_local_tree(queue, global_traversal, responsible_boxes_list, comm):
             (mpi_size, global_tree.nboxes),
             dtype=box_masks.multipole_src_boxes.dtype)
     comm.Gather(
-        box_masks.multipole_src_boxes.get(), multipole_src_boxes_all_ranks, root=0)
+        actx.to_numpy(box_masks.multipole_src_boxes),
+        multipole_src_boxes_all_ranks, root=0)
 
     box_to_user_rank_starts = None
     box_to_user_rank_lists = None
 
     if mpi_rank == 0:
-        multipole_src_boxes_all_ranks = cl.array.to_device(
-            queue, multipole_src_boxes_all_ranks)
+        multipole_src_boxes_all_ranks = actx.from_numpy(
+            multipole_src_boxes_all_ranks)
 
         (box_to_user_rank_starts, box_to_user_rank_lists, evt) = \
             code.mask_compressor_kernel()(
-                queue, multipole_src_boxes_all_ranks.transpose(),
+                actx, multipole_src_boxes_all_ranks.transpose(),
                 list_dtype=np.int32)
 
-        cl.wait_for_events([evt])
+        from pyopencl import wait_for_events
+        wait_for_events([evt])
 
-        box_to_user_rank_starts = box_to_user_rank_starts.get()
-        box_to_user_rank_lists = box_to_user_rank_lists.get()
+        box_to_user_rank_starts = actx.to_numpy(box_to_user_rank_starts)
+        box_to_user_rank_lists = actx.to_numpy(box_to_user_rank_lists)
 
         logger.debug("computing box_to_user: done")
 
@@ -388,7 +407,7 @@ def generate_local_tree(queue, global_traversal, responsible_boxes_list, comm):
     # expansions formed by sources in other ranks. Modifying the source box flags
     # could result in incomplete interaction lists.
 
-    local_box_flags = global_tree_dev.box_flags.copy(queue=queue)
+    local_box_flags = actx.np.copy(global_tree.box_flags)
     code.modify_target_flags_kernel()(
         local_targets_and_lists.box_particle_counts_nonchild,
         local_targets_and_lists.box_particle_counts_cumul,
@@ -396,14 +415,6 @@ def generate_local_tree(queue, global_traversal, responsible_boxes_list, comm):
 
     # }}}
 
-    from pytools.obj_array import make_obj_array
-    local_sources = make_obj_array([
-        local_sources_idim.get(queue=queue)
-        for local_sources_idim in local_sources_and_lists.particles])
-    local_targets = make_obj_array([
-        local_target_idim.get(queue=queue)
-        for local_target_idim in local_targets_and_lists.particles])
-
     local_tree = LocalTree(
         sources_are_targets=global_tree.sources_are_targets,
         sources_have_extent=global_tree.sources_have_extent,
@@ -420,33 +431,34 @@ def generate_local_tree(queue, global_traversal, responsible_boxes_list, comm):
 
         bounding_box=global_tree.bounding_box,
         level_start_box_nrs=global_tree.level_start_box_nrs,
-        level_start_box_nrs_dev=global_tree.level_start_box_nrs_dev,
 
-        sources=local_sources,
-        targets=local_targets,
-        source_radii=(local_sources_and_lists.particle_radii.get(queue=queue)
+        sources=local_sources_and_lists.particles,
+        targets=local_targets_and_lists.particles,
+        source_radii=(
+                local_sources_and_lists.particle_radii
                 if global_tree.sources_have_extent else None),
-        target_radii=(local_targets_and_lists.particle_radii.get(queue=queue)
+        target_radii=(
+                local_targets_and_lists.particle_radii
                 if global_tree.targets_have_extent else None),
 
         box_source_starts=(
-            local_sources_and_lists.box_particle_starts.get(queue=queue)),
+            local_sources_and_lists.box_particle_starts),
         box_source_counts_nonchild=(
-            local_sources_and_lists.box_particle_counts_nonchild.get(queue=queue)),
+            local_sources_and_lists.box_particle_counts_nonchild),
         box_source_counts_cumul=(
-            local_sources_and_lists.box_particle_counts_cumul.get(queue=queue)),
+            local_sources_and_lists.box_particle_counts_cumul),
         box_target_starts=(
-            local_targets_and_lists.box_particle_starts.get(queue=queue)),
+            local_targets_and_lists.box_particle_starts),
         box_target_counts_nonchild=(
-            local_targets_and_lists.box_particle_counts_nonchild.get(queue=queue)),
+            local_targets_and_lists.box_particle_counts_nonchild),
         box_target_counts_cumul=(
-            local_targets_and_lists.box_particle_counts_cumul.get(queue=queue)),
+            local_targets_and_lists.box_particle_counts_cumul),
 
         box_parent_ids=global_tree.box_parent_ids,
         box_child_ids=global_tree.box_child_ids,
         box_centers=global_tree.box_centers,
         box_levels=global_tree.box_levels,
-        box_flags=local_box_flags.get(queue=queue),
+        box_flags=local_box_flags,
 
         user_source_ids=None,
         sorted_target_ids=None,
@@ -459,19 +471,19 @@ def generate_local_tree(queue, global_traversal, responsible_boxes_list, comm):
         _is_pruned=global_tree._is_pruned,
 
         responsible_boxes_list=responsible_boxes_list,
-        responsible_boxes_mask=box_masks.responsible_boxes.get(),
-        ancestor_mask=box_masks.ancestor_boxes.get(),
-        box_to_user_rank_starts=box_to_user_rank_starts,
-        box_to_user_rank_lists=box_to_user_rank_lists
+        responsible_boxes_mask=box_masks.responsible_boxes,
+        ancestor_mask=box_masks.ancestor_boxes,
+        box_to_user_rank_starts=actx.from_numpy(box_to_user_rank_starts),
+        box_to_user_rank_lists=actx.from_numpy(box_to_user_rank_lists),
     )
 
-    local_tree = local_tree.to_host_device_array(queue)
-    local_tree.with_queue(None)
-
-    logger.info("Generate local tree on rank %d in %f sec.",
-            mpi_rank, time.time() - start_time)
+    logger.info("Generate local tree on rank %d in %s sec.",
+        mpi_rank, time.time() - start_time
+    )
 
     return (
-        local_tree,
+        actx.freeze(local_tree),
         local_sources_and_lists.particle_idx,
         local_targets_and_lists.particle_idx)
+
+# vim: fdm=marker
diff --git a/boxtree/distributed/partition.py b/boxtree/distributed/partition.py
index d646f61b..95f40037 100644
--- a/boxtree/distributed/partition.py
+++ b/boxtree/distributed/partition.py
@@ -24,12 +24,15 @@
 from dataclasses import dataclass
 
 import numpy as np
+from arraycontext import Array
 from mako.template import Template
 
-import pyopencl as cl
+from pyopencl.elementwise import ElementwiseKernel
 from pyopencl.tools import dtype_to_ctype
 from pytools import memoize_method
 
+from boxtree.array_context import PyOpenCLArrayContext
+
 
 def get_box_ids_dfs_order(tree):
     """Helper function for getting box ids of a tree in depth-first order.
@@ -118,17 +121,21 @@ def partition_work(cost_per_box, traversal, comm):
 
 
 class GetBoxMasksCodeContainer:
-    def __init__(self, cl_context, box_id_dtype):
-        self.cl_context = cl_context
+    def __init__(self, array_context: PyOpenCLArrayContext, box_id_dtype):
+        self._setup_actx = array_context
         self.box_id_dtype = box_id_dtype
 
+    @property
+    def context(self):
+        return self._setup_actx.context
+
     @memoize_method
     def add_interaction_list_boxes_kernel(self):
         """Given a ``responsible_boxes_mask`` and an interaction list, mark source
         boxes for target boxes in ``responsible_boxes_mask`` in a new separate mask.
         """
-        return cl.elementwise.ElementwiseKernel(
-            self.cl_context,
+        return ElementwiseKernel(
+            self.context,
             Template("""
                 __global ${box_id_t} *box_list,
                 __global char *responsible_boxes_mask,
@@ -154,29 +161,28 @@ def add_interaction_list_boxes_kernel(self):
 
     @memoize_method
     def add_parent_boxes_kernel(self):
-        return cl.elementwise.ElementwiseKernel(
-            self.cl_context,
+        return ElementwiseKernel(
+            self.context,
             "__global char *current, __global char *parent, "
             f"__global {dtype_to_ctype(self.box_id_dtype)} *box_parent_ids",
             "if(i != 0 && current[i]) parent[box_parent_ids[i]] = 1"
         )
 
 
-def get_ancestor_boxes_mask(queue, code, traversal, responsible_boxes_mask):
+def get_ancestor_boxes_mask(actx, code, traversal, responsible_boxes_mask):
     """Query the ancestors of responsible boxes.
 
-    :arg responsible_boxes_mask: A :class:`pyopencl.array.Array` object of shape
-        ``(tree.nboxes,)`` whose i-th entry is 1 if ``i`` is a responsible box.
-    :return: A :class:`pyopencl.array.Array` object of shape ``(tree.nboxes,)`` whose
-        i-th entry is 1 if ``i`` is an ancestor of the responsible boxes specified by
+    :arg responsible_boxes_mask: an array of shape ``(tree.nboxes,)`` whose
+        i-th entry is 1 if ``i`` is a responsible box.
+    :return: an array of shape ``(tree.nboxes,)`` whose i-th entry is 1 if ``i``
+        is an ancestor of the responsible boxes specified by
         *responsible_boxes_mask*.
     """
-    ancestor_boxes = cl.array.zeros(queue, (traversal.tree.nboxes,), dtype=np.int8)
+    ancestor_boxes = actx.zeros((traversal.tree.nboxes,), dtype=np.int8)
     ancestor_boxes_last = responsible_boxes_mask.copy()
 
     while ancestor_boxes_last.any():
-        ancestor_boxes_new = cl.array.zeros(
-            queue, (traversal.tree.nboxes,), dtype=np.int8)
+        ancestor_boxes_new = actx.zeros((traversal.tree.nboxes,), dtype=np.int8)
         code.add_parent_boxes_kernel()(
             ancestor_boxes_last, ancestor_boxes_new, traversal.tree.box_parent_ids)
         ancestor_boxes_new = ancestor_boxes_new & (~ancestor_boxes)
@@ -187,18 +193,18 @@ def get_ancestor_boxes_mask(queue, code, traversal, responsible_boxes_mask):
 
 
 def get_point_src_boxes_mask(
-        queue, code, traversal, responsible_boxes_mask, ancestor_boxes_mask):
+        actx, code, traversal, responsible_boxes_mask, ancestor_boxes_mask):
     """Query the boxes whose sources are needed in order to evaluate potentials
     of boxes represented by *responsible_boxes_mask*.
 
-    :arg responsible_boxes_mask: A :class:`pyopencl.array.Array` object of shape
-        ``(tree.nboxes,)`` whose i-th entry is 1 if ``i`` is a responsible box.
-    :param ancestor_boxes_mask: A :class:`pyopencl.array.Array` object of shape
-        ``(tree.nboxes,)`` whose i-th entry is 1 if ``i`` is either a responsible box
-        or an ancestor of the responsible boxes.
-    :return: A :class:`pyopencl.array.Array` object of shape ``(tree.nboxes,)`` whose
-        i-th entry is 1 if sources of box ``i`` are needed for evaluating the
-        potentials of targets in boxes represented by *responsible_boxes_mask*.
+    :arg responsible_boxes_mask: an array of shape ``(tree.nboxes,)`` whose
+        i-th entry is 1 if ``i`` is a responsible box.
+    :arg ancestor_boxes_mask: an array of shape ``(tree.nboxes,)`` whose
+        i-th entry is 1 if ``i`` is either a responsible box or an ancestor
+        of the responsible boxes.
+    :return: an array of shape ``(tree.nboxes,)`` whose i-th entry is 1 if
+        sources of box ``i`` are needed for evaluating the potentials of targets
+        in boxes represented by *responsible_boxes_mask*.
     """
 
     src_boxes_mask = responsible_boxes_mask.copy()
@@ -208,7 +214,7 @@ def get_point_src_boxes_mask(
         traversal.target_boxes, responsible_boxes_mask,
         traversal.neighbor_source_boxes_starts,
         traversal.neighbor_source_boxes_lists, src_boxes_mask,
-        queue=queue)
+        queue=actx.queue)
 
     # Add list 4 of responsible boxes or ancestor boxes
     code.add_interaction_list_boxes_kernel()(
@@ -216,7 +222,7 @@ def get_point_src_boxes_mask(
         responsible_boxes_mask | ancestor_boxes_mask,
         traversal.from_sep_bigger_starts, traversal.from_sep_bigger_lists,
         src_boxes_mask,
-        queue=queue)
+        queue=actx.queue)
 
     if traversal.tree.targets_have_extent:
         # Add list 3 close of responsible boxes
@@ -227,7 +233,7 @@ def get_point_src_boxes_mask(
                 traversal.from_sep_close_smaller_starts,
                 traversal.from_sep_close_smaller_lists,
                 src_boxes_mask,
-                queue=queue
+                queue=actx.queue
             )
 
         # Add list 4 close of responsible boxes
@@ -238,30 +244,28 @@ def get_point_src_boxes_mask(
                 traversal.from_sep_close_bigger_starts,
                 traversal.from_sep_close_bigger_lists,
                 src_boxes_mask,
-                queue=queue
+                queue=actx.queue
             )
 
     return src_boxes_mask
 
 
 def get_multipole_src_boxes_mask(
-        queue, code, traversal, responsible_boxes_mask, ancestor_boxes_mask):
+        actx, code, traversal, responsible_boxes_mask, ancestor_boxes_mask):
     """Query the boxes whose multipoles are used in order to evaluate
     potentials of targets in boxes represented by *responsible_boxes_mask*.
 
-    :arg responsible_boxes_mask: A :class:`pyopencl.array.Array` object of shape
-        ``(tree.nboxes,)`` whose i-th entry is 1 if ``i`` is a responsible box.
-    :arg ancestor_boxes_mask: A :class:`pyopencl.array.Array` object of shape
-        ``(tree.nboxes,)`` whose i-th entry is 1 if ``i`` is either a responsible box
-        or an ancestor of the responsible boxes.
-    :return: A :class:`pyopencl.array.Array` object of shape ``(tree.nboxes,)``
-        whose i-th entry is 1 if multipoles of box ``i`` are needed for evaluating
-        the potentials of targets in boxes represented by *responsible_boxes_mask*.
+    :arg responsible_boxes_mask: an array of shape ``(tree.nboxes,)`` whose
+        i-th entry is 1 if ``i`` is a responsible box.
+    :arg ancestor_boxes_mask: an array of shape ``(tree.nboxes,)`` whose
+        i-th entry is 1 if ``i`` is either a responsible box or an ancestor of
+        the responsible boxes.
+    :return: an array of shape ``(tree.nboxes,)`` whose i-th entry is 1 if
+        multipoles of box ``i`` are needed for evaluating the potentials of
+        targets in boxes represented by *responsible_boxes_mask*.
     """
 
-    multipole_boxes_mask = cl.array.zeros(
-        queue, (traversal.tree.nboxes,), dtype=np.int8
-    )
+    multipole_boxes_mask = actx.zeros((traversal.tree.nboxes,), dtype=np.int8)
 
     # A mpole is used by process p if it is in the List 2 of either a box
     # owned by p or one of its ancestors.
@@ -271,7 +275,7 @@ def get_multipole_src_boxes_mask(
         traversal.from_sep_siblings_starts,
         traversal.from_sep_siblings_lists,
         multipole_boxes_mask,
-        queue=queue
+        queue=actx.queue
     )
     multipole_boxes_mask.finish()
 
@@ -283,7 +287,7 @@ def get_multipole_src_boxes_mask(
             traversal.from_sep_smaller_by_level[ilevel].starts,
             traversal.from_sep_smaller_by_level[ilevel].lists,
             multipole_boxes_mask,
-            queue=queue
+            queue=actx.queue
         )
 
         multipole_boxes_mask.finish()
@@ -291,11 +295,11 @@ def get_multipole_src_boxes_mask(
     return multipole_boxes_mask
 
 
-@dataclass
+@dataclass(frozen=True)
 class BoxMasks:
     """
-    Box masks needed for the distributed calculation. Each of these masks is a
-    PyOpenCL array with length ``tree.nboxes``, whose `i`-th entry is 1 if box `i` is
+    Box masks needed for the distributed calculation. Each of these masks is an
+    array with length ``tree.nboxes``, whose `i`-th entry is 1 if box `i` is
     set.
 
     .. attribute:: responsible_boxes
@@ -315,13 +319,13 @@ class BoxMasks:
 
         Current process needs multipole expressions in these boxes.
     """
-    responsible_boxes: cl.array.Array
-    ancestor_boxes: cl.array.Array
-    point_src_boxes: cl.array.Array
-    multipole_src_boxes: cl.array.Array
+    responsible_boxes: Array
+    ancestor_boxes: Array
+    point_src_boxes: Array
+    multipole_src_boxes: Array
 
 
-def get_box_masks(queue, traversal, responsible_boxes_list):
+def get_box_masks(actx, traversal, responsible_boxes_list):
     """Given the responsible boxes for a rank, this helper function calculates the
     relevant masks.
 
@@ -329,27 +333,23 @@ def get_box_masks(queue, traversal, responsible_boxes_list):
 
     :returns: A :class:`BoxMasks` object of the relevant masks.
     """
-    code = GetBoxMasksCodeContainer(queue.context, traversal.tree.box_id_dtype)
-
-    # FIXME: It is wasteful to copy the whole traversal object into device memory
-    # here because
-    # 1) Not all fields are needed.
-    # 2) For sumpy wrangler, a device traversal object is already available.
-    traversal = traversal.to_device(queue)
+    code = GetBoxMasksCodeContainer(actx, traversal.tree.box_id_dtype)
 
-    responsible_boxes_mask = np.zeros((traversal.tree.nboxes,), dtype=np.int8)
-    responsible_boxes_mask[responsible_boxes_list] = 1
-    responsible_boxes_mask = cl.array.to_device(queue, responsible_boxes_mask)
+    responsible_boxes_mask = actx.zeros((traversal.tree.nboxes,), dtype=np.int8)
+    responsible_boxes_mask[responsible_boxes_list] = (
+        1 + actx.zeros(responsible_boxes_list.shape, np.int8))
 
     ancestor_boxes_mask = get_ancestor_boxes_mask(
-        queue, code, traversal, responsible_boxes_mask)
+        actx, code, traversal, responsible_boxes_mask)
 
     point_src_boxes_mask = get_point_src_boxes_mask(
-        queue, code, traversal, responsible_boxes_mask, ancestor_boxes_mask)
+        actx, code, traversal, responsible_boxes_mask, ancestor_boxes_mask)
 
     multipole_src_boxes_mask = get_multipole_src_boxes_mask(
-        queue, code, traversal, responsible_boxes_mask, ancestor_boxes_mask)
+        actx, code, traversal, responsible_boxes_mask, ancestor_boxes_mask)
 
     return BoxMasks(
-        responsible_boxes_mask, ancestor_boxes_mask, point_src_boxes_mask,
+        responsible_boxes_mask,
+        ancestor_boxes_mask,
+        point_src_boxes_mask,
         multipole_src_boxes_mask)
diff --git a/boxtree/fmm.py b/boxtree/fmm.py
index 3c4da9a1..6c13290c 100644
--- a/boxtree/fmm.py
+++ b/boxtree/fmm.py
@@ -341,7 +341,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs,
     :arg expansion_wrangler: An object exhibiting the
         :class:`ExpansionWranglerInterface`. For distributed implementation, this
         wrangler should be a subclass of
-        :class:`boxtree.distributed.calculation.DistributedExpansionWrangler`.
+        :class:`boxtree.distributed.calculation.DistributedExpansionWranglerMixin`.
     :arg src_weight_vecs: A sequence of source 'density/weights/charges'.
         Passed unmodified to *expansion_wrangler*. For distributed
         implementation, this argument is only significant on the root rank, but

From c1a9c7fdf03dcdf4991dac11c6021bbf789308c5 Mon Sep 17 00:00:00 2001
From: Alexandru Fikl <alexfikl@gmail.com>
Date: Wed, 22 Jun 2022 16:07:10 +0300
Subject: [PATCH 15/28] add tests for DeviceDataRecord based on arraycontext

---
 boxtree/array_context.py | 161 ++++++++++++++++++++++++++++++++++++---
 test/test_tools.py       |  54 +++++++++++--
 2 files changed, 201 insertions(+), 14 deletions(-)

diff --git a/boxtree/array_context.py b/boxtree/array_context.py
index 74e60eda..5fe85c5c 100644
--- a/boxtree/array_context.py
+++ b/boxtree/array_context.py
@@ -20,25 +20,28 @@
 THE SOFTWARE.
 """
 
-from arraycontext import PyOpenCLArrayContext as PyOpenCLArrayContextBase
+import numpy as np
+from arraycontext import (  # noqa: F401
+    PyOpenCLArrayContext as PyOpenCLArrayContextBase,
+    deserialize_container,
+    rec_map_array_container,
+    serialize_container,
+    with_array_context,
+)
 from arraycontext.pytest import (
     _PytestPyOpenCLArrayContextFactoryWithClass,
     register_pytest_array_context_factory,
 )
 
+from pyopencl.algorithm import BuiltList
+
 
 __doc__ = """
 .. autoclass:: PyOpenCLArrayContext
 """
 
 
-def _acf():
-    import pyopencl as cl
-    ctx = cl.create_some_context()
-    queue = cl.CommandQueue(ctx)
-
-    return PyOpenCLArrayContext(queue, force_device_scalars=True)
-
+# {{{ array context
 
 class PyOpenCLArrayContext(PyOpenCLArrayContextBase):
     def transform_loopy_program(self, t_unit):
@@ -51,7 +54,143 @@ def transform_loopy_program(self, t_unit):
                     "Did you use arraycontext.make_loopy_program "
                     "to create this kernel?")
 
-        return super().transform_loopy_program(t_unit)
+        return t_unit
+
+    # NOTE: _rec_map_container is copied from arraycontext wholesale and should
+    # be kept in sync as much as possible!
+
+    def _rec_map_container(self, func, array, allowed_types=None, *,
+            default_scalar=None, strict=False):
+        import arraycontext.impl.pyopencl.taggable_cl_array as tga
+
+        if allowed_types is None:
+            allowed_types = (tga.TaggableCLArray,)
+
+        def _wrapper(ary):
+            # NOTE: this early return for `None` is the only change from the
+            # arraycontext version; it allows optional fields inside containers
+            if ary is None:
+                return ary
+
+            if isinstance(ary, allowed_types):
+                return func(ary)
+            elif not strict and isinstance(ary, self.array_types):
+                from warnings import warn
+                warn(f"Invoking {type(self).__name__}.{func.__name__[1:]} with "
+                    f"{type(ary).__name__} will be unsupported in 2025. Use "
+                    "'to_tagged_cl_array' to convert instances to TaggableCLArray.",
+                    DeprecationWarning, stacklevel=2)
+                return func(tga.to_tagged_cl_array(ary))
+            elif np.isscalar(ary):
+                if default_scalar is None:
+                    return ary
+                else:
+                    return np.array(ary).dtype.type(default_scalar)
+            else:
+                raise TypeError(
+                    f"{type(self).__name__}.{func.__name__[1:]} invoked with "
+                    f"an unsupported array type: got '{type(ary).__name__}', "
+                    f"but expected one of {allowed_types}")
+
+        return rec_map_array_container(_wrapper, array)
+
+# }}}
+
+
+# {{{ dataclass array container
+
+def dataclass_array_container(cls: type) -> type:
+    """A decorator based on :func:`arraycontext.dataclass_array_container`
+    that allows :data:`typing.Optional` containers.
+    """
+
+    from dataclasses import Field, fields, is_dataclass
+    from types import UnionType
+    from typing import Union, get_args, get_origin
+
+    from arraycontext.container.dataclass import (
+        _inject_dataclass_serialization,
+        is_array_type,
+    )
+
+    assert is_dataclass(cls)
+
+    def is_array_field(f: Field) -> bool:
+        if isinstance(f.type, str):
+            raise TypeError(
+                f"String annotation on field '{f.name}' not supported. "
+                "(this may be due to 'from __future__ import annotations')")
+
+        if not f.init:
+            raise ValueError(
+                    f"Field with 'init=False' not allowed: '{f.name}'")
+
+        origin = get_origin(f.type)
+        if origin in (Union, UnionType):
+            return all(
+                (is_array_type(arg) or arg is type(None))
+                for arg in get_args(f.type))
+
+        # NOTE:
+        # * GenericAlias catches `list`, `tuple`, etc.
+        # * `_BaseGenericAlias` catches `List`, `Tuple`, `Callable`, etc.
+        # * `_SpecialForm` catches `Any`, `Literal`, `Optional`, etc.
+        from types import GenericAlias
+        from typing import (  # type: ignore[attr-defined]
+            _BaseGenericAlias,
+            _SpecialForm,
+        )
+        if isinstance(f.type, GenericAlias | _BaseGenericAlias | _SpecialForm):
+            # NOTE: anything except a Union is not an array
+            return False
+
+        return is_array_type(f.type)
+
+    from pytools import partition
+    array_fields, non_array_fields = partition(is_array_field, fields(cls))
+
+    if not array_fields:
+        raise ValueError(f"'{cls}' must have fields with array container type "
+                "in order to use the 'dataclass_array_container' decorator")
+
+    return _inject_dataclass_serialization(cls, array_fields, non_array_fields)
+
+# }}}
+
+
+# {{{ serialization
+
+# NOTE: BuiltList is serialized explicitly here because pyopencl cannot depend
+# on arraycontext machinery.
+
+@serialize_container.register(BuiltList)
+def _serialize_built_list(obj: BuiltList):
+    return (
+        ("starts", obj.starts),
+        ("lists", obj.lists),
+        ("nonempty_indices", obj.nonempty_indices),
+        ("compressed_indices", obj.compressed_indices),
+        )
+
+
+@deserialize_container.register(BuiltList)
+def _deserialize_built_list(template: BuiltList, iterable):
+    return type(template)(
+        count=template.count,
+        num_nonempty_lists=template.num_nonempty_lists,
+        **dict(iterable))
+
+# }}}
+
+
+# {{{ pytest
+
+def _acf():
+    import pyopencl as cl
+    ctx = cl.create_some_context()
+    queue = cl.CommandQueue(ctx)
+
+    return PyOpenCLArrayContext(queue, force_device_scalars=True)
 
 
 class PytestPyOpenCLArrayContextFactory(
@@ -61,3 +200,7 @@ class PytestPyOpenCLArrayContextFactory(
 
 register_pytest_array_context_factory("boxtree.pyopencl",
         PytestPyOpenCLArrayContextFactory)
+
+# }}}
+
+# vim: fdm=marker
diff --git a/test/test_tools.py b/test/test_tools.py
index e75af7f3..16b65307 100644
--- a/test/test_tools.py
+++ b/test/test_tools.py
@@ -98,7 +98,7 @@ def test_masked_matrix_compression(actx_factory, order):
     actx = actx_factory()
 
     from boxtree.tools import MaskCompressorKernel
-    matcompr = MaskCompressorKernel(actx.context)
+    matcompr = MaskCompressorKernel(actx)
 
     n = 40
     m = 10
@@ -107,7 +107,7 @@ def test_masked_matrix_compression(actx_factory, order):
     arr = (rng.random((n, m)) > 0.5).astype(np.int8).copy(order=order)
     d_arr = actx.from_numpy(arr)
 
-    arr_starts, arr_lists, _evt = matcompr(actx.queue, d_arr)
+    arr_starts, arr_lists, _evt = matcompr(actx, d_arr)
     arr_starts = actx.to_numpy(arr_starts)
     arr_lists = actx.to_numpy(arr_lists)
 
@@ -125,14 +125,14 @@ def test_masked_list_compression(actx_factory):
     rng = np.random.default_rng(seed=42)
 
     from boxtree.tools import MaskCompressorKernel
-    listcompr = MaskCompressorKernel(actx.context)
+    listcompr = MaskCompressorKernel(actx)
 
     n = 20
 
     arr = (rng.random(n) > 0.5).astype(np.int8)
     d_arr = actx.from_numpy(arr)
 
-    arr_list, _evt = listcompr(actx.queue, d_arr)
+    arr_list, _evt = listcompr(actx, d_arr)
     arr_list = actx.to_numpy(arr_list)
 
     assert set(arr_list) == set(arr.nonzero()[0])
@@ -165,6 +165,50 @@ def test_device_record(actx_factory):
     for i in range(3):
         assert np.array_equal(record_host.obj_array[i], record.obj_array[i])
 
+
+def test_device_record_array_context(actx_factory):
+    """Round-trip a dataclass container with an optional field through *actx*."""
+    actx = actx_factory()
+
+    from dataclasses import dataclass
+
+    from arraycontext import Array
+
+    from boxtree.array_context import dataclass_array_container
+
+    @dataclass_array_container
+    @dataclass(frozen=True)
+    class MyDeviceDataRecord:
+        array: Array
+        obj_array: np.ndarray
+        opt_array: Array | None
+        value: float
+
+    from pytools.obj_array import make_obj_array
+    rng = np.random.default_rng(seed=42)
+    record = MyDeviceDataRecord(
+        array=rng.random(128),
+        obj_array=make_obj_array([rng.random(128) for _ in range(3)]),
+        opt_array=None,
+        value=3)
+
+    actx_record = actx.from_numpy(record)
+    assert actx_record.array.queue is actx.queue
+
+    frozen_record = actx.freeze(actx_record)
+    assert frozen_record.array.queue is None
+
+    thawed_record = actx.thaw(frozen_record)
+    assert thawed_record.array.queue is actx.queue
+
+    host_record = actx.to_numpy(thawed_record)
+    assert isinstance(host_record.array, np.ndarray)
+    assert host_record.opt_array is None
+    assert record.value == host_record.value
+    assert np.allclose(record.array, host_record.array)
+    for i in range(3):
+        assert np.allclose(record.obj_array[i], host_record.obj_array[i])
+
 # }}}
 
 
@@ -176,7 +220,7 @@ def test_device_record(actx_factory):
 def test_particle_array(actx_factory, array_factory, dim, dtype):
     actx = actx_factory()
 
-    particles = array_factory(actx.queue, 1000, dim, dtype)
+    particles = array_factory(actx, 1000, dim, dtype)
     assert len(particles) == dim
     assert all(len(particles[0]) == len(axis) for axis in particles)
 

From cdee1ac43b438e623f4baf07f9069cdf42395c78 Mon Sep 17 00:00:00 2001
From: Alexandru Fikl <alexfikl@gmail.com>
Date: Wed, 22 Jun 2022 16:13:02 +0300
Subject: [PATCH 16/28] port test_traversal to arraycontext

---
 test/test_traversal.py | 53 +++++++++++++++++++-----------------------
 1 file changed, 24 insertions(+), 29 deletions(-)

diff --git a/test/test_traversal.py b/test/test_traversal.py
index 3988f46a..6de51b36 100644
--- a/test/test_traversal.py
+++ b/test/test_traversal.py
@@ -54,22 +54,21 @@ def test_tree_connectivity(actx_factory, dims, sources_are_targets):
     actx = actx_factory()
     dtype = np.float64
 
-    sources = make_normal_particle_array(actx.queue, 1 * 10**5, dims, dtype)
+    sources = make_normal_particle_array(actx, 1 * 10**5, dims, dtype)
     if sources_are_targets:
         targets = None
     else:
-        targets = make_normal_particle_array(actx.queue, 2 * 10**5, dims, dtype)
+        targets = make_normal_particle_array(actx, 2 * 10**5, dims, dtype)
 
     from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
-    tree, _ = tb(actx.queue, sources, max_particles_in_box=30,
-            targets=targets, debug=True)
+    tb = TreeBuilder(actx)
+    tree, _ = tb(actx, sources, max_particles_in_box=30, targets=targets, debug=True)
 
     from boxtree.traversal import FMMTraversalBuilder
-    tg = FMMTraversalBuilder(actx.context)
-    trav, _ = tg(actx.queue, tree, debug=True)
-    tree = tree.get(queue=actx.queue)
-    trav = trav.get(queue=actx.queue)
+    tg = FMMTraversalBuilder(actx)
+    trav, _ = tg(actx, tree, debug=True)
+    tree = actx.to_numpy(tree)
+    trav = actx.to_numpy(trav)
 
     levels = tree.box_levels
     parents = tree.box_parent_ids.T
@@ -286,17 +285,15 @@ def test_plot_traversal(actx_factory, well_sep_is_n_away=1, visualize=False):
             for i in range(dims)])
 
         from boxtree import TreeBuilder
-        tb = TreeBuilder(actx.context)
-
-        actx.queue.finish()
-        tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True)
+        tb = TreeBuilder(actx)
+        tree, _ = tb(actx, particles, max_particles_in_box=30, debug=True)
 
         from boxtree.traversal import FMMTraversalBuilder
-        tg = FMMTraversalBuilder(actx.context, well_sep_is_n_away=well_sep_is_n_away)
-        trav, _ = tg(actx.queue, tree)
+        tg = FMMTraversalBuilder(actx, well_sep_is_n_away=well_sep_is_n_away)
+        trav, _ = tg(actx, tree)
 
-        tree = tree.get(queue=actx.queue)
-        trav = trav.get(queue=actx.queue)
+        tree = actx.to_numpy(tree)
+        trav = actx.to_numpy(trav)
 
         from boxtree.visualization import TreePlotter
         plotter = TreePlotter(tree)
@@ -340,10 +337,8 @@ def test_from_sep_siblings_translation_and_rotation_classes(
         for i in range(dims)])
 
     from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
-
-    actx.queue.finish()
-    tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True)
+    tb = TreeBuilder(actx)
+    tree, _ = tb(actx, particles, max_particles_in_box=30, debug=True)
 
     # }}}
 
@@ -353,14 +348,14 @@ def test_from_sep_siblings_translation_and_rotation_classes(
     from boxtree.translation_classes import TranslationClassesBuilder
     from boxtree.traversal import FMMTraversalBuilder
 
-    tg = FMMTraversalBuilder(actx.context, well_sep_is_n_away=well_sep_is_n_away)
-    trav, _ = tg(actx.queue, tree)
+    tg = FMMTraversalBuilder(actx, well_sep_is_n_away=well_sep_is_n_away)
+    trav, _ = tg(actx, tree)
 
-    rb = RotationClassesBuilder(actx.context)
-    result, _ = rb(actx.queue, trav, tree)
+    rb = RotationClassesBuilder(actx)
+    result, _ = rb(actx, trav, tree)
 
-    tb = TranslationClassesBuilder(actx.context)
-    result_tb, _ = tb(actx.queue, trav, tree)
+    tb = TranslationClassesBuilder(actx)
+    result_tb, _ = tb(actx, trav, tree)
 
     rot_classes = actx.to_numpy(
             result.from_sep_siblings_rotation_classes)
@@ -372,8 +367,8 @@ def test_from_sep_siblings_translation_and_rotation_classes(
     distance_vectors = actx.to_numpy(
         result_tb.from_sep_siblings_translation_class_to_distance_vector)
 
-    tree = tree.get(queue=actx.queue)
-    trav = trav.get(queue=actx.queue)
+    tree = actx.to_numpy(tree)
+    trav = actx.to_numpy(trav)
 
     centers = tree.box_centers.T
 

From e112f175f0ef1456ae3e92bc02f76488a006500e Mon Sep 17 00:00:00 2001
From: Alexandru Fikl <alexfikl@gmail.com>
Date: Thu, 23 Jun 2022 16:25:04 +0300
Subject: [PATCH 17/28] port test_tree to arraycontext

---
 test/test_tree.py | 175 +++++++++++++++++++++-------------------------
 1 file changed, 78 insertions(+), 97 deletions(-)

diff --git a/test/test_tree.py b/test/test_tree.py
index 1579464c..aa6076f3 100644
--- a/test/test_tree.py
+++ b/test/test_tree.py
@@ -27,11 +27,8 @@
 import pytest
 from arraycontext import pytest_generate_tests_for_array_contexts
 
-from boxtree.array_context import (
-    PytestPyOpenCLArrayContextFactory,
-    _acf,  # noqa: F401
-)
-from boxtree.tools import make_normal_particle_array
+from boxtree.array_context import PytestPyOpenCLArrayContextFactory, _acf  # noqa: F401
+from boxtree.tools import AXIS_NAMES, make_normal_particle_array
 
 
 logger = logging.getLogger(__name__)
@@ -50,18 +47,17 @@ def test_bounding_box(actx_factory, dtype, dims, nparticles):
     actx = actx_factory()
 
     from boxtree.bounding_box import BoundingBoxFinder
-    from boxtree.tools import AXIS_NAMES
-    bbf = BoundingBoxFinder(actx.context)
+    bbf = BoundingBoxFinder(actx)
 
     axis_names = AXIS_NAMES[:dims]
     logger.info("%s - %s %s", dtype, dims, nparticles)
 
-    particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype)
+    particles = make_normal_particle_array(actx, nparticles, dims, dtype)
 
     bbox_min = [np.min(actx.to_numpy(x)) for x in particles]
     bbox_max = [np.max(actx.to_numpy(x)) for x in particles]
 
-    bbox_cl, _evt = bbf(particles, radii=None)
+    bbox_cl, _evt = bbf(actx, particles, radii=None)
     bbox_cl = actx.to_numpy(bbox_cl)
 
     bbox_min_cl = np.empty(dims, dtype)
@@ -104,21 +100,19 @@ def run_build_test(builder, actx, dims, dtype, nparticles, visualize,
 
     logger.info(75 * "-")
 
-    particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype)
+    particles = make_normal_particle_array(actx, nparticles, dims, dtype)
 
     if visualize:
         import matplotlib.pyplot as pt
         np_particles = actx.to_numpy(particles)
         pt.plot(np_particles[0], np_particles[1], "x")
 
-    actx.queue.finish()
-
-    tree, _ = builder(actx.queue, particles,
+    tree, _ = builder(actx, particles,
                       max_particles_in_box=max_particles_in_box,
                       refine_weights=refine_weights,
                       max_leaf_refine_weight=max_leaf_refine_weight,
                       debug=True, **kwargs)
-    tree = tree.get(queue=actx.queue)
+    tree = actx.to_numpy(tree)
 
     sorted_particles = np.array(list(tree.sources))
 
@@ -237,7 +231,7 @@ def test_single_box_particle_tree(actx_factory, dtype, dims, visualize=False):
     actx = actx_factory()
 
     from boxtree import TreeBuilder
-    builder = TreeBuilder(actx.context)
+    builder = TreeBuilder(actx)
 
     run_build_test(builder, actx, dims,
             dtype, 4, max_particles_in_box=30, visualize=visualize)
@@ -248,7 +242,7 @@ def test_two_level_particle_tree(actx_factory, dtype, dims, visualize=False):
     actx = actx_factory()
 
     from boxtree import TreeBuilder
-    builder = TreeBuilder(actx.context)
+    builder = TreeBuilder(actx)
 
     run_build_test(builder, actx, dims,
             dtype, 50, max_particles_in_box=30, visualize=visualize)
@@ -259,7 +253,7 @@ def test_unpruned_particle_tree(actx_factory, dtype, dims, visualize=False):
     actx = actx_factory()
 
     from boxtree import TreeBuilder
-    builder = TreeBuilder(actx.context)
+    builder = TreeBuilder(actx)
 
     # test unpruned tree build
     run_build_test(builder, actx, dims, dtype, 10**5,
@@ -272,7 +266,7 @@ def test_particle_tree_with_reallocations(
     actx = actx_factory()
 
     from boxtree import TreeBuilder
-    builder = TreeBuilder(actx.context)
+    builder = TreeBuilder(actx)
 
     run_build_test(builder, actx, dims, dtype, 10**5,
             max_particles_in_box=30, visualize=visualize, nboxes_guess=5)
@@ -284,7 +278,7 @@ def test_particle_tree_with_many_empty_leaves(
     actx = actx_factory()
 
     from boxtree import TreeBuilder
-    builder = TreeBuilder(actx.context)
+    builder = TreeBuilder(actx)
 
     run_build_test(builder, actx, dims, dtype, 10**5,
             max_particles_in_box=5, visualize=visualize)
@@ -295,7 +289,7 @@ def test_vanilla_particle_tree(actx_factory, dtype, dims, visualize=False):
     actx = actx_factory()
 
     from boxtree import TreeBuilder
-    builder = TreeBuilder(actx.context)
+    builder = TreeBuilder(actx)
 
     run_build_test(builder, actx, dims, dtype, 10**5,
             max_particles_in_box=30, visualize=visualize)
@@ -307,7 +301,7 @@ def test_explicit_refine_weights_particle_tree(
     actx = actx_factory()
 
     from boxtree import TreeBuilder
-    builder = TreeBuilder(actx.context)
+    builder = TreeBuilder(actx)
 
     nparticles = 10**5
 
@@ -326,7 +320,7 @@ def test_non_adaptive_particle_tree(actx_factory, dtype, dims, visualize=False):
     actx = actx_factory()
 
     from boxtree import TreeBuilder
-    builder = TreeBuilder(actx.context)
+    builder = TreeBuilder(actx)
 
     run_build_test(builder, actx, dims, dtype, 10**4,
             max_particles_in_box=30, visualize=visualize, kind="non-adaptive")
@@ -345,9 +339,9 @@ def test_source_target_tree(actx_factory, dims, visualize=False):
     ntargets = 3 * 10**5
     dtype = np.float64
 
-    sources = make_normal_particle_array(actx.queue, nsources, dims, dtype,
+    sources = make_normal_particle_array(actx, nsources, dims, dtype,
             seed=12)
-    targets = make_normal_particle_array(actx.queue, ntargets, dims, dtype,
+    targets = make_normal_particle_array(actx, ntargets, dims, dtype,
             seed=19)
 
     if visualize:
@@ -358,12 +352,11 @@ def test_source_target_tree(actx_factory, dims, visualize=False):
         pt.show()
 
     from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
+    tb = TreeBuilder(actx)
 
-    actx.queue.finish()
-    tree, _ = tb(actx.queue, sources, targets=targets,
+    tree, _ = tb(actx, sources, targets=targets,
             max_particles_in_box=10, debug=True)
-    tree = tree.get(queue=actx.queue)
+    tree = actx.to_numpy(tree)
 
     sorted_sources = np.array(list(tree.sources))
     sorted_targets = np.array(list(tree.targets))
@@ -457,9 +450,9 @@ def test_extent_tree(actx_factory, dims, extent_norm, visualize=False):
     dtype = np.float64
     npoint_sources_per_source = 16
 
-    sources = make_normal_particle_array(actx.queue, nsources, dims, dtype,
+    sources = make_normal_particle_array(actx, nsources, dims, dtype,
             seed=12)
-    targets = make_normal_particle_array(actx.queue, ntargets, dims, dtype,
+    targets = make_normal_particle_array(actx, ntargets, dims, dtype,
             seed=19)
 
     refine_weights = actx.np.zeros(nsources + ntargets, np.int32)
@@ -474,10 +467,10 @@ def test_extent_tree(actx_factory, dims, extent_norm, visualize=False):
             )
 
     from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
+    tb = TreeBuilder(actx)
 
     actx.queue.finish()
-    dev_tree, _ = tb(actx.queue, sources, targets=targets,
+    dev_tree, _ = tb(actx, sources, targets=targets,
             source_radii=source_radii,
             target_radii=target_radii,
             extent_norm=extent_norm,
@@ -495,7 +488,7 @@ def test_extent_tree(actx_factory, dims, extent_norm, visualize=False):
 
     logger.info("transfer tree, check orderings")
 
-    tree = dev_tree.get(queue=actx.queue)
+    tree = actx.to_numpy(dev_tree)
 
     if visualize:
         import matplotlib.pyplot as pt
@@ -657,7 +650,7 @@ def test_extent_tree(actx_factory, dims, extent_norm, visualize=False):
             )
 
     from boxtree.tree import link_point_sources
-    dev_tree = link_point_sources(actx.queue, dev_tree,
+    dev_tree = link_point_sources(actx, dev_tree,
             point_source_starts, point_sources,
             debug=True)
 
@@ -677,7 +670,7 @@ def test_leaves_to_balls_query(actx_factory, dims, visualize=False):
     nparticles = 10**5
     dtype = np.float64
 
-    particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype)
+    particles = make_normal_particle_array(actx, nparticles, dims, dtype)
 
     if visualize:
         import matplotlib.pyplot as pt
@@ -685,23 +678,23 @@ def test_leaves_to_balls_query(actx_factory, dims, visualize=False):
         pt.plot(np_particles[0], np_particles[1], "x")
 
     from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
+    tb = TreeBuilder(actx)
 
-    actx.queue.finish()
-    tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True)
+    tree, _ = tb(actx, particles, max_particles_in_box=30, debug=True)
+    tree = actx.thaw(tree)
 
     nballs = 10**4
-    ball_centers = make_normal_particle_array(actx.queue, nballs, dims, dtype)
+    ball_centers = make_normal_particle_array(actx, nballs, dims, dtype)
     ball_radii = 0.1 + actx.np.zeros(nballs, dtype)
 
     from boxtree.area_query import LeavesToBallsLookupBuilder
-    lblb = LeavesToBallsLookupBuilder(actx.context)
+    lblb = LeavesToBallsLookupBuilder(actx)
 
-    lbl, _ = lblb(actx.queue, tree, ball_centers, ball_radii)
+    lbl, _ = lblb(actx, tree, ball_centers, ball_radii)
 
     # get data to host for test
-    tree = tree.get(queue=actx.queue)
-    lbl = lbl.get(queue=actx.queue)
+    tree = actx.to_numpy(tree)
+    lbl = actx.to_numpy(lbl)
     ball_centers = np.array([actx.to_numpy(x) for x in ball_centers]).T
     ball_radii = actx.to_numpy(ball_radii)
 
@@ -734,13 +727,12 @@ def run_area_query_test(actx, tree, ball_centers, ball_radii):
     Performs an area query and checks that the result is as expected.
     """
     from boxtree.area_query import AreaQueryBuilder
-    aqb = AreaQueryBuilder(actx.context)
-
-    area_query, _ = aqb(actx.queue, tree, ball_centers, ball_radii)
+    aqb = AreaQueryBuilder(actx)
+    area_query, _ = aqb(actx, tree, ball_centers, ball_radii)
 
     # Get data to host for test.
-    tree = tree.get(queue=actx.queue)
-    area_query = area_query.get(queue=actx.queue)
+    tree = actx.to_numpy(tree)
+    area_query = actx.to_numpy(area_query)
     ball_centers = np.array([actx.to_numpy(x) for x in ball_centers]).T
     ball_radii = actx.to_numpy(ball_radii)
 
@@ -781,7 +773,7 @@ def test_area_query(actx_factory, dims, visualize=False):
     nparticles = 10**5
     dtype = np.float64
 
-    particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype)
+    particles = make_normal_particle_array(actx, nparticles, dims, dtype)
 
     if visualize:
         import matplotlib.pyplot as pt
@@ -789,13 +781,11 @@ def test_area_query(actx_factory, dims, visualize=False):
         pt.plot(np_particles[0], np_particles[1], "x")
 
     from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
-
-    actx.queue.finish()
-    tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True)
+    tb = TreeBuilder(actx)
+    tree, _ = tb(actx, particles, max_particles_in_box=30, debug=True)
 
     nballs = 10**4
-    ball_centers = make_normal_particle_array(actx.queue, nballs, dims, dtype)
+    ball_centers = make_normal_particle_array(actx, nballs, dims, dtype)
     ball_radii = 0.1 + actx.np.zeros(nballs, dtype)
 
     run_area_query_test(actx, tree, ball_centers, ball_radii)
@@ -814,7 +804,7 @@ def test_area_query_balls_outside_bbox(actx_factory, dims, visualize=False):
     nparticles = 10**4
     dtype = np.float64
 
-    particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype)
+    particles = make_normal_particle_array(actx, nparticles, dims, dtype)
 
     if visualize:
         import matplotlib.pyplot as pt
@@ -822,10 +812,8 @@ def test_area_query_balls_outside_bbox(actx_factory, dims, visualize=False):
         pt.plot(np_particles[0], np_particles[1], "x")
 
     from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
-
-    actx.queue.finish()
-    tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True)
+    tb = TreeBuilder(actx)
+    tree, _ = tb(actx, particles, max_particles_in_box=30, debug=True)
 
     nballs = 10**4
     bbox_min = tree.bounding_box[0].min()
@@ -851,7 +839,7 @@ def test_area_query_elwise(actx_factory, dims, visualize=False):
     nparticles = 10**5
     dtype = np.float64
 
-    particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype)
+    particles = make_normal_particle_array(actx, nparticles, dims, dtype)
 
     if visualize:
         import matplotlib.pyplot as pt
@@ -859,13 +847,11 @@ def test_area_query_elwise(actx_factory, dims, visualize=False):
         pt.plot(np_particles[0], np_particles[1], "x")
 
     from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
-
-    actx.queue.finish()
-    tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True)
+    tb = TreeBuilder(actx)
+    tree, _ = tb(actx, particles, max_particles_in_box=30, debug=True)
 
     nballs = 10**4
-    ball_centers = make_normal_particle_array(actx.queue, nballs, dims, dtype)
+    ball_centers = make_normal_particle_array(actx, nballs, dims, dtype)
     ball_radii = 0.1 + actx.np.zeros(nballs, dtype)
 
     from boxtree.area_query import AreaQueryElementwiseTemplate, PeerListFinder
@@ -885,10 +871,10 @@ def test_area_query_elwise(actx_factory, dims, visualize=False):
         """,
         leaf_found_op="")
 
-    peer_lists, evt = PeerListFinder(actx.context)(actx.queue, tree)
+    peer_lists, evt = PeerListFinder(actx)(actx, tree)
 
     kernel = template.generate(
-        actx.context,
+        actx.queue.context,
         dims,
         tree.coord_dtype,
         tree.box_id_dtype,
@@ -919,8 +905,7 @@ def test_level_restriction(
     dtype = np.float64
 
     from boxtree.tools import make_surface_particle_array
-    particles = make_surface_particle_array(
-            actx.queue, nparticles, dims, dtype, seed=15)
+    particles = make_surface_particle_array(actx, nparticles, dims, dtype, seed=15)
 
     if visualize:
         import matplotlib.pyplot as pt
@@ -928,10 +913,8 @@ def test_level_restriction(
         pt.plot(np_particles[0], np_particles[1], "x")
 
     from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
-
-    actx.queue.finish()
-    tree_dev, _ = tb(actx.queue, particles,
+    tb = TreeBuilder(actx)
+    tree_dev, _ = tb(actx, particles,
             kind="adaptive-level-restricted",
             max_particles_in_box=30, debug=True,
             skip_prune=skip_prune, lr_lookbehind=lookbehind,
@@ -946,18 +929,18 @@ def find_neighbors(leaf_box_centers, leaf_box_radii):
         # Note that since this comes from an area query, the self box will be
         # included in the neighbor list.
         from boxtree.area_query import AreaQueryBuilder
-        aqb = AreaQueryBuilder(actx.context)
+        aqb = AreaQueryBuilder(actx)
 
         ball_radii = actx.from_numpy(np.min(leaf_box_radii) / 2 + leaf_box_radii)
         leaf_box_centers = [actx.from_numpy(axis) for axis in leaf_box_centers]
 
-        area_query, _ = aqb(actx.queue, tree_dev, leaf_box_centers, ball_radii)
-        area_query = area_query.get(queue=actx.queue)
+        area_query, _ = aqb(actx, tree_dev, leaf_box_centers, ball_radii)
+        area_query = actx.to_numpy(area_query)
         return (area_query.leaves_near_ball_starts,
                 area_query.leaves_near_ball_lists)
 
     # Get data to host for test.
-    tree = tree_dev.get(queue=actx.queue)
+    tree = actx.to_numpy(tree_dev)
 
     # Find leaf boxes.
     from boxtree import box_flags_enum
@@ -1001,7 +984,7 @@ def test_space_invader_query(actx_factory, dims, dtype, visualize=False):
     dtype = np.dtype(dtype)
     nparticles = 10**5
 
-    particles = make_normal_particle_array(actx.queue, nparticles, dims, dtype)
+    particles = make_normal_particle_array(actx, nparticles, dims, dtype)
 
     if visualize:
         import matplotlib.pyplot as pt
@@ -1009,29 +992,27 @@ def test_space_invader_query(actx_factory, dims, dtype, visualize=False):
         pt.plot(np_particles[0], np_particles[1], "x")
 
     from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
-
-    actx.queue.finish()
-    tree, _ = tb(actx.queue, particles, max_particles_in_box=30, debug=True)
+    tb = TreeBuilder(actx)
+    tree, _ = tb(actx, particles, max_particles_in_box=30, debug=True)
 
     nballs = 10**4
-    ball_centers = make_normal_particle_array(actx.queue, nballs, dims, dtype)
+    ball_centers = make_normal_particle_array(actx, nballs, dims, dtype)
     ball_radii = 0.1 + actx.np.zeros(nballs, dtype)
 
     from boxtree.area_query import LeavesToBallsLookupBuilder, SpaceInvaderQueryBuilder
 
-    siqb = SpaceInvaderQueryBuilder(actx.context)
+    siqb = SpaceInvaderQueryBuilder(actx)
     # We can use leaves-to-balls lookup to get the set of overlapping balls for
     # each box, and from there to compute the outer space invader distance.
-    lblb = LeavesToBallsLookupBuilder(actx.context)
+    lblb = LeavesToBallsLookupBuilder(actx)
 
-    siq, _ = siqb(actx.queue, tree, ball_centers, ball_radii)
-    lbl, _ = lblb(actx.queue, tree, ball_centers, ball_radii)
+    siq, _ = siqb(actx, tree, ball_centers, ball_radii)
+    lbl, _ = lblb(actx, tree, ball_centers, ball_radii)
 
     # get data to host for test
-    tree = tree.get(queue=actx.queue)
-    siq = siq.get(queue=actx.queue)
-    lbl = lbl.get(queue=actx.queue)
+    tree = actx.to_numpy(tree)
+    siq = actx.to_numpy(siq)
+    lbl = actx.to_numpy(lbl)
 
     ball_centers = np.array([actx.to_numpy(x) for x in ball_centers])
     ball_radii = actx.to_numpy(ball_radii)
@@ -1062,7 +1043,7 @@ def test_space_invader_query(actx_factory, dims, dtype, visualize=False):
 
 @pytest.mark.opencl
 @pytest.mark.parametrize("dims", [2, 3])
-def test_same_tree_with_zero_weight_particles(actx_factory, dims):
+def test_same_tree_with_zero_weight_particles(actx_factory, dims, visualize=False):
     actx = actx_factory()
 
     ntargets_values = [300, 400, 500]
@@ -1070,7 +1051,7 @@ def test_same_tree_with_zero_weight_particles(actx_factory, dims):
     nsources = 20
 
     from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
+    tb = TreeBuilder(actx)
 
     trees = []
 
@@ -1091,18 +1072,18 @@ def test_same_tree_with_zero_weight_particles(actx_factory, dims):
             refine_weights[:nsources] = 1
             refine_weights[nsources:] = 0
 
-            tree, _ = tb(actx.queue, sources, targets=targets,
+            tree, _ = tb(actx, sources, targets=targets,
                     target_radii=target_radii,
                     stick_out_factor=stick_out_factor,
                     max_leaf_refine_weight=10,
                     refine_weights=refine_weights,
                     debug=True)
-            tree = tree.get(queue=actx.queue)
+            tree = actx.to_numpy(tree)
             trees.append(tree)
 
             print("TREE:", tree.nboxes)
 
-    if 0:
+    if visualize:
         import matplotlib.pyplot as plt
         for tree in trees:
             plt.figure()
@@ -1119,12 +1100,12 @@ def test_max_levels_error(actx_factory):
     actx = actx_factory()
 
     from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
+    tb = TreeBuilder(actx)
 
     sources = [actx.np.zeros(11, np.float64) for i in range(2)]
     from boxtree.tree_build import MaxLevelsExceeded
     with pytest.raises(MaxLevelsExceeded):
-        _tree, _ = tb(actx.queue, sources, max_particles_in_box=10, debug=True)
+        _tree, _ = tb(actx, sources, max_particles_in_box=10, debug=True)
 
 # }}}
 

From 2b0bd5fa7c5b7b62f07913c747298e1e44d7ecde Mon Sep 17 00:00:00 2001
From: Alexandru Fikl <alexfikl@gmail.com>
Date: Thu, 23 Jun 2022 21:58:11 +0300
Subject: [PATCH 18/28] port test_fmm to arraycontext

---
 test/test_fmm.py | 114 ++++++++++++++++++++++++-----------------------
 1 file changed, 59 insertions(+), 55 deletions(-)

diff --git a/test/test_fmm.py b/test/test_fmm.py
index 615e7a1c..659ae64e 100644
--- a/test/test_fmm.py
+++ b/test/test_fmm.py
@@ -178,7 +178,7 @@ def test_fmm_completeness(actx_factory, dims, nsources_req, ntargets_req,
     dtype = np.float64
 
     try:
-        sources = source_gen(actx.queue, nsources_req, dims, dtype, seed=15)
+        sources = source_gen(actx, nsources_req, dims, dtype, seed=15)
         nsources = len(sources[0])
 
         if ntargets_req is None:
@@ -186,7 +186,7 @@ def test_fmm_completeness(actx_factory, dims, nsources_req, ntargets_req,
             targets = None
             ntargets = ntargets_req
         else:
-            targets = target_gen(actx.queue, ntargets_req, dims, dtype, seed=16)
+            targets = target_gen(actx, ntargets_req, dims, dtype, seed=16)
             ntargets = len(targets[0])
     except ImportError:
         pytest.skip("loopy not available, but needed for particle array "
@@ -208,40 +208,40 @@ def test_fmm_completeness(actx_factory, dims, nsources_req, ntargets_req,
         target_radii = None
 
     from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
+    tb = TreeBuilder(actx)
 
-    tree, _ = tb(actx.queue, sources, targets=targets,
+    tree, _ = tb(actx, sources, targets=targets,
             max_particles_in_box=30,
             source_radii=source_radii, target_radii=target_radii,
             debug=True, stick_out_factor=0.25, extent_norm=extent_norm)
     if 0:
-        tree = tree.get(queue=actx.queue)
+        tree = actx.to_numpy(tree)
         tree.plot()
         import matplotlib.pyplot as pt
         pt.show()
 
     from boxtree.traversal import FMMTraversalBuilder
-    tbuild = FMMTraversalBuilder(actx.context,
+    tbuild = FMMTraversalBuilder(actx,
             well_sep_is_n_away=well_sep_is_n_away,
             from_sep_smaller_crit=from_sep_smaller_crit)
-    trav, _ = tbuild(actx.queue, tree, debug=True)
+    trav, _ = tbuild(actx, tree, debug=True)
 
     if who_has_extent:
         pre_merge_trav = trav
-        trav = trav.merge_close_lists(actx.queue)
+        trav = trav.merge_close_lists(actx)
 
     # weights = np.random.randn(nsources)
     weights = np.ones(nsources)
     weights_sum = np.sum(weights)
 
-    host_trav = trav.get(queue=actx.queue)
+    host_trav = actx.to_numpy(trav)
     host_tree = host_trav.tree
 
     if who_has_extent:
-        pre_merge_host_trav = pre_merge_trav.get(queue=actx.queue)
+        pre_merge_host_trav = actx.to_numpy(pre_merge_trav)
 
     from boxtree.tree import ParticleListFilter
-    plfilt = ParticleListFilter(actx.context)
+    plfilt = ParticleListFilter(actx)
 
     tree_indep = ConstantOneTreeIndependentDataForWrangler()
 
@@ -252,16 +252,16 @@ def test_fmm_completeness(actx_factory, dims, nsources_req, ntargets_req,
 
         if filter_kind == "user":
             filtered_targets = plfilt.filter_target_lists_in_user_order(
-                    actx.queue, tree, flags)
+                    actx, tree, flags)
             wrangler = ConstantOneExpansionWranglerWithFilteredTargetsInUserOrder(
                     tree_indep, host_trav,
-                    filtered_targets.get(queue=actx.queue))
+                    actx.to_numpy(filtered_targets))
         elif filter_kind == "tree":
             filtered_targets = plfilt.filter_target_lists_in_tree_order(
-                    actx.queue, tree, flags)
+                    actx, tree, flags)
             wrangler = ConstantOneExpansionWranglerWithFilteredTargetsInTreeOrder(
                     tree_indep, host_trav,
-                    filtered_targets.get(queue=actx.queue))
+                    actx.to_numpy(filtered_targets))
         else:
             raise ValueError("unsupported value of 'filter_kind'")
     else:
@@ -402,25 +402,25 @@ def test_pyfmmlib_fmm(actx_factory, dims, use_dipoles, helmholtz_k):
     ntargets = 1000
     dtype = np.float64
 
-    sources = p_normal(actx.queue, nsources, dims, dtype, seed=15)
+    sources = p_normal(actx, nsources, dims, dtype, seed=15)
     targets = (
-            p_normal(actx.queue, ntargets, dims, dtype, seed=18)
+            p_normal(actx, ntargets, dims, dtype, seed=18)
             + np.array([2, 0, 0])[:dims])
 
-    sources_host = particle_array_to_host(sources)
-    targets_host = particle_array_to_host(targets)
+    sources_host = particle_array_to_host(actx, sources)
+    targets_host = particle_array_to_host(actx, targets)
 
     from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
+    tb = TreeBuilder(actx)
 
-    tree, _ = tb(actx.queue, sources, targets=targets,
+    tree, _ = tb(actx, sources, targets=targets,
             max_particles_in_box=30, debug=True)
 
     from boxtree.traversal import FMMTraversalBuilder
-    tbuild = FMMTraversalBuilder(actx.context)
-    trav, _ = tbuild(actx.queue, tree, debug=True)
+    tbuild = FMMTraversalBuilder(actx)
+    trav, _ = tbuild(actx, tree, debug=True)
 
-    trav = trav.get(queue=actx.queue)
+    trav = actx.to_numpy(trav)
 
     rng = np.random.default_rng(20)
     weights = rng.uniform(0.0, 1.0, (nsources,))
@@ -511,9 +511,13 @@ def fmm_level_to_order(tree, lev):
                 [knl],
                 exclude_self=False)
 
-        _evt, (sumpy_ref_pot,) = p2p(
-                actx.queue, targets, sources, (weights,),
-                out_host=True, **sumpy_extra_kwargs)
+        result, = p2p(
+                actx,
+                targets,
+                sources,
+                (actx.from_numpy(weights),),
+                **sumpy_extra_kwargs)
+        sumpy_ref_pot = actx.to_numpy(result)
 
         sumpy_rel_err = (
                 la.norm(pot - sumpy_ref_pot, np.inf)
@@ -554,18 +558,18 @@ def test_pyfmmlib_numerical_stability(actx_factory, dims, helmholtz_k, order):
     targets = sources * (1 + 1e-3)
 
     from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
+    tb = TreeBuilder(actx)
 
-    tree, _ = tb(actx.queue, sources, targets=targets,
+    tree, _ = tb(actx, sources, targets=targets,
             max_particles_in_box=2, debug=True)
 
     assert tree.nlevels >= 15
 
     from boxtree.traversal import FMMTraversalBuilder
-    tbuild = FMMTraversalBuilder(actx.context)
-    trav, _ = tbuild(actx.queue, tree, debug=True)
+    tbuild = FMMTraversalBuilder(actx)
+    trav, _ = tbuild(actx, tree, debug=True)
 
-    trav = trav.get(queue=actx.queue)
+    trav = actx.to_numpy(trav)
     weights = np.ones_like(sources[0])
 
     from boxtree.pyfmmlib_integration import (
@@ -585,7 +589,7 @@ def fmm_level_to_order(tree, lev):
             tree_indep, trav,
             helmholtz_k=helmholtz_k,
             fmm_level_to_order=fmm_level_to_order,
-            rotation_data=FMMLibRotationData(actx.queue, trav))
+            rotation_data=FMMLibRotationData(actx, trav))
 
     from boxtree.fmm import drive_fmm
     pot = drive_fmm(wrangler, (weights,))
@@ -625,8 +629,8 @@ def test_interaction_list_particle_count_thresholding(actx_factory, enable_exten
     from_sep_smaller_min_nsources_cumul = 1 + max_particles_in_box
 
     from boxtree.fmm import drive_fmm
-    sources = p_normal(actx.queue, nsources, dims, dtype, seed=15)
-    targets = p_normal(actx.queue, ntargets, dims, dtype, seed=15)
+    sources = p_normal(actx, nsources, dims, dtype, seed=15)
+    targets = p_normal(actx, ntargets, dims, dtype, seed=15)
 
     rng = np.random.default_rng(22)
     if enable_extents:
@@ -637,22 +641,22 @@ def test_interaction_list_particle_count_thresholding(actx_factory, enable_exten
         target_radii = None
 
     from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
+    tb = TreeBuilder(actx)
 
-    tree, _ = tb(actx.queue, sources, targets=targets,
+    tree, _ = tb(actx, sources, targets=targets,
             max_particles_in_box=max_particles_in_box,
             target_radii=target_radii,
             debug=True, stick_out_factor=0.25)
 
     from boxtree.traversal import FMMTraversalBuilder
-    tbuild = FMMTraversalBuilder(actx.context)
-    trav, _ = tbuild(actx.queue, tree, debug=True,
+    tbuild = FMMTraversalBuilder(actx)
+    trav, _ = tbuild(actx, tree, debug=True,
             _from_sep_smaller_min_nsources_cumul=from_sep_smaller_min_nsources_cumul)
 
     weights = np.ones(nsources)
     weights_sum = np.sum(weights)
 
-    host_trav = trav.get(queue=actx.queue)
+    host_trav = actx.to_numpy(trav)
 
     tree_indep = ConstantOneTreeIndependentDataForWrangler()
     wrangler = ConstantOneExpansionWrangler(tree_indep, host_trav)
@@ -680,8 +684,8 @@ def test_fmm_float32(actx_factory, enable_extents):
     dtype = np.float32
 
     from boxtree.fmm import drive_fmm
-    sources = p_normal(actx.queue, nsources, dims, dtype, seed=15)
-    targets = p_normal(actx.queue, ntargets, dims, dtype, seed=15)
+    sources = p_normal(actx, nsources, dims, dtype, seed=15)
+    targets = p_normal(actx, ntargets, dims, dtype, seed=15)
 
     rng = np.random.default_rng(12)
     if enable_extents:
@@ -692,21 +696,21 @@ def test_fmm_float32(actx_factory, enable_extents):
         target_radii = None
 
     from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
+    tb = TreeBuilder(actx)
 
-    tree, _ = tb(actx.queue, sources, targets=targets,
+    tree, _ = tb(actx, sources, targets=targets,
             max_particles_in_box=30,
             target_radii=target_radii,
             debug=True, stick_out_factor=0.25)
 
     from boxtree.traversal import FMMTraversalBuilder
-    tbuild = FMMTraversalBuilder(actx.context)
-    trav, _ = tbuild(actx.queue, tree, debug=True)
+    tbuild = FMMTraversalBuilder(actx)
+    trav, _ = tbuild(actx, tree, debug=True)
 
     weights = np.ones(nsources)
     weights_sum = np.sum(weights)
 
-    host_trav = trav.get(queue=actx.queue)
+    host_trav = actx.to_numpy(trav)
 
     tree_indep = ConstantOneTreeIndependentDataForWrangler()
     wrangler = ConstantOneExpansionWrangler(tree_indep, host_trav)
@@ -732,21 +736,21 @@ def test_fmm_with_optimized_3d_m2l(actx_factory, nsrcntgts, helmholtz_k,
     nsources = ntargets = nsrcntgts // 2
     dtype = np.float64
 
-    sources = p_normal(actx.queue, nsources, dims, dtype, seed=15)
+    sources = p_normal(actx, nsources, dims, dtype, seed=15)
     targets = (
-            p_normal(actx.queue, ntargets, dims, dtype, seed=18)
+            p_normal(actx, ntargets, dims, dtype, seed=18)
             + np.array([2, 0, 0])[:dims])
 
     from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
+    tb = TreeBuilder(actx)
 
-    tree, _ = tb(actx.queue, sources, targets=targets,
+    tree, _ = tb(actx, sources, targets=targets,
             max_particles_in_box=30, debug=True)
 
     from boxtree.traversal import FMMTraversalBuilder
-    tbuild = FMMTraversalBuilder(actx.context)
-    trav, _ = tbuild(actx.queue, tree, debug=True)
-    trav = trav.get(queue=actx.queue)
+    tbuild = FMMTraversalBuilder(actx)
+    trav, _ = tbuild(actx, tree, debug=True)
+    trav = actx.to_numpy(trav)
 
     rng = np.random.default_rng(20)
     weights = rng.uniform(0.0, 1.0, (nsources,))
@@ -781,7 +785,7 @@ def fmm_level_to_order(tree, lev):
             tree_indep, trav,
             helmholtz_k=helmholtz_k,
             fmm_level_to_order=fmm_level_to_order,
-            rotation_data=FMMLibRotationData(actx.queue, trav))
+            rotation_data=FMMLibRotationData(actx, trav))
 
     from boxtree.fmm import drive_fmm
 

From 994a3596e2078a161f0e335ba9943d8a92200304 Mon Sep 17 00:00:00 2001
From: Alexandru Fikl <alexfikl@gmail.com>
Date: Thu, 23 Jun 2022 21:58:27 +0300
Subject: [PATCH 19/28] port test_cost_model to arraycontext

---
 test/test_cost_model.py | 101 ++++++++++++++++++++--------------------
 1 file changed, 50 insertions(+), 51 deletions(-)

diff --git a/test/test_cost_model.py b/test/test_cost_model.py
index cd1fb95b..7c382a1a 100644
--- a/test/test_cost_model.py
+++ b/test/test_cost_model.py
@@ -32,10 +32,7 @@
 import pytest
 from arraycontext import pytest_generate_tests_for_array_contexts
 
-from boxtree.array_context import (
-    PytestPyOpenCLArrayContextFactory,
-    _acf,  # noqa: F401
-)
+from boxtree.array_context import PytestPyOpenCLArrayContextFactory, _acf  # noqa: F401
 from boxtree.cost import (
     FMMCostModel,
     _PythonFMMCostModel,
@@ -64,8 +61,8 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
     # {{{ Generate sources, targets and target_radii
 
     from boxtree.tools import make_normal_particle_array as p_normal
-    sources = p_normal(actx.queue, nsources, dims, dtype, seed=15)
-    targets = p_normal(actx.queue, ntargets, dims, dtype, seed=18)
+    sources = p_normal(actx, nsources, dims, dtype, seed=15)
+    targets = p_normal(actx, ntargets, dims, dtype, seed=18)
 
     rng = np.random.default_rng(22)
     target_radii = rng.uniform(0.0, 0.05, (ntargets,)).astype(dtype)
@@ -75,16 +72,16 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
     # {{{ Generate tree and traversal
 
     from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
+    tb = TreeBuilder(actx)
     tree, _ = tb(
-        actx.queue, sources, targets=targets, target_radii=target_radii,
+        actx, sources, targets=targets, target_radii=target_radii,
         stick_out_factor=0.15, max_particles_in_box=30, debug=True
     )
 
     from boxtree.traversal import FMMTraversalBuilder
-    tg = FMMTraversalBuilder(actx.context, well_sep_is_n_away=2)
-    trav_dev, _ = tg(actx.queue, tree, debug=True)
-    trav = trav_dev.get(queue=actx.queue)
+    tg = FMMTraversalBuilder(actx, well_sep_is_n_away=2)
+    trav_dev, _ = tg(actx, tree, debug=True)
+    trav = actx.to_numpy(trav_dev)
 
     # }}}
 
@@ -112,12 +109,12 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
             context=constant_one_params
         )
     p2m_cost_dev = actx.from_numpy(p2m_cost)
-
     actx.queue.finish()
+
     start_time = time.time()
 
     cl_form_multipoles = cl_cost_model.process_form_multipoles(
-        actx.queue, trav_dev, p2m_cost_dev
+        actx, trav_dev, p2m_cost_dev
     )
 
     actx.queue.finish()
@@ -127,7 +124,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
     start_time = time.time()
 
     python_form_multipoles = python_cost_model.process_form_multipoles(
-        actx.queue, trav, p2m_cost
+        actx, trav, p2m_cost
     )
 
     logger.info("Python time for process_form_multipoles: %gs",
@@ -150,7 +147,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
     actx.queue.finish()
     start_time = time.time()
     cl_coarsen_multipoles = cl_cost_model.process_coarsen_multipoles(
-        actx.queue, trav_dev, m2m_cost_dev
+        actx, trav_dev, m2m_cost_dev
     )
 
     actx.queue.finish()
@@ -160,7 +157,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
     start_time = time.time()
 
     python_coarsen_multipoles = python_cost_model.process_coarsen_multipoles(
-        actx.queue, trav, m2m_cost
+        actx, trav, m2m_cost
     )
 
     logger.info("Python time for coarsen_multipoles: %gs",
@@ -176,10 +173,10 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
     start_time = time.time()
 
     cl_ndirect_sources_per_target_box = \
-        cl_cost_model.get_ndirect_sources_per_target_box(actx.queue, trav_dev)
+        cl_cost_model.get_ndirect_sources_per_target_box(actx, trav_dev)
 
     cl_direct = cl_cost_model.process_direct(
-        actx.queue, trav_dev, cl_ndirect_sources_per_target_box, 5.0
+        actx, trav_dev, cl_ndirect_sources_per_target_box, 5.0
     )
 
     actx.queue.finish()
@@ -189,10 +186,10 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
     start_time = time.time()
 
     python_ndirect_sources_per_target_box = \
-        python_cost_model.get_ndirect_sources_per_target_box(actx.queue, trav)
+        python_cost_model.get_ndirect_sources_per_target_box(actx, trav)
 
     python_direct = python_cost_model.process_direct(
-        actx.queue, trav, python_ndirect_sources_per_target_box, 5.0
+        actx, trav, python_ndirect_sources_per_target_box, 5.0
     )
 
     logger.info("Python time for process_direct: %gs",
@@ -206,7 +203,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
 
     start_time = time.time()
 
-    cl_direct_aggregate = cl_cost_model.aggregate_over_boxes(cl_direct)
+    cl_direct_aggregate = cl_cost_model.aggregate_over_boxes(actx, cl_direct)
 
     actx.queue.finish()
     logger.info("OpenCL time for aggregate_over_boxes: %gs",
@@ -214,7 +211,9 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
 
     start_time = time.time()
 
-    python_direct_aggregate = python_cost_model.aggregate_over_boxes(python_direct)
+    python_direct_aggregate = (
+        python_cost_model.aggregate_over_boxes(actx, python_direct)
+    )
 
     logger.info("Python time for aggregate_over_boxes: %gs",
             time.time() - start_time)
@@ -237,14 +236,14 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
     actx.queue.finish()
     start_time = time.time()
 
-    cl_m2l_cost = cl_cost_model.process_list2(actx.queue, trav_dev, m2l_cost_dev)
+    cl_m2l_cost = cl_cost_model.process_list2(actx, trav_dev, m2l_cost_dev)
 
     actx.queue.finish()
     logger.info("OpenCL time for process_list2: %gs",
             time.time() - start_time)
 
     start_time = time.time()
-    python_m2l_cost = python_cost_model.process_list2(actx.queue, trav, m2l_cost)
+    python_m2l_cost = python_cost_model.process_list2(actx, trav, m2l_cost)
     logger.info("Python time for process_list2: %gs",
             time.time() - start_time)
 
@@ -265,14 +264,14 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
     actx.queue.finish()
     start_time = time.time()
 
-    cl_m2p_cost = cl_cost_model.process_list3(actx.queue, trav_dev, m2p_cost_dev)
+    cl_m2p_cost = cl_cost_model.process_list3(actx, trav_dev, m2p_cost_dev)
 
     actx.queue.finish()
     logger.info("OpenCL time for process_list3: %gs",
             time.time() - start_time)
 
     start_time = time.time()
-    python_m2p_cost = python_cost_model.process_list3(actx.queue, trav, m2p_cost)
+    python_m2p_cost = python_cost_model.process_list3(actx, trav, m2p_cost)
     logger.info("Python time for process_list3: %gs",
             time.time() - start_time)
 
@@ -293,14 +292,14 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
     actx.queue.finish()
     start_time = time.time()
 
-    cl_p2l_cost = cl_cost_model.process_list4(actx.queue, trav_dev, p2l_cost_dev)
+    cl_p2l_cost = cl_cost_model.process_list4(actx, trav_dev, p2l_cost_dev)
 
     actx.queue.finish()
     logger.info("OpenCL time for process_list4: %gs",
             time.time() - start_time)
 
     start_time = time.time()
-    python_p2l_cost = python_cost_model.process_list4(actx.queue, trav, p2l_cost)
+    python_p2l_cost = python_cost_model.process_list4(actx, trav, p2l_cost)
     logger.info("Python time for process_list4: %gs",
             time.time() - start_time)
 
@@ -322,7 +321,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
     start_time = time.time()
 
     cl_refine_locals_cost = cl_cost_model.process_refine_locals(
-        actx.queue, trav_dev, l2l_cost_dev
+        actx, trav_dev, l2l_cost_dev
     )
 
     actx.queue.finish()
@@ -331,7 +330,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
 
     start_time = time.time()
     python_refine_locals_cost = python_cost_model.process_refine_locals(
-        actx.queue, trav, l2l_cost
+        actx, trav, l2l_cost
     )
     logger.info("Python time for refine_locals: %gs",
             time.time() - start_time)
@@ -354,7 +353,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
     start_time = time.time()
 
     cl_l2p_cost = cl_cost_model.process_eval_locals(
-            actx.queue, trav_dev, l2p_cost_dev)
+            actx, trav_dev, l2p_cost_dev)
 
     actx.queue.finish()
     logger.info("OpenCL time for process_eval_locals: %gs",
@@ -362,7 +361,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
 
     start_time = time.time()
     python_l2p_cost = python_cost_model.process_eval_locals(
-            actx.queue, trav, l2p_cost)
+            actx, trav, l2p_cost)
     logger.info("Python time for process_eval_locals: %gs",
             time.time() - start_time)
 
@@ -404,8 +403,8 @@ def fmm_level_to_order(tree, ilevel):
         # {{{ Generate sources, targets and target_radii
 
         from boxtree.tools import make_normal_particle_array as p_normal
-        sources = p_normal(actx.queue, nsources, dims, dtype, seed=15)
-        targets = p_normal(actx.queue, ntargets, dims, dtype, seed=18)
+        sources = p_normal(actx, nsources, dims, dtype, seed=15)
+        targets = p_normal(actx, ntargets, dims, dtype, seed=18)
 
         rng = np.random.default_rng(22)
         target_radii = rng.uniform(0.0, 0.05, (ntargets,)).astype(dtype)
@@ -415,16 +414,16 @@ def fmm_level_to_order(tree, ilevel):
         # {{{ Generate tree and traversal
 
         from boxtree import TreeBuilder
-        tb = TreeBuilder(actx.context)
+        tb = TreeBuilder(actx)
         tree, _ = tb(
-            actx.queue, sources, targets=targets, target_radii=target_radii,
+            actx, sources, targets=targets, target_radii=target_radii,
             stick_out_factor=0.15, max_particles_in_box=30, debug=True
         )
 
         from boxtree.traversal import FMMTraversalBuilder
-        tg = FMMTraversalBuilder(actx.context, well_sep_is_n_away=2)
-        trav_dev, _ = tg(actx.queue, tree, debug=True)
-        trav = trav_dev.get(queue=actx.queue)
+        tg = FMMTraversalBuilder(actx, well_sep_is_n_away=2)
+        trav_dev, _ = tg(actx, tree, debug=True)
+        trav = actx.to_numpy(trav_dev)
 
         traversals.append(trav)
         traversals_dev.append(trav_dev)
@@ -467,7 +466,7 @@ def test_params_equal(test_params1, test_params2):
         level_to_order = level_to_orders[icase]
 
         python_model_results.append(python_cost_model.cost_per_stage(
-            actx.queue, traversal, level_to_order,
+            actx, traversal, level_to_order,
             _PythonFMMCostModel.get_unit_calibration_params(),
         ))
 
@@ -486,7 +485,7 @@ def test_params_equal(test_params1, test_params2):
         level_to_order = level_to_orders[icase]
 
         cl_model_results.append(cl_cost_model.cost_per_stage(
-            actx.queue, traversal, level_to_order,
+            actx, traversal, level_to_order,
             FMMCostModel.get_unit_calibration_params(),
         ))
 
@@ -539,23 +538,23 @@ def test_cost_model_op_counts_agree_with_constantone_wrangler(
     actx = actx_factory()
 
     from boxtree.tools import make_normal_particle_array as p_normal
-    sources = p_normal(actx.queue, nsources, dims, dtype, seed=16)
-    targets = p_normal(actx.queue, ntargets, dims, dtype, seed=19)
+    sources = p_normal(actx, nsources, dims, dtype, seed=16)
+    targets = p_normal(actx, ntargets, dims, dtype, seed=19)
 
     rng = np.random.default_rng(20)
     target_radii = rng.uniform(0, 0.04, (ntargets,)).astype(dtype)
 
     from boxtree import TreeBuilder
-    tb = TreeBuilder(actx.context)
+    tb = TreeBuilder(actx)
     tree, _ = tb(
-        actx.queue, sources, targets=targets, target_radii=target_radii,
+        actx, sources, targets=targets, target_radii=target_radii,
         stick_out_factor=0.15, max_particles_in_box=30, debug=True
     )
 
     from boxtree.traversal import FMMTraversalBuilder
-    tg = FMMTraversalBuilder(actx.context, well_sep_is_n_away=2)
-    trav_dev, _ = tg(actx.queue, tree, debug=True)
-    trav = trav_dev.get(queue=actx.queue)
+    tg = FMMTraversalBuilder(actx, well_sep_is_n_away=2)
+    trav_dev, _ = tg(actx, tree, debug=True)
+    trav = actx.to_numpy(trav_dev)
 
     from boxtree.constant_one import (
         ConstantOneExpansionWrangler,
@@ -576,7 +575,7 @@ def test_cost_model_op_counts_agree_with_constantone_wrangler(
     level_to_order = np.array([1 for _ in range(tree.nlevels)])
 
     modeled_time = cost_model.cost_per_stage(
-        actx.queue, trav_dev, level_to_order,
+        actx, trav_dev, level_to_order,
         FMMCostModel.get_unit_calibration_params(),
     )
 
@@ -595,10 +594,10 @@ def test_cost_model_op_counts_agree_with_constantone_wrangler(
         total_cost += timing_data[stage]["ops_elapsed"]
 
     per_box_cost = cost_model.cost_per_box(
-        actx.queue, trav_dev, level_to_order,
+        actx, trav_dev, level_to_order,
         FMMCostModel.get_unit_calibration_params(),
     )
-    total_aggregate_cost = cost_model.aggregate_over_boxes(per_box_cost)
+    total_aggregate_cost = cost_model.aggregate_over_boxes(actx, per_box_cost)
 
     assert total_cost == (
             total_aggregate_cost

From 53e7c28b9c3a0c5299ae373afda3a0a1f1e6753c Mon Sep 17 00:00:00 2001
From: Alexandru Fikl <alexfikl@gmail.com>
Date: Sun, 26 Jun 2022 20:00:45 +0300
Subject: [PATCH 20/28] port test_distributed to arraycontext

---
 test/test_distributed.py | 70 ++++++++++++++++++++++++----------------
 1 file changed, 42 insertions(+), 28 deletions(-)

diff --git a/test/test_distributed.py b/test/test_distributed.py
index ce975926..41e5ac6e 100644
--- a/test/test_distributed.py
+++ b/test/test_distributed.py
@@ -39,6 +39,7 @@
 )
 from boxtree.pyfmmlib_integration import (
     FMMLibExpansionWrangler,
+    FMMLibRotationData,
     FMMLibTreeIndependentDataForWrangler,
     Kernel,
 )
@@ -84,7 +85,7 @@ def fmm_level_to_order(tree, level):
         actx = _acf()
 
         from boxtree.traversal import FMMTraversalBuilder
-        tg = FMMTraversalBuilder(actx.context, well_sep_is_n_away=2)
+        tg = FMMTraversalBuilder(actx, well_sep_is_n_away=2)
 
         tree_indep = FMMLibTreeIndependentDataForWrangler(
             dims, Kernel.HELMHOLTZ if helmholtz_k else Kernel.LAPLACE)
@@ -93,8 +94,8 @@ def fmm_level_to_order(tree, level):
         if rank == 0:
             # Generate random particles and source weights
             from boxtree.tools import make_normal_particle_array as p_normal
-            sources = p_normal(actx.queue, nsources, dims, dtype, seed=15)
-            targets = p_normal(actx.queue, ntargets, dims, dtype, seed=18)
+            sources = p_normal(actx, nsources, dims, dtype, seed=15)
+            targets = p_normal(actx, ntargets, dims, dtype, seed=18)
 
             rng = np.random.default_rng(20)
             sources_weights = rng.uniform(0.0, 1.0, (nsources,))
@@ -102,19 +103,20 @@ def fmm_level_to_order(tree, level):
 
             # Build the tree and interaction lists
             from boxtree import TreeBuilder
-            tb = TreeBuilder(actx.context)
+            tb = TreeBuilder(actx)
             global_tree_dev, _ = tb(
-                actx.queue, sources, targets=targets, target_radii=target_radii,
+                actx, sources, targets=targets, target_radii=target_radii,
                 stick_out_factor=0.25, max_particles_in_box=30, debug=True)
 
-            d_trav, _ = tg(actx.queue, global_tree_dev, debug=True)
-            global_traversal_host = d_trav.get(queue=actx.queue)
+            d_trav, _ = tg(actx, global_tree_dev, debug=True)
+            global_traversal_host = actx.to_numpy(d_trav)
             global_tree_host = global_traversal_host.tree
 
             # Get pyfmmlib expansion wrangler
             wrangler = FMMLibExpansionWrangler(
                     tree_indep, global_traversal_host,
-                    fmm_level_to_order=fmm_level_to_order)
+                    fmm_level_to_order=fmm_level_to_order,
+                    rotation_data=FMMLibRotationData(actx, global_traversal_host))
 
             # Compute FMM with one MPI rank
             from boxtree.fmm import drive_fmm
@@ -128,13 +130,13 @@ def wrangler_factory(local_traversal, global_traversal):
             )
 
             return DistributedFMMLibExpansionWrangler(
-                actx.context, comm, tree_indep, local_traversal, global_traversal,
+                comm, tree_indep, local_traversal, global_traversal,
                 fmm_level_to_order=fmm_level_to_order,
                 communicate_mpoles_via_allreduce=communicate_mpoles_via_allreduce)
 
         from boxtree.distributed import DistributedFMMRunner
         distributed_fmm_info = DistributedFMMRunner(
-            actx.queue, global_tree_host, tg, wrangler_factory, comm=comm)
+            actx, global_tree_host, tg, wrangler_factory, comm=comm)
 
         timing_data = {}
         pot_dfmm = distributed_fmm_info.drive_dfmm(
@@ -188,31 +190,42 @@ def test_against_shared(
 # {{{ test_constantone
 
 def _test_constantone(tmp_cache_basedir, dims, nsources, ntargets, dtype):
-    from boxtree.distributed.calculation import DistributedExpansionWrangler
+    from boxtree.distributed.calculation import DistributedExpansionWranglerMixin
 
     class ConstantOneExpansionWrangler(
-            ConstantOneExpansionWranglerBase, DistributedExpansionWrangler):
+            DistributedExpansionWranglerMixin,
+            ConstantOneExpansionWranglerBase):
         def __init__(
-                self, queue, comm, tree_indep, local_traversal, global_traversal):
-            DistributedExpansionWrangler.__init__(
-                self, queue, comm, global_traversal, False,
-                communicate_mpoles_via_allreduce=True)
+                self, array_context, comm,
+                tree_indep, local_traversal, global_traversal):
             ConstantOneExpansionWranglerBase.__init__(
                 self, tree_indep, local_traversal)
+
+            self._setup_actx = array_context
+            self.comm = comm
+            self.global_traversal = global_traversal
+            self.communicate_mpoles_via_allreduce = True
+
             self.level_orders = np.ones(local_traversal.tree.nlevels, dtype=np.int32)
 
         def reorder_sources(self, source_array):
-            if self.comm.Get_rank() == 0:
+            if self.is_mpi_root:
                 return source_array[self.global_traversal.tree.user_source_ids]
             else:
                 return None
 
         def reorder_potentials(self, potentials):
-            if self.comm.Get_rank() == 0:
+            if self.is_mpi_root:
                 return potentials[self.global_traversal.tree.sorted_target_ids]
             else:
                 return None
 
+        def finalize_potentials(self, potentials, template_ary):
+            if self.is_mpi_root:
+                return super().finalize_potentials(potentials, template_ary)
+            else:
+                return None
+
     from mpi4py import MPI
 
     # Get the current rank
@@ -229,14 +242,14 @@ def reorder_potentials(self, potentials):
         actx = _acf()
 
         from boxtree.traversal import FMMTraversalBuilder
-        tg = FMMTraversalBuilder(actx.context)
+        tg = FMMTraversalBuilder(actx)
 
         if rank == 0:
 
             # Generate random particles
             from boxtree.tools import make_normal_particle_array as p_normal
-            sources = p_normal(actx.queue, nsources, dims, dtype, seed=15)
-            targets = (p_normal(actx.queue, ntargets, dims, dtype, seed=18)
+            sources = p_normal(actx, nsources, dims, dtype, seed=15)
+            targets = (p_normal(actx, ntargets, dims, dtype, seed=18)
                        + np.array([2, 0, 0])[:dims])
 
             # Constant one source weights
@@ -244,21 +257,20 @@ def reorder_potentials(self, potentials):
 
             # Build the global tree
             from boxtree import TreeBuilder
-            tb = TreeBuilder(actx.context)
-            tree, _ = tb(
-                    actx.queue, sources, targets=targets, max_particles_in_box=30,
-                    debug=True)
-            tree = tree.get(actx.queue)
+            tb = TreeBuilder(actx)
+            tree, _ = tb(actx, sources, targets=targets, max_particles_in_box=30,
+                        debug=True)
+            tree = actx.to_numpy(tree)
 
         tree_indep = ConstantOneTreeIndependentDataForWrangler()
 
         def wrangler_factory(local_traversal, global_traversal):
             return ConstantOneExpansionWrangler(
-                    actx.queue, comm, tree_indep, local_traversal, global_traversal)
+                    actx, comm, tree_indep, local_traversal, global_traversal)
 
         from boxtree.distributed import DistributedFMMRunner
         distributed_fmm_info = DistributedFMMRunner(
-            actx.queue, tree, tg, wrangler_factory, comm=MPI.COMM_WORLD)
+            actx, tree, tg, wrangler_factory, comm=MPI.COMM_WORLD)
 
         pot_dfmm = distributed_fmm_info.drive_dfmm([sources_weights])
 
@@ -322,3 +334,5 @@ def test_constantone(tmp_path, num_processes, dims, nsources, ntargets):
             ntargets = 10000
 
             _test_against_shared(tmp_cache_basedir, dims, nsources, ntargets, dtype)
+
+# vim: fdm=marker

From f50c1d90f14ed8a6abd6078d3585188fc64c329e Mon Sep 17 00:00:00 2001
From: Alexandru Fikl <alexfikl@gmail.com>
Date: Sun, 26 Jun 2022 20:00:26 +0300
Subject: [PATCH 21/28] port examples to arraycontext

---
 examples/cost_model.py | 44 ++++++++++++++----------------------------
 examples/demo.py       | 31 +++++++++++++++--------------
 2 files changed, 32 insertions(+), 43 deletions(-)

diff --git a/examples/cost_model.py b/examples/cost_model.py
index 60943210..7b783891 100644
--- a/examples/cost_model.py
+++ b/examples/cost_model.py
@@ -1,31 +1,18 @@
 import logging
 import os
-import sys
 
 import numpy as np
 
 import pyopencl as cl
 
 
-# Configure the root logger
 logging.basicConfig(level=os.environ.get("LOGLEVEL", "WARNING"))
-
 logger = logging.getLogger(__name__)
-
-# Set the logger level of this module to INFO so that logging outputs of this module
-# are shown
 logger.setLevel(logging.INFO)
 
-# `process_elapsed` in `ProcessTimer` is only supported for Python >= 3.3
-SUPPORTS_PROCESS_TIME = (sys.version_info >= (3, 3))
-
 
 def demo_cost_model():
-    if not SUPPORTS_PROCESS_TIME:
-        raise NotImplementedError(
-            "Currently this script uses process time which only works on Python>=3.3"
-        )
-
+    from boxtree.array_context import PyOpenCLArrayContext
     from boxtree.pyfmmlib_integration import (
         FMMLibExpansionWrangler,
         FMMLibTreeIndependentDataForWrangler,
@@ -40,6 +27,7 @@ def demo_cost_model():
 
     ctx = cl.create_some_context()
     queue = cl.CommandQueue(ctx)
+    actx = PyOpenCLArrayContext(queue, force_device_scalars=True)
 
     traversals = []
     traversals_dev = []
@@ -53,31 +41,29 @@ def fmm_level_to_order(tree, ilevel):
         # {{{ Generate sources, targets and target_radii
 
         from boxtree.tools import make_normal_particle_array as p_normal
-        sources = p_normal(queue, nsources, dims, dtype, seed=15)
-        targets = p_normal(queue, ntargets, dims, dtype, seed=18)
-
-        from pyopencl.clrandom import PhiloxGenerator
+        sources = p_normal(actx, nsources, dims, dtype, seed=15)
+        targets = p_normal(actx, ntargets, dims, dtype, seed=18)
 
-        clrng = PhiloxGenerator(queue.context, seed=22)
-        target_radii = clrng.uniform(
-            queue, ntargets, a=0, b=0.05, dtype=dtype
-        ).get()
+        rng = np.random.default_rng(seed=22)
+        target_radii = actx.from_numpy(
+            rng.uniform(low=0.0, high=0.05, size=ntargets)
+        )
 
         # }}}
 
         # {{{ Generate tree and traversal
 
         from boxtree import TreeBuilder
-        tb = TreeBuilder(ctx)
+        tb = TreeBuilder(actx)
         tree, _ = tb(
-            queue, sources, targets=targets, target_radii=target_radii,
+            actx, sources, targets=targets, target_radii=target_radii,
             stick_out_factor=0.15, max_particles_in_box=30, debug=True
         )
 
         from boxtree.traversal import FMMTraversalBuilder
-        tg = FMMTraversalBuilder(ctx, well_sep_is_n_away=2)
-        trav_dev, _ = tg(queue, tree, debug=True)
-        trav = trav_dev.get(queue=queue)
+        tg = FMMTraversalBuilder(actx, well_sep_is_n_away=2)
+        trav_dev, _ = tg(actx, tree, debug=True)
+        trav = actx.to_numpy(trav_dev)
 
         traversals.append(trav)
         traversals_dev.append(trav_dev)
@@ -107,7 +93,7 @@ def fmm_level_to_order(tree, ilevel):
         traversal = traversals_dev[icase]
         model_results.append(
             cost_model.cost_per_stage(
-                queue, traversal, level_orders_list[icase],
+                actx, traversal, level_orders_list[icase],
                 FMMCostModel.get_unit_calibration_params(),
             )
         )
@@ -118,7 +104,7 @@ def fmm_level_to_order(tree, ilevel):
     )
 
     predicted_time = cost_model.cost_per_stage(
-        queue, traversals_dev[-1], level_orders_list[-1], params,
+        actx, traversals_dev[-1], level_orders_list[-1], params,
     )
     queue.finish()
 
diff --git a/examples/demo.py b/examples/demo.py
index cb87bef7..f3ce1715 100644
--- a/examples/demo.py
+++ b/examples/demo.py
@@ -8,8 +8,12 @@
 
 logging.basicConfig(level="INFO")
 
+from boxtree.array_context import PyOpenCLArrayContext
+
+
 ctx = cl.create_some_context()
 queue = cl.CommandQueue(ctx)
+actx = PyOpenCLArrayContext(queue, force_device_scalars=True)
 
 dims = 2
 nparticles = 500
@@ -17,16 +21,13 @@
 # -----------------------------------------------------------------------------
 # generate some random particle positions
 # -----------------------------------------------------------------------------
-from pyopencl.clrandom import PhiloxGenerator
-
-
-rng = PhiloxGenerator(ctx, seed=15)
-
 from pytools.obj_array import make_obj_array
 
 
+rng = np.random.default_rng(seed=15)
+
 particles = make_obj_array([
-    rng.normal(queue, nparticles, dtype=np.float64)
+    actx.from_numpy(rng.normal(size=nparticles))
     for i in range(dims)])
 
 # -----------------------------------------------------------------------------
@@ -35,14 +36,14 @@
 from boxtree import TreeBuilder
 
 
-tb = TreeBuilder(ctx)
-tree, _ = tb(queue, particles, max_particles_in_box=5)
+tb = TreeBuilder(actx)
+tree, _ = tb(actx, particles, max_particles_in_box=5)
 
 from boxtree.traversal import FMMTraversalBuilder
 
 
-tg = FMMTraversalBuilder(ctx)
-trav, _ = tg(queue, tree)
+tg = FMMTraversalBuilder(actx)
+trav, _ = tg(actx, tree)
 
 # ENDEXAMPLE
 
@@ -50,15 +51,17 @@
 # plot the tree
 # -----------------------------------------------------------------------------
 
-import matplotlib.pyplot as pt
-
+particles = actx.to_numpy(particles)
+tree = actx.to_numpy(tree)
 
-pt.plot(particles[0].get(), particles[1].get(), "+")
+import matplotlib.pyplot as pt
 
 from boxtree.visualization import TreePlotter
 
 
-plotter = TreePlotter(tree.get(queue=queue))
+pt.plot(particles[0], particles[1], "+")
+plotter = TreePlotter(tree)
+
 plotter.draw_tree(fill=False, edgecolor="black")
 # plotter.draw_box_numbers()
 plotter.set_bounding_box()

From 8efb304b70b1706c89f133ef1516b46028301777 Mon Sep 17 00:00:00 2001
From: Alexandru Fikl <alexfikl@gmail.com>
Date: Sun, 26 Jun 2022 20:18:43 +0300
Subject: [PATCH 22/28] remove ImmutableHostDeviceArray

---
 boxtree/tools.py | 92 ++++--------------------------------------------
 1 file changed, 6 insertions(+), 86 deletions(-)

diff --git a/boxtree/tools.py b/boxtree/tools.py
index ab4240c6..8c72275f 100644
--- a/boxtree/tools.py
+++ b/boxtree/tools.py
@@ -127,7 +127,7 @@ def get_2d_knl(dtype):
 
             knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
 
-            return knl.executor(queue.context)
+            return knl.executor(actx.context)
 
         _evt, result = get_2d_knl(dtype)(actx.queue, n=nparticles)
 
@@ -161,7 +161,7 @@ def get_3d_knl(dtype):
             knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
             knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
 
-            return knl.executor(queue.context)
+            return knl.executor(actx.context)
 
         _evt, result = get_3d_knl(dtype)(actx.queue, n=n)
 
@@ -204,7 +204,7 @@ def get_2d_knl(dtype):
             knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
             knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
 
-            return knl.executor(queue.context)
+            return knl.executor(actx.context)
 
         _evt, result = get_2d_knl(dtype)(actx.queue, n=n)
 
@@ -252,7 +252,7 @@ def get_3d_knl(dtype):
             knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1")
             knl = lp.split_iname(knl, "k", 16, outer_tag="g.0", inner_tag="l.0")
 
-            return knl.executor(queue.context)
+            return knl.executor(actx.context)
 
         _evt, result = get_3d_knl(dtype)(actx.queue, n=n)
 
@@ -326,9 +326,8 @@ def transform_val(val):
     def get(self, queue, **kwargs):
         """
         :returns: a copy of *self* in which all data lives on the host, i.e.
-            all :class:`pyopencl.array.Array` and `ImmutableHostDeviceArray`
-            objects are replaced by corresponding :class:`numpy.ndarray`
-            instances on the host.
+            all :class:`pyopencl.array.Array` objects are replaced by
+            corresponding :class:`numpy.ndarray` instances on the host.
         """
         from warnings import warn
         warn(f"{type(self).__name__}.get is deprecated and will be removed "
@@ -336,9 +335,6 @@ def get(self, queue, **kwargs):
             DeprecationWarning, stacklevel=2)
 
         def try_get(attr):
-            if isinstance(attr, ImmutableHostDeviceArray):
-                return attr.host
-
             try:
                 return attr.get(queue=queue, **kwargs)
             except AttributeError:
@@ -383,8 +379,6 @@ def to_device(self, queue, exclude_fields=frozenset()):
         def _to_device(attr):
             if isinstance(attr, np.ndarray):
                 return cl.array.to_device(queue, attr).with_queue(None)
-            elif isinstance(attr, ImmutableHostDeviceArray):
-                return attr.device
             elif isinstance(attr, DeviceDataRecord):
                 return attr.to_device(queue)
             else:
@@ -392,31 +386,6 @@ def _to_device(attr):
 
         return self._transform_arrays(_to_device, exclude_fields=exclude_fields)
 
-    def to_host_device_array(self, queue, exclude_fields=frozenset()):
-        """
-        :arg exclude_fields: a :class:`frozenset` containing fields excluded
-            from transformed to `ImmutableHostDeviceArray`.
-
-        :returns: a copy of *self* where all device and host arrays are
-            transformed to `ImmutableHostDeviceArray` objects.
-        """
-        from warnings import warn
-        warn(f"{type(self).__name__}.to_host_device_array is deprecated and will "
-            "be removed in 2025. Switch from ImmutableHostDeviceArray.",
-            DeprecationWarning, stacklevel=2)
-
-        def _to_host_device_array(attr):
-            if isinstance(attr, np.ndarray | cl.array.Array):
-                return ImmutableHostDeviceArray(queue, attr)
-            elif isinstance(attr, DeviceDataRecord):
-                return attr.to_host_device_array(queue)
-            else:
-                return attr
-
-        return self._transform_arrays(
-            _to_host_device_array, exclude_fields=exclude_fields
-        )
-
 # }}}
 
 
@@ -910,55 +879,6 @@ def run_mpi(script: str, num_processes: int, env: dict[str, Any]) -> None:
 # }}}
 
 
-# {{{ HostDeviceArray
-
-class ImmutableHostDeviceArray:
-    """Interface for arrays on both host and device.
-
-    .. note:: This interface assumes the array is immutable. The behavior of
-    modifying the content of either the host array or the device array is undefined.
-
-    @TODO: Once available, replace this implementation with PyOpenCL's in-house
-    implementation.
-    """
-    def __init__(self, queue, array):
-        self.queue = queue
-        self.shape = array.shape
-        self.host_array = None
-        self.device_array = None
-
-        if isinstance(array, np.ndarray):
-            self.host_array = array
-        elif isinstance(array, cl.array.Array):
-            self.device_array = array
-
-    def with_queue(self, queue):
-        self.queue = queue
-
-    @property
-    def svm_capable(self):
-        svm_capabilities = \
-            self.queue.device.get_info(cl.device_info.SVM_CAPABILITIES)
-        return svm_capabilities & cl.device_svm_capabilities.FINE_GRAIN_BUFFER != 0
-
-    @property
-    def host(self):
-        if self.host_array is None:
-            self.host_array = self.device_array.get(self.queue)
-        return self.host_array
-
-    @property
-    def device(self):
-        if self.device_array is None:
-            # @TODO: Use SVM
-            self.device_array = cl.array.to_device(self.queue, self.host_array)
-
-        self.device_array.with_queue(self.queue)
-        return self.device_array
-
-# }}}
-
-
 # {{{ coord_vec tools
 
 def get_coord_vec_dtype(

From 0aee29841ae6512dfd2eea8228db78c60d30d524 Mon Sep 17 00:00:00 2001
From: Alexandru Fikl <alexfikl@gmail.com>
Date: Fri, 24 Jun 2022 19:36:54 +0300
Subject: [PATCH 23/28] docs: add arraycontext

---
 .pylintrc-local.yml |   4 --
 boxtree/__init__.py |   8 +--
 boxtree/tree.py     |   4 --
 doc/Makefile        | 136 +++++---------------------------------------
 doc/tools.rst       |   2 +
 5 files changed, 19 insertions(+), 135 deletions(-)

diff --git a/.pylintrc-local.yml b/.pylintrc-local.yml
index 745ae717..e6cde7f0 100644
--- a/.pylintrc-local.yml
+++ b/.pylintrc-local.yml
@@ -3,7 +3,3 @@
 
 - arg: extension-pkg-whitelist
   val: pyfmmlib
-
-# Needed for boxtree.tools
-- arg: init-hook
-  val: import sys; sys.setrecursionlimit(2000)
diff --git a/boxtree/__init__.py b/boxtree/__init__.py
index 6bfda26d..c8928c79 100644
--- a/boxtree/__init__.py
+++ b/boxtree/__init__.py
@@ -140,15 +140,15 @@
 two arrays, one whose name ends in ``_starts``, and another whose
 name ends in ``_lists``. For example,
 suppose we would like to find the colleagues of box #17 using
-:attr:`boxtree.traversal.FMMTraversalInfo.colleagues_starts`
+:attr:`boxtree.traversal.FMMTraversalInfo.same_level_non_well_sep_boxes_starts`
 and
-:attr:`boxtree.traversal.FMMTraversalInfo.colleagues_lists`.
+:attr:`boxtree.traversal.FMMTraversalInfo.same_level_non_well_sep_boxes_lists`.
 
 The following snippet of code achieves this::
 
     ibox = 17
-    start, end = colleagues_starts[ibox:ibox+2]
-    ibox_colleagues = colleagues_lists[start:end]
+    start, end = same_level_non_well_sep_boxes_starts[ibox:ibox+2]
+    ibox_colleagues = same_level_non_well_sep_boxes_lists[start:end]
 
 This indexing scheme has the following properties:
 
diff --git a/boxtree/tree.py b/boxtree/tree.py
index 857c04cc..f4a97d1f 100644
--- a/boxtree/tree.py
+++ b/boxtree/tree.py
@@ -52,10 +52,6 @@
 ^^^^^
 
 .. autoclass:: ParticleListFilter
-
-.. autofunction:: filter_target_lists_in_user_order
-
-.. autofunction:: filter_target_lists_in_tree_order
 """
 
 __copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
diff --git a/doc/Makefile b/doc/Makefile
index bb66c0e8..d0ac5f2f 100644
--- a/doc/Makefile
+++ b/doc/Makefile
@@ -1,130 +1,20 @@
-# Makefile for Sphinx documentation
+# Minimal makefile for Sphinx documentation
 #
 
-# You can set these variables from the command line.
-SPHINXOPTS    = -n
-SPHINXBUILD   = python $(shell which sphinx-build)
-PAPER         =
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= python $(shell which sphinx-build)
+SOURCEDIR     = .
 BUILDDIR      = _build
 
-# Internal variables.
-PAPEROPT_a4     = -D latex_paper_size=a4
-PAPEROPT_letter = -D latex_paper_size=letter
-ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
-
-.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest
-
+# Put it first so that "make" without argument is like "make help".
 help:
-	@echo "Please use \`make <target>' where <target> is one of"
-	@echo "  html       to make standalone HTML files"
-	@echo "  dirhtml    to make HTML files named index.html in directories"
-	@echo "  singlehtml to make a single large HTML file"
-	@echo "  pickle     to make pickle files"
-	@echo "  json       to make JSON files"
-	@echo "  htmlhelp   to make HTML files and a HTML help project"
-	@echo "  qthelp     to make HTML files and a qthelp project"
-	@echo "  devhelp    to make HTML files and a Devhelp project"
-	@echo "  epub       to make an epub"
-	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
-	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
-	@echo "  text       to make text files"
-	@echo "  man        to make manual pages"
-	@echo "  changes    to make an overview of all changed/added/deprecated items"
-	@echo "  linkcheck  to check all external links for integrity"
-	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
-
-clean:
-	-rm -rf $(BUILDDIR)/*
-
-html:
-	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
-	@echo
-	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
-
-dirhtml:
-	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
-	@echo
-	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
-
-singlehtml:
-	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
-	@echo
-	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
-
-pickle:
-	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
-	@echo
-	@echo "Build finished; now you can process the pickle files."
-
-json:
-	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
-	@echo
-	@echo "Build finished; now you can process the JSON files."
-
-htmlhelp:
-	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
-	@echo
-	@echo "Build finished; now you can run HTML Help Workshop with the" \
-	      ".hhp project file in $(BUILDDIR)/htmlhelp."
-
-qthelp:
-	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
-	@echo
-	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
-	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
-	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/boxtree.qhcp"
-	@echo "To view the help file:"
-	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/boxtree.qhc"
-
-devhelp:
-	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
-	@echo
-	@echo "Build finished."
-	@echo "To view the help file:"
-	@echo "# mkdir -p $$HOME/.local/share/devhelp/boxtree"
-	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/boxtree"
-	@echo "# devhelp"
-
-epub:
-	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
-	@echo
-	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
-
-latex:
-	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
-	@echo
-	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
-	@echo "Run \`make' in that directory to run these through (pdf)latex" \
-	      "(use \`make latexpdf' here to do that automatically)."
-
-latexpdf:
-	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
-	@echo "Running LaTeX files through pdflatex..."
-	make -C $(BUILDDIR)/latex all-pdf
-	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
-
-text:
-	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
-	@echo
-	@echo "Build finished. The text files are in $(BUILDDIR)/text."
-
-man:
-	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
-	@echo
-	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
-
-changes:
-	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
-	@echo
-	@echo "The overview file is in $(BUILDDIR)/changes."
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
 
-linkcheck:
-	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
-	@echo
-	@echo "Link check complete; look for any errors in the above output " \
-	      "or in $(BUILDDIR)/linkcheck/output.txt."
+.PHONY: help Makefile
 
-doctest:
-	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
-	@echo "Testing of doctests in the sources finished, look at the " \
-	      "results in $(BUILDDIR)/doctest/output.txt."
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/doc/tools.rst b/doc/tools.rst
index 6db9bc70..0b5225ee 100644
--- a/doc/tools.rst
+++ b/doc/tools.rst
@@ -4,3 +4,5 @@ Utility Functionality
 .. automodule:: boxtree.timing
 
 .. automodule:: boxtree.constant_one
+
+.. automodule:: boxtree.array_context

From 4ca76b6ed513fabe7d6e46241390394d8a0955ea Mon Sep 17 00:00:00 2001
From: Alexandru Fikl <alexfikl@gmail.com>
Date: Wed, 7 Sep 2022 20:47:27 +0300
Subject: [PATCH 24/28] update fmm interface for sumpy

---
 boxtree/constant_one.py            | 34 ++++++++++++++-----
 boxtree/distributed/__init__.py    |  9 +++--
 boxtree/distributed/calculation.py | 10 +++---
 boxtree/fmm.py                     | 54 +++++++++++++++++++++++-------
 boxtree/pyfmmlib_integration.py    | 42 +++++++++++++++--------
 boxtree/traversal.py               |  5 ++-
 examples/cost_model.py             |  2 +-
 test/test_cost_model.py            |  4 +--
 test/test_distributed.py           |  6 ++--
 test/test_fmm.py                   | 39 +++++++++++----------
 10 files changed, 134 insertions(+), 71 deletions(-)

diff --git a/boxtree/constant_one.py b/boxtree/constant_one.py
index 674efca8..11acf0b4 100644
--- a/boxtree/constant_one.py
+++ b/boxtree/constant_one.py
@@ -27,6 +27,7 @@
 
 import numpy as np
 
+from boxtree.array_context import PyOpenCLArrayContext
 from boxtree.fmm import ExpansionWranglerInterface, TreeIndependentDataForWrangler
 from boxtree.timing import DummyTimingFuture
 
@@ -85,7 +86,9 @@ def local_expansions_view(self, local_exps, level):
     def timing_future(ops):
         return DummyTimingFuture.from_op_count(ops)
 
-    def form_multipoles(self, level_start_source_box_nrs, source_boxes,
+    def form_multipoles(self, actx: PyOpenCLArrayContext,
+            level_start_source_box_nrs,
+            source_boxes,
             src_weight_vecs):
         src_weights, = src_weight_vecs
         mpoles = self.multipole_expansion_zeros()
@@ -98,8 +101,10 @@ def form_multipoles(self, level_start_source_box_nrs, source_boxes,
 
         return mpoles, self.timing_future(ops)
 
-    def coarsen_multipoles(self, level_start_source_parent_box_nrs,
-            source_parent_boxes, mpoles):
+    def coarsen_multipoles(self, actx: PyOpenCLArrayContext,
+            level_start_source_parent_box_nrs,
+            source_parent_boxes,
+            mpoles):
         tree = self.tree
         ops = 0
 
@@ -121,7 +126,8 @@ def coarsen_multipoles(self, level_start_source_parent_box_nrs,
 
         return mpoles, self.timing_future(ops)
 
-    def eval_direct(self, target_boxes, neighbor_sources_starts,
+    def eval_direct(self, actx: PyOpenCLArrayContext,
+            target_boxes, neighbor_sources_starts,
             neighbor_sources_lists, src_weight_vecs):
         src_weights, = src_weight_vecs
         pot = self.output_zeros()
@@ -146,6 +152,7 @@ def eval_direct(self, target_boxes, neighbor_sources_starts,
         return pot, self.timing_future(ops)
 
     def multipole_to_local(self,
+            actx: PyOpenCLArrayContext,
             level_start_target_or_target_parent_box_nrs,
             target_or_target_parent_boxes,
             starts, lists, mpole_exps):
@@ -166,7 +173,9 @@ def multipole_to_local(self,
         return local_exps, self.timing_future(ops)
 
     def eval_multipoles(self,
-            target_boxes_by_source_level, from_sep_smaller_nonsiblings_by_level,
+            actx: PyOpenCLArrayContext,
+            target_boxes_by_source_level,
+            from_sep_smaller_nonsiblings_by_level,
             mpole_exps):
         pot = self.output_zeros()
         ops = 0
@@ -188,8 +197,10 @@ def eval_multipoles(self,
         return pot, self.timing_future(ops)
 
     def form_locals(self,
+            actx: PyOpenCLArrayContext,
             level_start_target_or_target_parent_box_nrs,
-            target_or_target_parent_boxes, starts, lists, src_weight_vecs):
+            target_or_target_parent_boxes,
+            starts, lists, src_weight_vecs):
         src_weights, = src_weight_vecs
         local_exps = self.local_expansion_zeros()
         ops = 0
@@ -211,7 +222,9 @@ def form_locals(self,
 
         return local_exps, self.timing_future(ops)
 
-    def refine_locals(self, level_start_target_or_target_parent_box_nrs,
+    def refine_locals(self,
+            actx: PyOpenCLArrayContext,
+            level_start_target_or_target_parent_box_nrs,
             target_or_target_parent_boxes, local_exps):
         ops = 0
 
@@ -224,7 +237,10 @@ def refine_locals(self, level_start_target_or_target_parent_box_nrs,
 
         return local_exps, self.timing_future(ops)
 
-    def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps):
+    def eval_locals(self,
+            actx: PyOpenCLArrayContext,
+            level_start_target_box_nrs,
+            target_boxes, local_exps):
         pot = self.output_zeros()
         ops = 0
 
@@ -235,7 +251,7 @@ def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps):
 
         return pot, self.timing_future(ops)
 
-    def finalize_potentials(self, potentials, template_ary):
+    def finalize_potentials(self, actx: PyOpenCLArrayContext, potentials):
         return potentials
 
 # }}}
diff --git a/boxtree/distributed/__init__.py b/boxtree/distributed/__init__.py
index 73881352..102a2531 100644
--- a/boxtree/distributed/__init__.py
+++ b/boxtree/distributed/__init__.py
@@ -293,11 +293,14 @@ def __init__(self, array_context: PyOpenCLArrayContext, global_tree,
                 array_context, global_tree, traversal_builder, wrangler_factory,
                 calibration_params, comm)
 
-    def drive_dfmm(self, source_weights, timing_data=None):
-        """Calculate potentials at target points.
-        """
+    def drive_dfmm(self,
+            actx: PyOpenCLArrayContext,
+            source_weights,
+            timing_data=None):
+        """Calculate potentials at target points."""
         from boxtree.fmm import drive_fmm
         return drive_fmm(
+            actx,
             self.wrangler, source_weights,
             timing_data=timing_data,
             global_src_idx_all_ranks=self.src_idx_all_ranks,
diff --git a/boxtree/distributed/calculation.py b/boxtree/distributed/calculation.py
index 22ad296d..d85cc6dc 100644
--- a/boxtree/distributed/calculation.py
+++ b/boxtree/distributed/calculation.py
@@ -72,7 +72,8 @@ def mpi_size(self):
     def is_mpi_root(self):
         return self.mpi_rank == 0
 
-    def distribute_source_weights(self, src_weight_vecs, src_idx_all_ranks):
+    def distribute_source_weights(self,
+            actx: PyOpenCLArrayContext, src_weight_vecs, src_idx_all_ranks):
         if self.is_mpi_root:
             distribute_weight_req = []
             local_src_weight_vecs = np.empty((self.mpi_size,), dtype=object)
@@ -95,7 +96,8 @@ def distribute_source_weights(self, src_weight_vecs, src_idx_all_ranks):
 
         return local_src_weight_vecs
 
-    def gather_potential_results(self, potentials, tgt_idx_all_ranks):
+    def gather_potential_results(self,
+            actx: PyOpenCLArrayContext, potentials, tgt_idx_all_ranks):
         from boxtree.distributed import dtype_to_mpi
         potentials_mpi_type = dtype_to_mpi(potentials.dtype)
         gathered_potentials = None
@@ -256,8 +258,8 @@ def find_boxes_used_by_subrange(
 
         return box_in_subrange
 
-    def communicate_mpoles(self, actx: PyOpenCLArrayContext,
-                           mpole_exps, return_stats=False):
+    def communicate_mpoles(self,
+            actx: PyOpenCLArrayContext, mpole_exps, return_stats=False):
         """Based on Algorithm 3: Reduce and Scatter in Lashuk et al. [1]_.
 
         The main idea is to mimic an allreduce as done on a hypercube network, but to
diff --git a/boxtree/fmm.py b/boxtree/fmm.py
index 6c13290c..760e67bd 100644
--- a/boxtree/fmm.py
+++ b/boxtree/fmm.py
@@ -32,6 +32,7 @@
 
 from pytools import ProcessLogger
 
+from boxtree.array_context import PyOpenCLArrayContext
 from boxtree.traversal import FMMTraversalInfo
 from boxtree.tree import Tree
 
@@ -156,6 +157,7 @@ def local_expansions_view(self, local_exps, level):
 
     @abstractmethod
     def form_multipoles(self,
+            actx: PyOpenCLArrayContext,
             level_start_source_box_nrs, source_boxes,
             src_weight_vecs):
         """Return an expansions array
@@ -168,6 +170,7 @@ def form_multipoles(self,
 
     @abstractmethod
     def coarsen_multipoles(self,
+            actx: PyOpenCLArrayContext,
             level_start_source_parent_box_nrs,
             source_parent_boxes, mpoles):
         """For each box in *source_parent_boxes*,
@@ -180,6 +183,7 @@ def coarsen_multipoles(self,
 
     @abstractmethod
     def eval_direct(self,
+            actx: PyOpenCLArrayContext,
             target_boxes, neighbor_sources_starts,
             neighbor_sources_lists, src_weight_vecs):
         """For each box in *target_boxes*, evaluate the influence of the
@@ -192,6 +196,7 @@ def eval_direct(self,
 
     @abstractmethod
     def multipole_to_local(self,
+            actx: PyOpenCLArrayContext,
             level_start_target_or_target_parent_box_nrs,
             target_or_target_parent_boxes,
             starts, lists, mpole_exps):
@@ -206,6 +211,7 @@ def multipole_to_local(self,
 
     @abstractmethod
     def eval_multipoles(self,
+            actx: PyOpenCLArrayContext,
             target_boxes_by_source_level, from_sep_smaller_by_level, mpole_exps):
         """For a level *i*, each box in *target_boxes_by_source_level[i]*, evaluate
         the multipole expansion in *mpole_exps* in the nearby boxes given in
@@ -219,6 +225,7 @@ def eval_multipoles(self,
 
     @abstractmethod
     def form_locals(self,
+            actx: PyOpenCLArrayContext,
             level_start_target_or_target_parent_box_nrs,
             target_or_target_parent_boxes, starts, lists, src_weight_vecs):
         """For each box in *target_or_target_parent_boxes*, form local
@@ -233,6 +240,7 @@ def form_locals(self,
 
     @abstractmethod
     def refine_locals(self,
+            actx: PyOpenCLArrayContext,
             level_start_target_or_target_parent_box_nrs,
             target_or_target_parent_boxes, local_exps):
         """For each box in *child_boxes*,
@@ -244,6 +252,7 @@ def refine_locals(self,
 
     @abstractmethod
     def eval_locals(self,
+            actx: PyOpenCLArrayContext,
             level_start_target_box_nrs, target_boxes, local_exps):
         """For each box in *target_boxes*, evaluate the local expansion in
         *local_exps* and return a new potential array.
@@ -255,7 +264,7 @@ def eval_locals(self,
     # }}}
 
     @abstractmethod
-    def finalize_potentials(self, potentials, template_ary):
+    def finalize_potentials(self, actx: PyOpenCLArrayContext, potentials):
         """
         Postprocess the reordered potentials. This is where global scaling
         factors could be applied. This is distinct from :meth:`reorder_potentials`
@@ -269,7 +278,9 @@ def finalize_potentials(self, potentials, template_ary):
             type.
         """
 
-    def distribute_source_weights(self, src_weight_vecs, src_idx_all_ranks):
+    def distribute_source_weights(self,
+            actx: PyOpenCLArrayContext,
+            src_weight_vecs, src_idx_all_ranks):
         """Used by the distributed implementation for transferring needed source
         weights from root rank to each worker rank in the communicator.
 
@@ -289,7 +300,9 @@ def distribute_source_weights(self, src_weight_vecs, src_idx_all_ranks):
         """
         return src_weight_vecs
 
-    def gather_potential_results(self, potentials, tgt_idx_all_ranks):
+    def gather_potential_results(self,
+            actx: PyOpenCLArrayContext,
+            potentials, tgt_idx_all_ranks):
         """Used by the distributed implementation for gathering calculated potentials
         from all worker ranks in the communicator to the root rank.
 
@@ -306,7 +319,9 @@ def gather_potential_results(self, potentials, tgt_idx_all_ranks):
         """
         return potentials
 
-    def communicate_mpoles(self, mpole_exps, return_stats=False):  # noqa: B027
+    def communicate_mpoles(self,                # noqa: B027
+            actx: PyOpenCLArrayContext,
+            mpole_exps, return_stats=False):
         """Used by the distributed implementation for forming the complete multipole
         expansions from the partial multipole expansions.
 
@@ -325,9 +340,12 @@ def communicate_mpoles(self, mpole_exps, return_stats=False):  # noqa: B027
 # }}}
 
 
-def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs,
+def drive_fmm(actx: PyOpenCLArrayContext,
+              wrangler: ExpansionWranglerInterface,
+              src_weight_vecs, *,
               timing_data=None,
-              global_src_idx_all_ranks=None, global_tgt_idx_all_ranks=None):
+              global_src_idx_all_ranks=None,
+              global_tgt_idx_all_ranks=None):
     """Top-level driver routine for a fast multipole calculation.
 
     In part, this is intended as a template for custom FMMs, in the sense that
@@ -374,15 +392,17 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs,
     fmm_proc = ProcessLogger(logger, "fmm")
     recorder = TimingRecorder()
 
-    src_weight_vecs = [wrangler.reorder_sources(weight) for
-        weight in src_weight_vecs]
+    src_weight_vecs = [
+            wrangler.reorder_sources(weight) for weight in src_weight_vecs]
 
     src_weight_vecs = wrangler.distribute_source_weights(
-        src_weight_vecs, global_src_idx_all_ranks)
+            actx,
+            src_weight_vecs, global_src_idx_all_ranks)
 
     # {{{ "Step 2.1:" Construct local multipoles
 
     mpole_exps, timing_future = wrangler.form_multipoles(
+            actx,
             traversal.level_start_source_box_nrs,
             traversal.source_boxes,
             src_weight_vecs)
@@ -394,6 +414,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs,
     # {{{ "Step 2.2:" Propagate multipoles upward
 
     mpole_exps, timing_future = wrangler.coarsen_multipoles(
+            actx,
             traversal.level_start_source_parent_box_nrs,
             traversal.source_parent_boxes,
             mpole_exps)
@@ -404,11 +425,12 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs,
 
     # }}}
 
-    wrangler.communicate_mpoles(mpole_exps)
+    wrangler.communicate_mpoles(actx, mpole_exps)
 
     # {{{ "Stage 3:" Direct evaluation from neighbor source boxes ("list 1")
 
     potentials, timing_future = wrangler.eval_direct(
+            actx,
             traversal.target_boxes,
             traversal.neighbor_source_boxes_starts,
             traversal.neighbor_source_boxes_lists,
@@ -423,6 +445,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs,
     # {{{ "Stage 4:" translate separated siblings' ("list 2") mpoles to local
 
     local_exps, timing_future = wrangler.multipole_to_local(
+            actx,
             traversal.level_start_target_or_target_parent_box_nrs,
             traversal.target_or_target_parent_boxes,
             traversal.from_sep_siblings_starts,
@@ -441,6 +464,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs,
     # contribution *out* of the downward-propagating local expansions)
 
     mpole_result, timing_future = wrangler.eval_multipoles(
+            actx,
             traversal.target_boxes_sep_smaller_by_source_level,
             traversal.from_sep_smaller_by_level,
             mpole_exps)
@@ -456,6 +480,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs,
                 "('list 3 close')")
 
         direct_result, timing_future = wrangler.eval_direct(
+                actx,
                 traversal.target_boxes,
                 traversal.from_sep_close_smaller_starts,
                 traversal.from_sep_close_smaller_lists,
@@ -470,6 +495,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs,
     # {{{ "Stage 6:" form locals for separated bigger source boxes ("list 4")
 
     local_result, timing_future = wrangler.form_locals(
+            actx,
             traversal.level_start_target_or_target_parent_box_nrs,
             traversal.target_or_target_parent_boxes,
             traversal.from_sep_bigger_starts,
@@ -482,6 +508,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs,
 
     if traversal.from_sep_close_bigger_starts is not None:
         direct_result, timing_future = wrangler.eval_direct(
+                actx,
                 traversal.target_boxes,
                 traversal.from_sep_close_bigger_starts,
                 traversal.from_sep_close_bigger_lists,
@@ -496,6 +523,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs,
     # {{{ "Stage 7:" propagate local_exps downward
 
     local_exps, timing_future = wrangler.refine_locals(
+            actx,
             traversal.level_start_target_or_target_parent_box_nrs,
             traversal.target_or_target_parent_boxes,
             local_exps)
@@ -507,6 +535,7 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs,
     # {{{ "Stage 8:" evaluate locals
 
     local_result, timing_future = wrangler.eval_locals(
+            actx,
             traversal.level_start_target_box_nrs,
             traversal.target_boxes,
             local_exps)
@@ -518,11 +547,12 @@ def drive_fmm(wrangler: ExpansionWranglerInterface, src_weight_vecs,
     # }}}
 
     potentials = wrangler.gather_potential_results(
-                    potentials, global_tgt_idx_all_ranks)
+            actx,
+            potentials, global_tgt_idx_all_ranks)
 
     result = wrangler.reorder_potentials(potentials)
 
-    result = wrangler.finalize_potentials(result, template_ary=src_weight_vecs[0])
+    result = wrangler.finalize_potentials(actx, result)
 
     fmm_proc.done()
 
diff --git a/boxtree/pyfmmlib_integration.py b/boxtree/pyfmmlib_integration.py
index 869805b6..4f0ce75c 100644
--- a/boxtree/pyfmmlib_integration.py
+++ b/boxtree/pyfmmlib_integration.py
@@ -676,7 +676,9 @@ def reorder_potentials(self, potentials):
 
     @log_process(logger)
     @return_timing_data
-    def form_multipoles(self, level_start_source_box_nrs, source_boxes,
+    def form_multipoles(self, actx: PyOpenCLArrayContext,
+            level_start_source_box_nrs,
+            source_boxes,
             src_weight_vecs):
         src_weights, = src_weight_vecs
         formmp = self.tree_indep.get_routine(
@@ -719,8 +721,10 @@ def form_multipoles(self, level_start_source_box_nrs, source_boxes,
 
     @log_process(logger)
     @return_timing_data
-    def coarsen_multipoles(self, level_start_source_parent_box_nrs,
-            source_parent_boxes, mpoles):
+    def coarsen_multipoles(self, actx: PyOpenCLArrayContext,
+            level_start_source_parent_box_nrs,
+            source_parent_boxes,
+            mpoles):
         tree = self.tree
 
         mpmp = self.tree_indep.get_translation_routine(self, "%ddmpmp")
@@ -775,8 +779,11 @@ def coarsen_multipoles(self, level_start_source_parent_box_nrs,
 
     @log_process(logger)
     @return_timing_data
-    def eval_direct(self, target_boxes, neighbor_sources_starts,
-            neighbor_sources_lists, src_weight_vecs):
+    def eval_direct(self, actx: PyOpenCLArrayContext,
+            target_boxes,
+            neighbor_sources_starts,
+            neighbor_sources_lists,
+            src_weight_vecs):
         src_weights, = src_weight_vecs
         output = self.output_zeros()
 
@@ -819,7 +826,7 @@ def eval_direct(self, target_boxes, neighbor_sources_starts,
 
     @log_process(logger)
     @return_timing_data
-    def multipole_to_local(self,
+    def multipole_to_local(self, actx: PyOpenCLArrayContext,
             level_start_target_or_target_parent_box_nrs,
             target_or_target_parent_boxes,
             starts, lists, mpole_exps):
@@ -934,8 +941,9 @@ def multipole_to_local(self,
 
     @log_process(logger)
     @return_timing_data
-    def eval_multipoles(self,
-            target_boxes_by_source_level, sep_smaller_nonsiblings_by_level,
+    def eval_multipoles(self, actx: PyOpenCLArrayContext,
+            target_boxes_by_source_level,
+            sep_smaller_nonsiblings_by_level,
             mpole_exps):
         output = self.output_zeros()
 
@@ -977,9 +985,10 @@ def eval_multipoles(self,
 
     @log_process(logger)
     @return_timing_data
-    def form_locals(self,
+    def form_locals(self, actx: PyOpenCLArrayContext,
             level_start_target_or_target_parent_box_nrs,
-            target_or_target_parent_boxes, starts, lists, src_weight_vecs):
+            target_or_target_parent_boxes,
+            starts, lists, src_weight_vecs):
         src_weights, = src_weight_vecs
         local_exps = self.local_expansion_zeros()
 
@@ -1057,8 +1066,10 @@ def form_locals(self,
 
     @log_process(logger)
     @return_timing_data
-    def refine_locals(self, level_start_target_or_target_parent_box_nrs,
-            target_or_target_parent_boxes, local_exps):
+    def refine_locals(self, actx: PyOpenCLArrayContext,
+            level_start_target_or_target_parent_box_nrs,
+            target_or_target_parent_boxes,
+            local_exps):
 
         locloc = self.tree_indep.get_translation_routine(self, "%ddlocloc")
 
@@ -1104,7 +1115,10 @@ def refine_locals(self, level_start_target_or_target_parent_box_nrs,
 
     @log_process(logger)
     @return_timing_data
-    def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps):
+    def eval_locals(self, actx: PyOpenCLArrayContext,
+            level_start_target_box_nrs,
+            target_boxes,
+            local_exps):
         output = self.output_zeros()
         taeval = self.tree_indep.get_expn_eval_routine("ta")
 
@@ -1139,7 +1153,7 @@ def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps):
         return output
 
     @log_process(logger)
-    def finalize_potentials(self, potential, template_ary):
+    def finalize_potentials(self, actx: PyOpenCLArrayContext, potential):
         if self.tree_indep.eqn_letter == "l" and self.dim == 2:
             scale_factor = -1/(2*np.pi)
         elif self.tree_indep.eqn_letter == "h" and self.dim == 2:
diff --git a/boxtree/traversal.py b/boxtree/traversal.py
index f1fd5ecf..5bec4015 100644
--- a/boxtree/traversal.py
+++ b/boxtree/traversal.py
@@ -1706,7 +1706,8 @@ def get_kernel_info(self, *, dimensions, particle_id_dtype, box_id_dtype,
             sources_are_targets, sources_have_extent, targets_have_extent,
             extent_norm,
             source_boxes_has_mask,
-            source_parent_boxes_has_mask):
+            source_parent_boxes_has_mask,
+            debug=False):
 
         # {{{ process from_sep_smaller_crit
 
@@ -1748,8 +1749,6 @@ def get_kernel_info(self, *, dimensions, particle_id_dtype, box_id_dtype,
 
         # }}}
 
-        debug = False
-
         from pyopencl.tools import dtype_to_ctype
 
         from boxtree.tree import box_flags_enum
diff --git a/examples/cost_model.py b/examples/cost_model.py
index 7b783891..8328672f 100644
--- a/examples/cost_model.py
+++ b/examples/cost_model.py
@@ -79,7 +79,7 @@ def fmm_level_to_order(tree, ilevel):
         timing_data = {}
         from boxtree.fmm import drive_fmm
         src_weights = rng.random(size=tree.nsources, dtype=tree.coord_dtype)
-        drive_fmm(wrangler, (src_weights,), timing_data=timing_data)
+        drive_fmm(actx, wrangler, (src_weights,), timing_data=timing_data)
 
         timing_results.append(timing_data)
 
diff --git a/test/test_cost_model.py b/test/test_cost_model.py
index 7c382a1a..f3b53980 100644
--- a/test/test_cost_model.py
+++ b/test/test_cost_model.py
@@ -439,7 +439,7 @@ def fmm_level_to_order(tree, ilevel):
         timing_data = {}
         from boxtree.fmm import drive_fmm
         src_weights = rng.random(size=tree.nsources, dtype=tree.coord_dtype)
-        drive_fmm(wrangler, (src_weights,), timing_data=timing_data)
+        drive_fmm(actx, wrangler, (src_weights,), timing_data=timing_data)
 
         timing_results.append(timing_data)
 
@@ -566,7 +566,7 @@ def test_cost_model_op_counts_agree_with_constantone_wrangler(
     timing_data = {}
     from boxtree.fmm import drive_fmm
     src_weights = rng.random(size=tree.nsources, dtype=tree.coord_dtype)
-    drive_fmm(wrangler, (src_weights,), timing_data=timing_data)
+    drive_fmm(actx, wrangler, (src_weights,), timing_data=timing_data)
 
     cost_model = FMMCostModel(
         translation_cost_model_factory=OpCountingTranslationCostModel
diff --git a/test/test_distributed.py b/test/test_distributed.py
index 41e5ac6e..45da4bfc 100644
--- a/test/test_distributed.py
+++ b/test/test_distributed.py
@@ -120,7 +120,7 @@ def fmm_level_to_order(tree, level):
 
             # Compute FMM with one MPI rank
             from boxtree.fmm import drive_fmm
-            pot_fmm = drive_fmm(wrangler, [sources_weights]) * 2 * np.pi
+            pot_fmm = drive_fmm(actx, wrangler, [sources_weights]) * 2 * np.pi
 
         # Compute FMM using the distributed implementation
 
@@ -140,7 +140,7 @@ def wrangler_factory(local_traversal, global_traversal):
 
         timing_data = {}
         pot_dfmm = distributed_fmm_info.drive_dfmm(
-                    [sources_weights], timing_data=timing_data)
+            actx, [sources_weights], timing_data=timing_data)
         assert timing_data
 
     # Uncomment the following section to print the time taken of each stage
@@ -272,7 +272,7 @@ def wrangler_factory(local_traversal, global_traversal):
         distributed_fmm_info = DistributedFMMRunner(
             actx, tree, tg, wrangler_factory, comm=MPI.COMM_WORLD)
 
-        pot_dfmm = distributed_fmm_info.drive_dfmm([sources_weights])
+        pot_dfmm = distributed_fmm_info.drive_dfmm(actx, [sources_weights])
 
     if rank == 0:
         assert (np.all(pot_dfmm == nsources))
diff --git a/test/test_fmm.py b/test/test_fmm.py
index 659ae64e..460e56fb 100644
--- a/test/test_fmm.py
+++ b/test/test_fmm.py
@@ -52,7 +52,8 @@
 
 # {{{ ref fmmlib pot computation
 
-def get_fmmlib_ref_pot(wrangler, weights, sources_host, targets_host,
+def get_fmmlib_ref_pot(
+        actx, wrangler, weights, sources_host, targets_host,
         helmholtz_k, dipole_vec=None):
     dims = sources_host.shape[0]
     eqn_letter = "h" if helmholtz_k else "l"
@@ -85,10 +86,10 @@ def get_fmmlib_ref_pot(wrangler, weights, sources_host, targets_host,
         kwargs["zk"] = helmholtz_k
 
     return wrangler.finalize_potentials(
+            actx,
             fmmlib_routine(
                 sources=sources_host, targets=targets_host,
-                **kwargs)[0],
-            template_ary=weights)
+                **kwargs)[0])
 
 # }}}
 
@@ -275,7 +276,7 @@ def test_fmm_completeness(actx_factory, dims, nsources_req, ntargets_req,
                 == weights)
 
     from boxtree.fmm import drive_fmm
-    pot = drive_fmm(wrangler, (weights,))
+    pot = drive_fmm(actx, wrangler, (weights,))
 
     if filter_kind:
         pot = pot[actx.to_numpy(flags) > 0]
@@ -293,7 +294,7 @@ def test_fmm_completeness(actx_factory, dims, nsources_req, ntargets_req,
         for i in range(nsources):
             unit_vec = np.zeros(nsources, dtype=dtype)
             unit_vec[i] = 1
-            mat[:, i] = drive_fmm(host_trav, wrangler, (unit_vec,))
+            mat[:, i] = drive_fmm(actx, wrangler, (unit_vec,))
             pb.progress()
         pb.finished()
 
@@ -407,8 +408,8 @@ def test_pyfmmlib_fmm(actx_factory, dims, use_dipoles, helmholtz_k):
             p_normal(actx, ntargets, dims, dtype, seed=18)
             + np.array([2, 0, 0])[:dims])
 
-    sources_host = particle_array_to_host(actx, sources)
-    targets_host = particle_array_to_host(actx, targets)
+    sources_host = np.stack(actx.to_numpy(sources))
+    targets_host = np.stack(actx.to_numpy(targets))
 
     from boxtree import TreeBuilder
     tb = TreeBuilder(actx)
@@ -459,7 +460,7 @@ def fmm_level_to_order(tree, lev):
     from boxtree.fmm import drive_fmm
 
     timing_data = {}
-    pot = drive_fmm(wrangler, (weights,), timing_data=timing_data)
+    pot = drive_fmm(actx, wrangler, (weights,), timing_data=timing_data)
     print(timing_data)
     assert timing_data
 
@@ -467,8 +468,8 @@ def fmm_level_to_order(tree, lev):
 
     logger.info("computing direct (reference) result")
 
-    ref_pot = get_fmmlib_ref_pot(wrangler, weights, sources_host.T,
-            targets_host.T, helmholtz_k, dipole_vec)
+    ref_pot = get_fmmlib_ref_pot(actx, wrangler, weights, sources_host,
+            targets_host, helmholtz_k, dipole_vec)
 
     rel_err = la.norm(pot - ref_pot, np.inf) / la.norm(ref_pot, np.inf)
     logger.info("relative l2 error vs fmmlib direct: %g", rel_err)
@@ -505,11 +506,9 @@ def fmm_level_to_order(tree, lev):
 
         if use_dipoles:
             knl = DirectionalSourceDerivative(knl)
-            sumpy_extra_kwargs["src_derivative_dir"] = dipole_vec
+            sumpy_extra_kwargs["src_derivative_dir"] = actx.from_numpy(dipole_vec)
 
-        p2p = P2P(actx.context,
-                [knl],
-                exclude_self=False)
+        p2p = P2P(target_kernels=[knl], exclude_self=False)
 
         result, = p2p(
                 actx,
@@ -592,14 +591,14 @@ def fmm_level_to_order(tree, lev):
             rotation_data=FMMLibRotationData(actx, trav))
 
     from boxtree.fmm import drive_fmm
-    pot = drive_fmm(wrangler, (weights,))
+    pot = drive_fmm(actx, wrangler, (weights,))
     assert not np.isnan(pot).any()
 
     # {{{ ref fmmlib computation
 
     logger.info("computing direct (reference) result")
 
-    ref_pot = get_fmmlib_ref_pot(wrangler, weights, sources, targets,
+    ref_pot = get_fmmlib_ref_pot(actx, wrangler, weights, sources, targets,
             helmholtz_k)
 
     rel_err = la.norm(pot - ref_pot, np.inf) / la.norm(ref_pot, np.inf)
@@ -661,7 +660,7 @@ def test_interaction_list_particle_count_thresholding(actx_factory, enable_exten
     tree_indep = ConstantOneTreeIndependentDataForWrangler()
     wrangler = ConstantOneExpansionWrangler(tree_indep, host_trav)
 
-    pot = drive_fmm(wrangler, (weights,))
+    pot = drive_fmm(actx, wrangler, (weights,))
 
     assert np.all(pot == weights_sum)
 
@@ -715,7 +714,7 @@ def test_fmm_float32(actx_factory, enable_extents):
     tree_indep = ConstantOneTreeIndependentDataForWrangler()
     wrangler = ConstantOneExpansionWrangler(tree_indep, host_trav)
 
-    pot = drive_fmm(wrangler, (weights,))
+    pot = drive_fmm(actx, wrangler, (weights,))
 
     assert np.all(pot == weights_sum)
 
@@ -791,11 +790,11 @@ def fmm_level_to_order(tree, lev):
 
     baseline_timing_data = {}
     baseline_pot = drive_fmm(
-            baseline_wrangler, (weights,), timing_data=baseline_timing_data)
+        actx, baseline_wrangler, (weights,), timing_data=baseline_timing_data)
 
     optimized_timing_data = {}
     optimized_pot = drive_fmm(
-            optimized_wrangler, (weights,), timing_data=optimized_timing_data)
+        actx, optimized_wrangler, (weights,), timing_data=optimized_timing_data)
 
     baseline_time = baseline_timing_data["multipole_to_local"]["process_elapsed"]
     if baseline_time is not None:

From 4024531a397a8254197adefa18047688a69b9e09 Mon Sep 17 00:00:00 2001
From: Alexandru Fikl <alexfikl@gmail.com>
Date: Fri, 23 Sep 2022 20:38:42 +0300
Subject: [PATCH 25/28] rip out timing collection

---
 boxtree/constant_one.py         |  41 ++------
 boxtree/distributed/__init__.py |   6 +-
 boxtree/fmm.py                  |  93 ++++++-----------
 boxtree/pyfmmlib_integration.py |  15 +--
 boxtree/timing.py               | 171 --------------------------------
 doc/misc.rst                    |  19 +++-
 doc/tools.rst                   |   2 -
 examples/cost_model.py          |   8 +-
 test/test_cost_model.py         |  14 +--
 test/test_distributed.py        |  16 +--
 test/test_fmm.py                |  22 +---
 11 files changed, 69 insertions(+), 338 deletions(-)
 delete mode 100644 boxtree/timing.py

diff --git a/boxtree/constant_one.py b/boxtree/constant_one.py
index 11acf0b4..a771e44e 100644
--- a/boxtree/constant_one.py
+++ b/boxtree/constant_one.py
@@ -29,7 +29,6 @@
 
 from boxtree.array_context import PyOpenCLArrayContext
 from boxtree.fmm import ExpansionWranglerInterface, TreeIndependentDataForWrangler
-from boxtree.timing import DummyTimingFuture
 
 
 # {{{ constant one wrangler
@@ -45,9 +44,6 @@ class ConstantOneExpansionWrangler(ExpansionWranglerInterface):
     """This implements the 'analytical routines' for a Green's function that is
     constant 1 everywhere. For 'charges' of 'ones', this should get every particle
     a copy of the particle count.
-
-    Timing results returned by this wrangler contain the field *ops_elapsed*,
-    which counts approximately the number of floating-point operations required.
     """
 
     def _get_source_slice(self, ibox):
@@ -82,31 +78,24 @@ def local_expansions_view(self, local_exps, level):
         # FIXME
         raise NotImplementedError
 
-    @staticmethod
-    def timing_future(ops):
-        return DummyTimingFuture.from_op_count(ops)
-
     def form_multipoles(self, actx: PyOpenCLArrayContext,
             level_start_source_box_nrs,
             source_boxes,
             src_weight_vecs):
         src_weights, = src_weight_vecs
         mpoles = self.multipole_expansion_zeros()
-        ops = 0
 
         for ibox in source_boxes:
             pslice = self._get_source_slice(ibox)
             mpoles[ibox] += np.sum(src_weights[pslice])
-            ops += src_weights[pslice].size
 
-        return mpoles, self.timing_future(ops)
+        return mpoles
 
     def coarsen_multipoles(self, actx: PyOpenCLArrayContext,
             level_start_source_parent_box_nrs,
             source_parent_boxes,
             mpoles):
         tree = self.tree
-        ops = 0
 
         # nlevels-1 is the last valid level index
         # nlevels-2 is the last valid level that could have children
@@ -122,16 +111,14 @@ def coarsen_multipoles(self, actx: PyOpenCLArrayContext,
                 for child in tree.box_child_ids[:, ibox]:
                     if child:
                         mpoles[ibox] += mpoles[child]
-                        ops += 1
 
-        return mpoles, self.timing_future(ops)
+        return mpoles
 
     def eval_direct(self, actx: PyOpenCLArrayContext,
             target_boxes, neighbor_sources_starts,
             neighbor_sources_lists, src_weight_vecs):
         src_weights, = src_weight_vecs
         pot = self.output_zeros()
-        ops = 0
 
         for itgt_box, tgt_ibox in enumerate(target_boxes):
             tgt_pslice = self._get_target_slice(tgt_ibox)
@@ -147,9 +134,8 @@ def eval_direct(self, actx: PyOpenCLArrayContext,
                 src_sum += np.sum(src_weights[src_pslice])
 
             pot[tgt_pslice] = src_sum
-            ops += pot[tgt_pslice].size * nsrcs
 
-        return pot, self.timing_future(ops)
+        return pot
 
     def multipole_to_local(self,
             actx: PyOpenCLArrayContext,
@@ -157,7 +143,6 @@ def multipole_to_local(self,
             target_or_target_parent_boxes,
             starts, lists, mpole_exps):
         local_exps = self.local_expansion_zeros()
-        ops = 0
 
         for itgt_box, tgt_ibox in enumerate(target_or_target_parent_boxes):
             start, end = starts[itgt_box:itgt_box+2]
@@ -166,11 +151,10 @@ def multipole_to_local(self,
             # print tgt_ibox, "<-", lists[start:end]
             for src_ibox in lists[start:end]:
                 contrib += mpole_exps[src_ibox]
-                ops += 1
 
             local_exps[tgt_ibox] += contrib
 
-        return local_exps, self.timing_future(ops)
+        return local_exps
 
     def eval_multipoles(self,
             actx: PyOpenCLArrayContext,
@@ -178,7 +162,6 @@ def eval_multipoles(self,
             from_sep_smaller_nonsiblings_by_level,
             mpole_exps):
         pot = self.output_zeros()
-        ops = 0
 
         for level, ssn in enumerate(from_sep_smaller_nonsiblings_by_level):
             for itgt_box, tgt_ibox in \
@@ -192,9 +175,8 @@ def eval_multipoles(self,
                     contrib += mpole_exps[src_ibox]
 
                 pot[tgt_pslice] += contrib
-                ops += pot[tgt_pslice].size * (end - start)
 
-        return pot, self.timing_future(ops)
+        return pot
 
     def form_locals(self,
             actx: PyOpenCLArrayContext,
@@ -203,7 +185,6 @@ def form_locals(self,
             starts, lists, src_weight_vecs):
         src_weights, = src_weight_vecs
         local_exps = self.local_expansion_zeros()
-        ops = 0
 
         for itgt_box, tgt_ibox in enumerate(target_or_target_parent_boxes):
             start, end = starts[itgt_box:itgt_box+2]
@@ -218,38 +199,32 @@ def form_locals(self,
                 contrib += np.sum(src_weights[src_pslice])
 
             local_exps[tgt_ibox] += contrib
-            ops += nsrcs
 
-        return local_exps, self.timing_future(ops)
+        return local_exps
 
     def refine_locals(self,
             actx: PyOpenCLArrayContext,
             level_start_target_or_target_parent_box_nrs,
             target_or_target_parent_boxes, local_exps):
-        ops = 0
-
         for target_lev in range(1, self.tree.nlevels):
             start, stop = level_start_target_or_target_parent_box_nrs[
                     target_lev:target_lev+2]
             for ibox in target_or_target_parent_boxes[start:stop]:
                 local_exps[ibox] += local_exps[self.tree.box_parent_ids[ibox]]
-                ops += 1
 
-        return local_exps, self.timing_future(ops)
+        return local_exps
 
     def eval_locals(self,
             actx: PyOpenCLArrayContext,
             level_start_target_box_nrs,
             target_boxes, local_exps):
         pot = self.output_zeros()
-        ops = 0
 
         for ibox in target_boxes:
             tgt_pslice = self._get_target_slice(ibox)
             pot[tgt_pslice] += local_exps[ibox]
-            ops += pot[tgt_pslice].size
 
-        return pot, self.timing_future(ops)
+        return pot
 
     def finalize_potentials(self, actx: PyOpenCLArrayContext, potentials):
         return potentials
diff --git a/boxtree/distributed/__init__.py b/boxtree/distributed/__init__.py
index 102a2531..7974ab58 100644
--- a/boxtree/distributed/__init__.py
+++ b/boxtree/distributed/__init__.py
@@ -293,16 +293,12 @@ def __init__(self, array_context: PyOpenCLArrayContext, global_tree,
                 array_context, global_tree, traversal_builder, wrangler_factory,
                 calibration_params, comm)
 
-    def drive_dfmm(self,
-            actx: PyOpenCLArrayContext,
-            source_weights,
-            timing_data=None):
+    def drive_dfmm(self, actx: PyOpenCLArrayContext, source_weights):
         """Calculate potentials at target points."""
         from boxtree.fmm import drive_fmm
         return drive_fmm(
             actx,
             self.wrangler, source_weights,
-            timing_data=timing_data,
             global_src_idx_all_ranks=self.src_idx_all_ranks,
             global_tgt_idx_all_ranks=self.tgt_idx_all_ranks)
 
diff --git a/boxtree/fmm.py b/boxtree/fmm.py
index 760e67bd..3ac604c5 100644
--- a/boxtree/fmm.py
+++ b/boxtree/fmm.py
@@ -74,13 +74,15 @@ class ExpansionWranglerInterface(ABC):
         :class:`TreeIndependentDataForWrangler` exists to hold data that
         is more broadly reusable.
 
-    Functions that support returning timing data return a value supporting the
-    :class:`~boxtree.timing.TimingFuture` interface.
-
     .. versionchanged:: 2018.1
 
         Changed (a subset of) functions to return timing data.
 
+    .. versionchanged:: 2022.1
+
+        Functions no longer return timing data; timing should instead be
+        handled by the :class:`~arraycontext.ArrayContext`.
+
     .. attribute:: tree_indep
 
         An instance of (a typically wrangler-dependent subclass of)
@@ -160,12 +162,10 @@ def form_multipoles(self,
             actx: PyOpenCLArrayContext,
             level_start_source_box_nrs, source_boxes,
             src_weight_vecs):
-        """Return an expansions array
-        containing multipole expansions in *source_boxes* due to sources
-        with *src_weight_vecs*.
-        All other expansions must be zero.
-
-        :return: A pair (*mpoles*, *timing_future*).
+        """
+        :returns: an expansions array containing multipole expansions in
+            *source_boxes* due to sources with *src_weight_vecs*.
+            All other expansions must be zero.
         """
 
     @abstractmethod
@@ -173,12 +173,11 @@ def coarsen_multipoles(self,
             actx: PyOpenCLArrayContext,
             level_start_source_parent_box_nrs,
             source_parent_boxes, mpoles):
-        """For each box in *source_parent_boxes*,
-        gather (and translate) the box's children's multipole expansions in
-        *mpole* and add the resulting expansion into the box's multipole
-        expansion in *mpole*.
+        """For each box in *source_parent_boxes*, gather (and translate) the
+        box's children's multipole expansions in *mpoles* and add the
+        resulting expansion into the box's multipole expansion in *mpoles*.
 
-        :returns: A pair (*mpoles*, *timing_future*).
+        :returns: the updated *mpoles*.
         """
 
     @abstractmethod
@@ -190,8 +189,7 @@ def eval_direct(self,
         neighbor sources due to *src_weight_vecs*, which use :ref:`csr` and are
         indexed like *target_boxes*.
 
-        :returns: A pair (*pot*, *timing_future*), where *pot* is a
-            a new potential array.
+        :returns: a new potential array.
         """
 
     @abstractmethod
@@ -205,8 +203,7 @@ def multipole_to_local(self,
         array of local expansions.  *starts* and *lists* use :ref:`csr`, and
         *starts* is indexed like *target_or_target_parent_boxes*.
 
-        :returns: A pair (*pot*, *timing_future*) where *pot* is
-            a new (local) expansion array.
+        :returns: a new (local) expansion array.
         """
 
     @abstractmethod
@@ -219,8 +216,7 @@ def eval_multipoles(self,
         *starts* and *lists* in *from_sep_smaller_by_level[i]* use :ref:`csr`
         and *starts* is indexed like *target_boxes_by_source_level[i]*.
 
-        :returns: A pair (*pot*, *timing_future*) where *pot* is a new potential
-            array.
+        :returns: a new potential array.
         """
 
     @abstractmethod
@@ -234,8 +230,7 @@ def form_locals(self,
         use :ref:`csr` and *starts* is indexed like
         *target_or_target_parent_boxes*.
 
-        :returns: A pair (*pot*, *timing_future*) where *pot* is a new
-            local expansion array.
+        :returns: a new local expansion array.
         """
 
     @abstractmethod
@@ -247,7 +242,7 @@ def refine_locals(self,
         translate the box's parent's local expansion in *local_exps* and add
         the resulting expansion into the box's local expansion in *local_exps*.
 
-        :returns: A pair (*local_exps*, *timing_future*).
+        :returns: an updated local expansion array *local_exps*.
         """
 
     @abstractmethod
@@ -257,8 +252,7 @@ def eval_locals(self,
         """For each box in *target_boxes*, evaluate the local expansion in
         *local_exps* and return a new potential array.
 
-        :returns: A pair (*pot*, *timing_future*) where *pot* is a new potential
-            array.
+        :returns: a new potential array.
         """
 
     # }}}
@@ -343,7 +337,6 @@ def communicate_mpoles(self,                # noqa: B027
 def drive_fmm(actx: PyOpenCLArrayContext,
               wrangler: ExpansionWranglerInterface,
               src_weight_vecs, *,
-              timing_data=None,
               global_src_idx_all_ranks=None,
               global_tgt_idx_all_ranks=None):
     """Top-level driver routine for a fast multipole calculation.
@@ -364,9 +357,6 @@ def drive_fmm(actx: PyOpenCLArrayContext,
         Passed unmodified to *expansion_wrangler*. For distributed
         implementation, this argument is only significant on the root rank, but
         worker ranks still need to supply a dummy vector.
-    :arg timing_data: Either *None*, or a :class:`dict` that is populated with
-        timing information for the stages of the algorithm (in the form of
-        :class:`~boxtree.timing.TimingResult`), if such information is available.
     :arg global_src_idx_all_ranks: Only used in the distributed implementation. A
         :class:`list` of length ``nranks``, where the i-th entry is a
         :class:`numpy.ndarray` representing the global indices of sources in the
@@ -388,9 +378,7 @@ def drive_fmm(actx: PyOpenCLArrayContext,
     # Interface guidelines: Attributes of the tree are assumed to be known
     # to the expansion wrangler and should not be passed.
 
-    from boxtree.timing import TimingRecorder
     fmm_proc = ProcessLogger(logger, "fmm")
-    recorder = TimingRecorder()
 
     src_weight_vecs = [
             wrangler.reorder_sources(weight) for weight in src_weight_vecs]
@@ -401,26 +389,22 @@ def drive_fmm(actx: PyOpenCLArrayContext,
 
     # {{{ "Step 2.1:" Construct local multipoles
 
-    mpole_exps, timing_future = wrangler.form_multipoles(
+    mpole_exps = wrangler.form_multipoles(
             actx,
             traversal.level_start_source_box_nrs,
             traversal.source_boxes,
             src_weight_vecs)
 
-    recorder.add("form_multipoles", timing_future)
-
     # }}}
 
     # {{{ "Step 2.2:" Propagate multipoles upward
 
-    mpole_exps, timing_future = wrangler.coarsen_multipoles(
+    mpole_exps = wrangler.coarsen_multipoles(
             actx,
             traversal.level_start_source_parent_box_nrs,
             traversal.source_parent_boxes,
             mpole_exps)
 
-    recorder.add("coarsen_multipoles", timing_future)
-
     # mpole_exps is called Phi in [1]
 
     # }}}
@@ -429,22 +413,20 @@ def drive_fmm(actx: PyOpenCLArrayContext,
 
     # {{{ "Stage 3:" Direct evaluation from neighbor source boxes ("list 1")
 
-    potentials, timing_future = wrangler.eval_direct(
+    potentials = wrangler.eval_direct(
             actx,
             traversal.target_boxes,
             traversal.neighbor_source_boxes_starts,
             traversal.neighbor_source_boxes_lists,
             src_weight_vecs)
 
-    recorder.add("eval_direct", timing_future)
-
     # these potentials are called alpha in [1]
 
     # }}}
 
     # {{{ "Stage 4:" translate separated siblings' ("list 2") mpoles to local
 
-    local_exps, timing_future = wrangler.multipole_to_local(
+    local_exps = wrangler.multipole_to_local(
             actx,
             traversal.level_start_target_or_target_parent_box_nrs,
             traversal.target_or_target_parent_boxes,
@@ -452,8 +434,6 @@ def drive_fmm(actx: PyOpenCLArrayContext,
             traversal.from_sep_siblings_lists,
             mpole_exps)
 
-    recorder.add("multipole_to_local", timing_future)
-
     # local_exps represents both Gamma and Delta in [1]
 
     # }}}
@@ -463,14 +443,12 @@ def drive_fmm(actx: PyOpenCLArrayContext,
     # (the point of aiming this stage at particles is specifically to keep its
     # contribution *out* of the downward-propagating local expansions)
 
-    mpole_result, timing_future = wrangler.eval_multipoles(
+    mpole_result = wrangler.eval_multipoles(
             actx,
             traversal.target_boxes_sep_smaller_by_source_level,
             traversal.from_sep_smaller_by_level,
             mpole_exps)
 
-    recorder.add("eval_multipoles", timing_future)
-
     potentials = potentials + mpole_result
 
     # these potentials are called beta in [1]
@@ -479,22 +457,20 @@ def drive_fmm(actx: PyOpenCLArrayContext,
         logger.debug("evaluate separated close smaller interactions directly "
                 "('list 3 close')")
 
-        direct_result, timing_future = wrangler.eval_direct(
+        direct_result = wrangler.eval_direct(
                 actx,
                 traversal.target_boxes,
                 traversal.from_sep_close_smaller_starts,
                 traversal.from_sep_close_smaller_lists,
                 src_weight_vecs)
 
-        recorder.add("eval_direct", timing_future)
-
         potentials = potentials + direct_result
 
     # }}}
 
     # {{{ "Stage 6:" form locals for separated bigger source boxes ("list 4")
 
-    local_result, timing_future = wrangler.form_locals(
+    local_result = wrangler.form_locals(
             actx,
             traversal.level_start_target_or_target_parent_box_nrs,
             traversal.target_or_target_parent_boxes,
@@ -502,46 +478,38 @@ def drive_fmm(actx: PyOpenCLArrayContext,
             traversal.from_sep_bigger_lists,
             src_weight_vecs)
 
-    recorder.add("form_locals", timing_future)
-
     local_exps = local_exps + local_result
 
     if traversal.from_sep_close_bigger_starts is not None:
-        direct_result, timing_future = wrangler.eval_direct(
+        direct_result = wrangler.eval_direct(
                 actx,
                 traversal.target_boxes,
                 traversal.from_sep_close_bigger_starts,
                 traversal.from_sep_close_bigger_lists,
                 src_weight_vecs)
 
-        recorder.add("eval_direct", timing_future)
-
         potentials = potentials + direct_result
 
     # }}}
 
     # {{{ "Stage 7:" propagate local_exps downward
 
-    local_exps, timing_future = wrangler.refine_locals(
+    local_exps = wrangler.refine_locals(
             actx,
             traversal.level_start_target_or_target_parent_box_nrs,
             traversal.target_or_target_parent_boxes,
             local_exps)
 
-    recorder.add("refine_locals", timing_future)
-
     # }}}
 
     # {{{ "Stage 8:" evaluate locals
 
-    local_result, timing_future = wrangler.eval_locals(
+    local_result = wrangler.eval_locals(
             actx,
             traversal.level_start_target_box_nrs,
             traversal.target_boxes,
             local_exps)
 
-    recorder.add("eval_locals", timing_future)
-
     potentials = potentials + local_result
 
     # }}}
@@ -556,9 +524,6 @@ def drive_fmm(actx: PyOpenCLArrayContext,
 
     fmm_proc.done()
 
-    if timing_data is not None:
-        timing_data.update(recorder.summarize())
-
     return result
 
 
diff --git a/boxtree/pyfmmlib_integration.py b/boxtree/pyfmmlib_integration.py
index 4f0ce75c..a9764a7f 100644
--- a/boxtree/pyfmmlib_integration.py
+++ b/boxtree/pyfmmlib_integration.py
@@ -44,7 +44,6 @@
 
 from boxtree.array_context import PyOpenCLArrayContext
 from boxtree.fmm import ExpansionWranglerInterface, TreeIndependentDataForWrangler
-from boxtree.timing import return_timing_data
 
 
 logger = logging.getLogger(__name__)
@@ -271,11 +270,7 @@ def wrapper(*args, **kwargs):
 
 class FMMLibExpansionWrangler(ExpansionWranglerInterface):
     """Implements the :class:`boxtree.fmm.ExpansionWranglerInterface`
-    by using pyfmmlib.
-
-    Timing results returned by this wrangler contains the values *wall_elapsed*
-    and (optionally, if supported) *process_elapsed*, which measure wall time
-    and process time in seconds, respectively.
+    by using ``pyfmmlib``.
     """
 
     # {{{ constructor
@@ -675,7 +670,6 @@ def reorder_potentials(self, potentials):
         return potentials[self.tree.sorted_target_ids]
 
     @log_process(logger)
-    @return_timing_data
     def form_multipoles(self, actx: PyOpenCLArrayContext,
             level_start_source_box_nrs,
             source_boxes,
@@ -720,7 +714,6 @@ def form_multipoles(self, actx: PyOpenCLArrayContext,
         return mpoles
 
     @log_process(logger)
-    @return_timing_data
     def coarsen_multipoles(self, actx: PyOpenCLArrayContext,
             level_start_source_parent_box_nrs,
             source_parent_boxes,
@@ -778,7 +771,6 @@ def coarsen_multipoles(self, actx: PyOpenCLArrayContext,
         return mpoles
 
     @log_process(logger)
-    @return_timing_data
     def eval_direct(self, actx: PyOpenCLArrayContext,
             target_boxes,
             neighbor_sources_starts,
@@ -825,7 +817,6 @@ def eval_direct(self, actx: PyOpenCLArrayContext,
         return output
 
     @log_process(logger)
-    @return_timing_data
     def multipole_to_local(self, actx: PyOpenCLArrayContext,
             level_start_target_or_target_parent_box_nrs,
             target_or_target_parent_boxes,
@@ -940,7 +931,6 @@ def multipole_to_local(self, actx: PyOpenCLArrayContext,
         return local_exps
 
     @log_process(logger)
-    @return_timing_data
     def eval_multipoles(self, actx: PyOpenCLArrayContext,
             target_boxes_by_source_level,
             sep_smaller_nonsiblings_by_level,
@@ -984,7 +974,6 @@ def eval_multipoles(self, actx: PyOpenCLArrayContext,
         return output
 
     @log_process(logger)
-    @return_timing_data
     def form_locals(self, actx: PyOpenCLArrayContext,
             level_start_target_or_target_parent_box_nrs,
             target_or_target_parent_boxes,
@@ -1065,7 +1054,6 @@ def form_locals(self, actx: PyOpenCLArrayContext,
         return local_exps
 
     @log_process(logger)
-    @return_timing_data
     def refine_locals(self, actx: PyOpenCLArrayContext,
             level_start_target_or_target_parent_box_nrs,
             target_or_target_parent_boxes,
@@ -1114,7 +1102,6 @@ def refine_locals(self, actx: PyOpenCLArrayContext,
         return local_exps
 
     @log_process(logger)
-    @return_timing_data
     def eval_locals(self, actx: PyOpenCLArrayContext,
             level_start_target_box_nrs,
             target_boxes,
diff --git a/boxtree/timing.py b/boxtree/timing.py
deleted file mode 100644
index e3bad59b..00000000
--- a/boxtree/timing.py
+++ /dev/null
@@ -1,171 +0,0 @@
-"""
-.. autoclass:: TimingResult
-
-.. autoclass:: TimingFuture
-"""
-
-__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
-
-__license__ = """
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-"""
-
-
-from collections.abc import Mapping
-
-
-# {{{ timing result
-
-class TimingResult(Mapping):
-    """Interface for returned timing data.
-
-    This supports accessing timing results via a mapping interface, along with
-    combining results via :meth:`merge`.
-
-    .. automethod:: merge
-    """
-
-    def __init__(self, *args, **kwargs):
-        """See constructor for :class:`dict`."""
-        self._mapping = dict(*args, **kwargs)
-
-    def __getitem__(self, key):
-        return self._mapping[key]
-
-    def __iter__(self):
-        return iter(self._mapping)
-
-    def __len__(self):
-        return len(self._mapping)
-
-    def merge(self, other):
-        """Merge this result with another by adding together common fields."""
-        result = {}
-
-        for key in self:
-            val = self.get(key)
-            other_val = other.get(key)
-
-            if val is None or other_val is None:
-                continue
-
-            result[key] = val + other_val
-
-        return type(self)(result)
-
-# }}}
-
-
-# {{{ timing future
-
-class TimingFuture:
-    """Returns timing data for a potentially asynchronous operation.
-
-    .. automethod:: result
-    .. automethod:: done
-    """
-
-    def result(self):
-        """Return a :class:`TimingResult`. May block."""
-        raise NotImplementedError
-
-    def done(self):
-        """Return *True* if the operation is complete."""
-        raise NotImplementedError
-
-# }}}
-
-
-# {{{ timing recorder
-
-class TimingRecorder:
-
-    def __init__(self):
-        from collections import defaultdict
-        self.futures = defaultdict(list)
-
-    def add(self, description, future):
-        self.futures[description].append(future)
-
-    def summarize(self):
-        result = {}
-
-        for description, futures_list in self.futures.items():
-            futures = iter(futures_list)
-
-            timing_result = next(futures).result()
-            for future in futures:
-                timing_result = timing_result.merge(future.result())
-
-            result[description] = timing_result
-
-        return result
-
-# }}}
-
-
-# {{{ time recording tool
-
-class DummyTimingFuture(TimingFuture):
-    @classmethod
-    def from_timer(cls, timer):
-        return cls(wall_elapsed=timer.wall_elapsed,
-                   process_elapsed=timer.process_elapsed)
-
-    @classmethod
-    def from_op_count(cls, op_count):
-        return cls(ops_elapsed=op_count)
-
-    def __init__(self, *args, **kwargs):
-        self._result = TimingResult(*args, **kwargs)
-
-    def result(self):
-        return self._result
-
-    def done(self):
-        return True
-
-
-def return_timing_data(wrapped):
-    """A decorator for recording timing data for a function call.
-
-    The decorated function returns a tuple (*retval*, *timing_future*)
-    where *retval* is the original return value and *timing_future*
-    supports the timing data future interface in :mod:`boxtree.fmm`.
-    """
-
-    from pytools import ProcessTimer
-
-    def wrapper(*args, **kwargs):
-        timer = ProcessTimer()
-        retval = wrapped(*args, **kwargs)
-        timer.done()
-
-        future = DummyTimingFuture.from_timer(timer)
-        return (retval, future)
-
-    from functools import update_wrapper
-    new_wrapper = update_wrapper(wrapper, wrapped)
-
-    return new_wrapper
-
-# }}}
-
-
-# vim: foldmethod=marker
diff --git a/doc/misc.rst b/doc/misc.rst
index 37f83588..ebfaa9f3 100644
--- a/doc/misc.rst
+++ b/doc/misc.rst
@@ -27,13 +27,24 @@ For development, you may want to install in `editable mode
 User-visible Changes
 ====================
 
-Version 2019.1
+.. note::
+
+    You can get snapshots of in-development versions from
+    :mod:`boxtree`'s `git repository <https://github.com/inducer/boxtree>`_.
+
+Version 2024.1
 --------------
 
-.. note::
+* Use :mod:`arraycontext` as the main array abstraction (over :mod:`pyopencl`
+  only at the moment). This changed the API of many functions and classes,
+  since most of them now take an :class:`~arraycontext.ArrayContext` instead
+  of a :class:`pyopencl.Context`.
+* Remove (temporarily) cost model support. This removes the *timing_data*
+  parameter of the FMM driver and the timing futures returned by the wranglers.
+* Remove *DeviceDataRecord* in favour of array containers from :mod:`arraycontext`.
 
-    This version is currently under development. You can get snapshots from
-    boxtree's `git repository <https://github.com/inducer/boxtree>`__
+Version 2019.1
+--------------
 
 * Faster M2Ls in the FMMLIB backend using precomputed rotation matrices.  This
   change adds an optional *rotation_data* parameter to the FMMLIB geometry wrangler
diff --git a/doc/tools.rst b/doc/tools.rst
index 0b5225ee..fd3fc963 100644
--- a/doc/tools.rst
+++ b/doc/tools.rst
@@ -1,8 +1,6 @@
 Utility Functionality
 =====================
 
-.. automodule:: boxtree.timing
-
 .. automodule:: boxtree.constant_one
 
 .. automodule:: boxtree.array_context
diff --git a/examples/cost_model.py b/examples/cost_model.py
index 8328672f..87565918 100644
--- a/examples/cost_model.py
+++ b/examples/cost_model.py
@@ -76,12 +76,9 @@ def fmm_level_to_order(tree, ilevel):
                 fmm_level_to_order=fmm_level_to_order)
         level_orders_list.append(wrangler.level_orders)
 
-        timing_data = {}
         from boxtree.fmm import drive_fmm
         src_weights = rng.random(size=tree.nsources, dtype=tree.coord_dtype)
-        drive_fmm(actx, wrangler, (src_weights,), timing_data=timing_data)
-
-        timing_results.append(timing_data)
+        drive_fmm(actx, wrangler, (src_weights,))
 
     time_field_name = "process_elapsed"
 
@@ -99,6 +96,9 @@ def fmm_level_to_order(tree, ilevel):
         )
     queue.finish()
 
+    if not timing_results:
+        return
+
     params = cost_model.estimate_calibration_params(
         model_results, timing_results[:-1], time_field_name=time_field_name
     )
diff --git a/test/test_cost_model.py b/test/test_cost_model.py
index f3b53980..ec5f1d75 100644
--- a/test/test_cost_model.py
+++ b/test/test_cost_model.py
@@ -375,6 +375,7 @@ def test_compare_cl_and_py_cost_model(actx_factory, nsources, ntargets, dims, dt
 # {{{ test_estimate_calibration_params
 
 @pytest.mark.opencl
+@pytest.mark.skip(reason="cost model is not functional")
 def test_estimate_calibration_params(actx_factory):
     from boxtree.pyfmmlib_integration import (
         FMMLibExpansionWrangler,
@@ -436,12 +437,9 @@ def fmm_level_to_order(tree, ilevel):
                 fmm_level_to_order=fmm_level_to_order)
         level_to_orders.append(wrangler.level_orders)
 
-        timing_data = {}
         from boxtree.fmm import drive_fmm
         src_weights = rng.random(size=tree.nsources, dtype=tree.coord_dtype)
-        drive_fmm(actx, wrangler, (src_weights,), timing_data=timing_data)
-
-        timing_results.append(timing_data)
+        drive_fmm(actx, wrangler, (src_weights,))
 
     time_field_name = "process_elapsed"
 
@@ -458,7 +456,6 @@ def test_params_equal(test_params1, test_params2):
             assert test_params1[name] == test_params2[name]
 
     python_cost_model = _PythonFMMCostModel(make_pde_aware_translation_cost_model)
-
     python_model_results = []
 
     for icase in range(len(traversals)-1):
@@ -563,10 +560,10 @@ def test_cost_model_op_counts_agree_with_constantone_wrangler(
     tree_indep = ConstantOneTreeIndependentDataForWrangler()
     wrangler = ConstantOneExpansionWrangler(tree_indep, trav)
 
-    timing_data = {}
     from boxtree.fmm import drive_fmm
+    timing_data = {}
     src_weights = rng.random(size=tree.nsources, dtype=tree.coord_dtype)
-    drive_fmm(actx, wrangler, (src_weights,), timing_data=timing_data)
+    drive_fmm(actx, wrangler, (src_weights,))
 
     cost_model = FMMCostModel(
         translation_cost_model_factory=OpCountingTranslationCostModel
@@ -579,6 +576,9 @@ def test_cost_model_op_counts_agree_with_constantone_wrangler(
         FMMCostModel.get_unit_calibration_params(),
     )
 
+    if not timing_data:
+        return
+
     mismatches = []
     for stage in timing_data:
         if timing_data[stage]["ops_elapsed"] != modeled_time[stage]:
diff --git a/test/test_distributed.py b/test/test_distributed.py
index 45da4bfc..5faa8dde 100644
--- a/test/test_distributed.py
+++ b/test/test_distributed.py
@@ -138,21 +138,7 @@ def wrangler_factory(local_traversal, global_traversal):
         distributed_fmm_info = DistributedFMMRunner(
             actx, global_tree_host, tg, wrangler_factory, comm=comm)
 
-        timing_data = {}
-        pot_dfmm = distributed_fmm_info.drive_dfmm(
-            actx, [sources_weights], timing_data=timing_data)
-        assert timing_data
-
-    # Uncomment the following section to print the time taken of each stage
-    """
-    if rank == 1:
-        from pytools import Table
-        table = Table()
-        table.add_row(["stage", "time (s)"])
-        for stage in timing_data:
-            table.add_row([stage, "%.2f" % timing_data[stage]["wall_elapsed"]])
-        print(table)
-    """
+        pot_dfmm = distributed_fmm_info.drive_dfmm(actx, [sources_weights])
 
     if rank == 0:
         error = (la.norm(pot_fmm - pot_dfmm * 2 * np.pi, ord=np.inf)
diff --git a/test/test_fmm.py b/test/test_fmm.py
index 460e56fb..7e4d0896 100644
--- a/test/test_fmm.py
+++ b/test/test_fmm.py
@@ -459,10 +459,7 @@ def fmm_level_to_order(tree, lev):
 
     from boxtree.fmm import drive_fmm
 
-    timing_data = {}
-    pot = drive_fmm(actx, wrangler, (weights,), timing_data=timing_data)
-    print(timing_data)
-    assert timing_data
+    pot = drive_fmm(actx, wrangler, (weights,))
 
     # {{{ ref fmmlib computation
 
@@ -788,21 +785,8 @@ def fmm_level_to_order(tree, lev):
 
     from boxtree.fmm import drive_fmm
 
-    baseline_timing_data = {}
-    baseline_pot = drive_fmm(
-        actx, baseline_wrangler, (weights,), timing_data=baseline_timing_data)
-
-    optimized_timing_data = {}
-    optimized_pot = drive_fmm(
-        actx, optimized_wrangler, (weights,), timing_data=optimized_timing_data)
-
-    baseline_time = baseline_timing_data["multipole_to_local"]["process_elapsed"]
-    if baseline_time is not None:
-        print(f"Baseline M2L time : {baseline_time:#.4g} s")
-
-    opt_time = optimized_timing_data["multipole_to_local"]["process_elapsed"]
-    if opt_time is not None:
-        print(f"Optimized M2L time: {opt_time:#.4g} s")
+    baseline_pot = drive_fmm(actx, baseline_wrangler, (weights,))
+    optimized_pot = drive_fmm(actx, optimized_wrangler, (weights,))
 
     assert np.allclose(baseline_pot, optimized_pot, atol=1e-13, rtol=1e-13)
 

From 5b14fe16c914ae641abce1b6823cee4aa21e78cb Mon Sep 17 00:00:00 2001
From: Alexandru Fikl <alexfikl@gmail.com>
Date: Fri, 19 Jul 2024 21:18:23 +0300
Subject: [PATCH 26/28] ruff: mark arraycontext as first-party

---
 boxtree/area_query.py             | 2 +-
 boxtree/array_context.py          | 2 +-
 boxtree/distributed/local_tree.py | 2 +-
 boxtree/distributed/partition.py  | 2 +-
 boxtree/rotation_classes.py       | 2 +-
 boxtree/translation_classes.py    | 2 +-
 boxtree/traversal.py              | 2 +-
 boxtree/tree.py                   | 2 +-
 pyproject.toml                    | 9 +++++----
 test/test_cost_model.py           | 1 +
 test/test_distributed.py          | 1 +
 test/test_fmm.py                  | 1 +
 test/test_tools.py                | 1 +
 test/test_traversal.py            | 1 +
 test/test_tree.py                 | 1 +
 test/test_tree_of_boxes.py        | 1 +
 16 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/boxtree/area_query.py b/boxtree/area_query.py
index 23f78ba2..0654b6f9 100644
--- a/boxtree/area_query.py
+++ b/boxtree/area_query.py
@@ -28,9 +28,9 @@
 from functools import partial
 
 import numpy as np
-from arraycontext import Array
 from mako.template import Template
 
+from arraycontext import Array
 from pyopencl.elementwise import ElementwiseTemplate
 from pytools import ProcessLogger, memoize_method
 
diff --git a/boxtree/array_context.py b/boxtree/array_context.py
index 5fe85c5c..2b0779eb 100644
--- a/boxtree/array_context.py
+++ b/boxtree/array_context.py
@@ -21,6 +21,7 @@
 """
 
 import numpy as np
+
 from arraycontext import (  # noqa: F401
     PyOpenCLArrayContext as PyOpenCLArrayContextBase,
     deserialize_container,
@@ -32,7 +33,6 @@
     _PytestPyOpenCLArrayContextFactoryWithClass,
     register_pytest_array_context_factory,
 )
-
 from pyopencl.algorithm import BuiltList
 
 
diff --git a/boxtree/distributed/local_tree.py b/boxtree/distributed/local_tree.py
index 1a5bb1a6..b1852a83 100644
--- a/boxtree/distributed/local_tree.py
+++ b/boxtree/distributed/local_tree.py
@@ -26,9 +26,9 @@
 from dataclasses import dataclass
 
 import numpy as np
-from arraycontext import Array, ArrayOrContainer
 from mako.template import Template
 
+from arraycontext import Array, ArrayOrContainer
 from pyopencl.elementwise import ElementwiseKernel
 from pyopencl.tools import dtype_to_ctype
 from pytools import memoize_method
diff --git a/boxtree/distributed/partition.py b/boxtree/distributed/partition.py
index 95f40037..32054142 100644
--- a/boxtree/distributed/partition.py
+++ b/boxtree/distributed/partition.py
@@ -24,9 +24,9 @@
 from dataclasses import dataclass
 
 import numpy as np
-from arraycontext import Array
 from mako.template import Template
 
+from arraycontext import Array
 from pyopencl.elementwise import ElementwiseKernel
 from pyopencl.tools import dtype_to_ctype
 from pytools import memoize_method
diff --git a/boxtree/rotation_classes.py b/boxtree/rotation_classes.py
index 43e1b759..0ecd2c40 100644
--- a/boxtree/rotation_classes.py
+++ b/boxtree/rotation_classes.py
@@ -36,8 +36,8 @@
 from dataclasses import dataclass
 
 import numpy as np
-from arraycontext import Array
 
+from arraycontext import Array
 from pytools import log_process
 
 from boxtree.array_context import PyOpenCLArrayContext, dataclass_array_container
diff --git a/boxtree/translation_classes.py b/boxtree/translation_classes.py
index 073a4a9c..301eb0d7 100644
--- a/boxtree/translation_classes.py
+++ b/boxtree/translation_classes.py
@@ -37,9 +37,9 @@
 from functools import partial
 
 import numpy as np
-from arraycontext import Array
 from mako.template import Template
 
+from arraycontext import Array
 from pyopencl.elementwise import ElementwiseKernel, ElementwiseTemplate
 from pytools import memoize_method
 
diff --git a/boxtree/traversal.py b/boxtree/traversal.py
index 5bec4015..3e9b62b9 100644
--- a/boxtree/traversal.py
+++ b/boxtree/traversal.py
@@ -40,9 +40,9 @@
 from functools import partial
 
 import numpy as np
-from arraycontext import Array
 from mako.template import Template
 
+from arraycontext import Array
 from pyopencl.algorithm import ListOfListsBuilder
 from pyopencl.elementwise import ElementwiseKernel, ElementwiseTemplate
 from pytools import ProcessLogger, log_process, memoize_method
diff --git a/boxtree/tree.py b/boxtree/tree.py
index f4a97d1f..a92379cc 100644
--- a/boxtree/tree.py
+++ b/boxtree/tree.py
@@ -81,8 +81,8 @@
 from functools import cached_property
 
 import numpy as np
-from arraycontext import Array
 
+from arraycontext import Array
 from cgen import Enum
 from pytools import memoize_method
 
diff --git a/pyproject.toml b/pyproject.toml
index a9c478d8..08c7f279 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -96,13 +96,14 @@ multiline-quotes = "double"
 [tool.ruff.lint.isort]
 combine-as-imports = true
 known-first-party = [
-    "pytools",
-    "pymbolic",
+    "arraycontext",
+    "cgen",
     "loopy",
-    "pyopencl",
     "meshmode",
     "modepy",
-    "cgen"
+    "pymbolic",
+    "pyopencl",
+    "pytools",
 ]
 known-local-folder = [
     "boxtree",
diff --git a/test/test_cost_model.py b/test/test_cost_model.py
index ec5f1d75..bafa92fc 100644
--- a/test/test_cost_model.py
+++ b/test/test_cost_model.py
@@ -30,6 +30,7 @@
 
 import numpy as np
 import pytest
+
 from arraycontext import pytest_generate_tests_for_array_contexts
 
 from boxtree.array_context import PytestPyOpenCLArrayContextFactory, _acf  # noqa: F401
diff --git a/test/test_distributed.py b/test/test_distributed.py
index 5faa8dde..de97b4f4 100644
--- a/test/test_distributed.py
+++ b/test/test_distributed.py
@@ -27,6 +27,7 @@
 import numpy as np
 import numpy.linalg as la
 import pytest
+
 from arraycontext import pytest_generate_tests_for_array_contexts
 
 from boxtree.array_context import (
diff --git a/test/test_fmm.py b/test/test_fmm.py
index 7e4d0896..7d9c7877 100644
--- a/test/test_fmm.py
+++ b/test/test_fmm.py
@@ -25,6 +25,7 @@
 import numpy as np
 import numpy.linalg as la
 import pytest
+
 from arraycontext import pytest_generate_tests_for_array_contexts
 
 from boxtree.array_context import (
diff --git a/test/test_tools.py b/test/test_tools.py
index 16b65307..a19f465d 100644
--- a/test/test_tools.py
+++ b/test/test_tools.py
@@ -25,6 +25,7 @@
 
 import numpy as np
 import pytest
+
 from arraycontext import pytest_generate_tests_for_array_contexts
 
 from boxtree.array_context import (
diff --git a/test/test_traversal.py b/test/test_traversal.py
index 6de51b36..e3b3f838 100644
--- a/test/test_traversal.py
+++ b/test/test_traversal.py
@@ -25,6 +25,7 @@
 import numpy as np
 import numpy.linalg as la
 import pytest
+
 from arraycontext import pytest_generate_tests_for_array_contexts
 
 from boxtree.array_context import (
diff --git a/test/test_tree.py b/test/test_tree.py
index aa6076f3..59783642 100644
--- a/test/test_tree.py
+++ b/test/test_tree.py
@@ -25,6 +25,7 @@
 
 import numpy as np
 import pytest
+
 from arraycontext import pytest_generate_tests_for_array_contexts
 
 from boxtree.array_context import PytestPyOpenCLArrayContextFactory, _acf  # noqa: F401
diff --git a/test/test_tree_of_boxes.py b/test/test_tree_of_boxes.py
index b7e798a3..894fc9c7 100644
--- a/test/test_tree_of_boxes.py
+++ b/test/test_tree_of_boxes.py
@@ -25,6 +25,7 @@
 
 import numpy as np
 import pytest
+
 from arraycontext import pytest_generate_tests_for_array_contexts
 
 # This means boxtree's tests have a hard dependency on meshmode. That's OK.

From 0daa49615d1cedea1b96b976f63a524b90ba969f Mon Sep 17 00:00:00 2001
From: Alexandru Fikl <alexfikl@gmail.com>
Date: Tue, 2 Aug 2022 10:23:55 +0300
Subject: [PATCH 27/28] point ci to modified downstreams

---
 .github/workflows/ci.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index affa894c..3ec5ce20 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -89,8 +89,8 @@ jobs:
                 curl -L -O https://tiker.net/ci-support-v0
                 . ci-support-v0
 
-                if [[ "$DOWNSTREAM_PROJECT" == "pytential" && "$GITHUB_HEAD_REF" == "rename-nterms" ]]; then
-                   DOWNSTREAM_PROJECT=https://github.com/gaohao95/pytential.git@rename-nterms
+                if [[ "$GITHUB_HEAD_REF" == "towards-array-context" ]]; then
+                   DOWNSTREAM_PROJECT=https://github.com/alexfikl/${DOWNSTREAM_PROJECT}.git@towards-array-context
                 fi
                 test_downstream "$DOWNSTREAM_PROJECT"
 

From 36edb032d7c6a38813528c6db0ed7fa2a5b9421c Mon Sep 17 00:00:00 2001
From: Alexandru Fikl <alexfikl@gmail.com>
Date: Fri, 19 Jul 2024 22:09:07 +0300
Subject: [PATCH 28/28] test: wrap meshmode array context

---
 boxtree/array_context.py   | 73 +++++++++++++++++++++-----------------
 test/test_tree_of_boxes.py | 23 ++++++++++--
 2 files changed, 61 insertions(+), 35 deletions(-)

diff --git a/boxtree/array_context.py b/boxtree/array_context.py
index 2b0779eb..8fdce077 100644
--- a/boxtree/array_context.py
+++ b/boxtree/array_context.py
@@ -43,6 +43,42 @@
 
 # {{{ array context
 
+def _boxtree_rec_map_container(actx, func, array, allowed_types=None, *,
+                               default_scalar=None, strict=False):
+    import arraycontext.impl.pyopencl.taggable_cl_array as tga
+
+    if allowed_types is None:
+        allowed_types = (tga.TaggableCLArray,)
+
+    def _wrapper(ary):
+        # NOTE: this is copied verbatim from arraycontext; the None check below
+        # is the only change, made to allow optional fields inside containers
+        if ary is None:
+            return ary
+
+        if isinstance(ary, allowed_types):
+            return func(ary)
+        elif not strict and isinstance(ary, actx.array_types):
+            from warnings import warn
+            warn(f"Invoking {type(actx).__name__}.{func.__name__[1:]} with "
+                f"{type(ary).__name__} will be unsupported in 2025. Use "
+                "'to_tagged_cl_array' to convert instances to TaggableCLArray.",
+                DeprecationWarning, stacklevel=2)
+            return func(tga.to_tagged_cl_array(ary))
+        elif np.isscalar(ary):
+            if default_scalar is None:
+                return ary
+            else:
+                return np.array(ary).dtype.type(default_scalar)
+        else:
+            raise TypeError(
+                f"{type(actx).__name__}.{func.__name__[1:]} invoked with "
+                f"an unsupported array type: got '{type(ary).__name__}', "
+                f"but expected one of {allowed_types}")
+
+    return rec_map_array_container(_wrapper, array)
+
+
 class PyOpenCLArrayContext(PyOpenCLArrayContextBase):
     def transform_loopy_program(self, t_unit):
         default_ep = t_unit.default_entrypoint
@@ -61,38 +97,11 @@ def transform_loopy_program(self, t_unit):
 
     def _rec_map_container(self, func, array, allowed_types=None, *,
             default_scalar=None, strict=False):
-        import arraycontext.impl.pyopencl.taggable_cl_array as tga
-
-        if allowed_types is None:
-            allowed_types = (tga.TaggableCLArray,)
-
-        def _wrapper(ary):
-            # NOTE: this is copied verbatim from arraycontext and this is the
-            # only change to allow optional fields inside containers
-            if ary is None:
-                return ary
-
-            if isinstance(ary, allowed_types):
-                return func(ary)
-            elif not strict and isinstance(ary, self.array_types):
-                from warnings import warn
-                warn(f"Invoking {type(self).__name__}.{func.__name__[1:]} with "
-                    f"{type(ary).__name__} will be unsupported in 2025. Use "
-                    "'to_tagged_cl_array' to convert instances to TaggableCLArray.",
-                    DeprecationWarning, stacklevel=2)
-                return func(tga.to_tagged_cl_array(ary))
-            elif np.isscalar(ary):
-                if default_scalar is None:
-                    return ary
-                else:
-                    return np.array(ary).dtype.type(default_scalar)
-            else:
-                raise TypeError(
-                    f"{type(self).__name__}.{func.__name__[1:]} invoked with "
-                    f"an unsupported array type: got '{type(ary).__name__}', "
-                    f"but expected one of {allowed_types}")
-
-        return rec_map_array_container(_wrapper, array)
+        return _boxtree_rec_map_container(
+            self, func, array,
+            allowed_types=allowed_types,
+            default_scalar=default_scalar,
+            strict=strict)
 
 # }}}
 
diff --git a/test/test_tree_of_boxes.py b/test/test_tree_of_boxes.py
index 894fc9c7..b26f9770 100644
--- a/test/test_tree_of_boxes.py
+++ b/test/test_tree_of_boxes.py
@@ -30,7 +30,10 @@
 
 # This means boxtree's tests have a hard dependency on meshmode. That's OK.
 from meshmode import _acf  # noqa: F401
-from meshmode.array_context import PytestPyOpenCLArrayContextFactory
+from meshmode.array_context import (
+    PyOpenCLArrayContext,
+    PytestPyOpenCLArrayContextFactory,
+)
 
 from boxtree import (
     make_meshmode_mesh_from_leaves,
@@ -39,10 +42,24 @@
 )
 
 
-logger = logging.getLogger(__name__)
+class ArrayContext(PyOpenCLArrayContext):
+    def _rec_map_container(self, func, array, allowed_types=None, *,
+            default_scalar=None, strict=False):
+        from boxtree.array_context import _boxtree_rec_map_container
+        return _boxtree_rec_map_container(
+            self, func, array,
+            allowed_types=allowed_types,
+            default_scalar=default_scalar,
+            strict=strict)
+
 
+class ContextFactory(PytestPyOpenCLArrayContextFactory):
+    actx_class = ArrayContext
+
+
+logger = logging.getLogger(__name__)
 pytest_generate_tests = pytest_generate_tests_for_array_contexts([
-    PytestPyOpenCLArrayContextFactory,
+    ContextFactory,
     ])