diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 893a8b7d..66542a16 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -101,8 +101,8 @@ jobs:
         run: |
             curl -L -O https://tiker.net/ci-support-v0
             . ./ci-support-v0
 
-            if [[ "$DOWNSTREAM_PROJECT" == "pytential" && "$GITHUB_HEAD_REF" == "e2p" ]]; then
-                DOWNSTREAM_PROJECT=https://github.com/isuruf/pytential.git@e2p
+            if [[ "$DOWNSTREAM_PROJECT" == "pytential" && "$GITHUB_HEAD_REF" == "towards-array-context" ]]; then
+                DOWNSTREAM_PROJECT=https://github.com/alexfikl/pytential.git@towards-array-context
             fi
             test_downstream "$DOWNSTREAM_PROJECT"
diff --git a/examples/curve-pot.py b/examples/curve-pot.py
index db2c667c..2a7183dd 100644
--- a/examples/curve-pot.py
+++ b/examples/curve-pot.py
@@ -91,7 +91,7 @@ def draw_pot_figure(aspect_ratio,
         knl_kwargs = {}
 
     vol_source_knl, vol_target_knl = process_kernel(knl, what_operator)
-    p2p = P2P(actx.context,
+    p2p = P2P(
             source_kernels=(vol_source_knl,),
             target_kernels=(vol_target_knl,),
             exclude_self=False,
@@ -100,7 +100,7 @@ def draw_pot_figure(aspect_ratio,
     lpot_source_knl, lpot_target_knl = process_kernel(knl, what_operator_lpot)
 
     from sumpy.qbx import LayerPotential
-    lpot = LayerPotential(actx.context,
+    lpot = LayerPotential(
             expansion=expn_class(knl, order=order),
             source_kernels=(lpot_source_knl,),
             target_kernels=(lpot_target_knl,),
@@ -183,7 +183,8 @@ def map_to_curve(t):
     def apply_lpot(x):
         xovsmp = np.dot(fim, x)
 
-        evt, (y,) = lpot(actx.queue,
+        y, = lpot(
+                actx,
                 sources,
                 ovsmp_sources,
                 actx.from_numpy(centers),
@@ -208,7 +209,8 @@ def apply_lpot(x):
     density = np.cos(mode_nr*2*np.pi*native_t).astype(np.complex128)
     strength = actx.from_numpy(native_curve.speed * native_weights * density)
 
-    evt, (vol_pot,) = p2p(actx.queue,
+    vol_pot, = p2p(
+            actx,
             targets, sources, [strength],
             **volpot_kwargs)
@@ -218,7 +220,8 @@ def apply_lpot(x):
     ovsmp_strength = actx.from_numpy(
             ovsmp_curve.speed * ovsmp_weights * ovsmp_density)
 
-    evt, (curve_pot,) = lpot(actx.queue,
+    curve_pot, = lpot(
+            actx,
             sources,
             ovsmp_sources,
             actx.from_numpy(centers),
diff --git a/examples/expansion-toys.py b/examples/expansion-toys.py
index 48647d0e..cb8f7165 100644
--- a/examples/expansion-toys.py
+++ b/examples/expansion-toys.py
@@ -23,7 +23,6 @@ def main():
     actx = PyOpenCLArrayContext(queue, force_device_scalars=True)
 
     tctx = t.ToyContext(
-            actx.context,
             # LaplaceKernel(2),
             YukawaKernel(2), extra_kernel_kwargs={"lam": 5},
             # HelmholtzKernel(2), extra_kernel_kwargs={"k": 0.3},
@@ -37,22 +36,22 @@ def main():
     fp = FieldPlotter([3, 0], extent=8)
 
     if USE_MATPLOTLIB:
-        t.logplot(fp, pt_src, cmap="jet")
+        t.logplot(actx, fp, pt_src, cmap="jet")
         plt.colorbar()
         plt.show()
 
-    mexp = t.multipole_expand(pt_src, [0, 0], 5)
-    mexp2 = t.multipole_expand(mexp, [0, 0.25])  # noqa: F841
-    lexp = t.local_expand(mexp, [3, 0])
-    lexp2 = t.local_expand(lexp, [3, 1], 3)
+    mexp = t.multipole_expand(actx, pt_src, [0, 0], order=5)
+    mexp2 = t.multipole_expand(actx, mexp, [0, 0.25])  # noqa: F841
+    lexp = t.local_expand(actx, mexp, [3, 0])
+    lexp2 = t.local_expand(actx, lexp, [3, 1], order=3)
 
     # diff = mexp - pt_src
     # diff = mexp2 - pt_src
     diff = lexp2 - pt_src
 
-    print(t.l_inf(diff, 1.2, center=lexp2.center))
+    print(t.l_inf(actx, diff, 1.2, center=lexp2.center))
 
     if USE_MATPLOTLIB:
-        t.logplot(fp, diff, cmap="jet", vmin=-3, vmax=0)
+        t.logplot(actx, fp, diff, cmap="jet", vmin=-3, vmax=0)
         plt.colorbar()
         plt.show()
diff --git a/requirements.txt b/requirements.txt
index da6a8fd2..9ba95b40 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,7 +10,7 @@ git+https://github.com/inducer/pytools.git#egg=pytools
 git+https://github.com/inducer/pymbolic.git#egg=pymbolic
 git+https://github.com/inducer/islpy.git#egg=islpy
 git+https://github.com/inducer/pyopencl.git#egg=pyopencl
-git+https://github.com/inducer/boxtree.git#egg=boxtree
+git+https://github.com/alexfikl/boxtree.git@towards-array-context#egg=boxtree
 git+https://github.com/inducer/loopy.git#egg=loopy
 git+https://github.com/inducer/arraycontext.git#egg=arraycontext
 git+https://github.com/inducer/pyfmmlib.git#egg=pyfmmlib
diff --git a/sumpy/__init__.py b/sumpy/__init__.py
index bfcbf6d7..81092d94 100644
--- a/sumpy/__init__.py
+++ b/sumpy/__init__.py
@@ -24,7 +24,8 @@
 from sumpy.p2p import P2P, P2PFromCSR
 from sumpy.p2e import P2EFromSingleBox, P2EFromCSR
 from sumpy.e2p import E2PFromSingleBox, E2PFromCSR
-from sumpy.e2e import (E2EFromCSR, E2EFromChildren, E2EFromParent,
+from sumpy.e2e import (
+    E2EFromCSR, E2EFromChildren, E2EFromParent,
     M2LUsingTranslationClassesDependentData,
     M2LGenerateTranslationClassesDependentData, M2LPreprocessMultipole,
     M2LPostprocessLocal)
@@ -41,7 +42,7 @@
     "M2LPreprocessMultipole", "M2LPostprocessLocal"]
 
 
-code_cache = WriteOncePersistentDict("sumpy-code-cache-v6-"+VERSION_TEXT)
+code_cache = WriteOncePersistentDict(f"sumpy-code-cache-v6-{VERSION_TEXT}")
 
 
 # {{{ optimization control
diff --git a/sumpy/array_context.py b/sumpy/array_context.py
index 3c328849..82b186b8 100644
--- a/sumpy/array_context.py
+++ b/sumpy/array_context.py
@@ -20,33 +20,71 @@ THE SOFTWARE.
 """
 
+from typing import Any, Dict, List, Optional, Union
+
+import numpy as np
+
 from boxtree.array_context import PyOpenCLArrayContext as PyOpenCLArrayContextBase
 from arraycontext.pytest import (
     _PytestPyOpenCLArrayContextFactoryWithClass,
     register_pytest_array_context_factory)
+from pytools.tag import ToTagSetConvertible
 
 __doc__ = """
 Array Context
 -------------
 
+.. autofunction:: make_loopy_program
 .. autoclass:: PyOpenCLArrayContext
 """
 
 
 # {{{ PyOpenCLArrayContext
 
+def make_loopy_program(
+        domains, statements,
+        kernel_data: Optional[List[Any]] = None, *,
+        name: str = "sumpy_loopy_kernel",
+        silenced_warnings: Optional[Union[List[str], str]] = None,
+        assumptions: Optional[Union[List[str], str]] = None,
+        fixed_parameters: Optional[Dict[str, Any]] = None,
+        index_dtype: Optional["np.dtype"] = None,
+        tags: ToTagSetConvertible = None):
+    """Return a :class:`loopy.LoopKernel` suitable for use with
+    :meth:`arraycontext.ArrayContext.call_loopy`.
+    """
+    if kernel_data is None:
+        kernel_data = [...]
+
+    if silenced_warnings is None:
+        silenced_warnings = []
+
+    import loopy as lp
+    from arraycontext.loopy import _DEFAULT_LOOPY_OPTIONS
+
+    return lp.make_kernel(
+        domains,
+        statements,
+        kernel_data=kernel_data,
+        options=_DEFAULT_LOOPY_OPTIONS,
+        default_offset=lp.auto,
+        name=name,
+        lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
+        assumptions=assumptions,
+        fixed_parameters=fixed_parameters,
+        silenced_warnings=silenced_warnings,
+        index_dtype=index_dtype,
+        tags=tags)
+
+
 class PyOpenCLArrayContext(PyOpenCLArrayContextBase):
     def transform_loopy_program(self, t_unit):
-        default_ep = t_unit.default_entrypoint
-        options = default_ep.options
+        return t_unit
 
-        if not (options.return_dict and options.no_numpy):
-            raise ValueError("Loopy kernel passed to call_loopy must "
-                    "have return_dict and no_numpy options set. "
-                    "Did you use arraycontext.make_loopy_program "
-                    "to create this kernel?")
 
-        return super().transform_loopy_program(t_unit)
+def is_cl_cpu(actx: PyOpenCLArrayContext) -> bool:
+    import pyopencl as cl
+    return all(dev.type & cl.device_type.CPU for dev in actx.context.devices)
 
 # }}}
diff --git a/sumpy/distributed.py b/sumpy/distributed.py
index 1707e5e6..ac3bcefa 100644
--- a/sumpy/distributed.py
+++ b/sumpy/distributed.py
@@ -20,64 +20,72 @@ THE SOFTWARE.
 """
 
-from boxtree.distributed.calculation import DistributedExpansionWrangler
+from boxtree.distributed.calculation import DistributedExpansionWranglerMixin
+
 from sumpy.fmm import SumpyExpansionWrangler
-import pyopencl as cl
+from sumpy.array_context import PyOpenCLArrayContext
 
 
 class DistributedSumpyExpansionWrangler(
-        DistributedExpansionWrangler, SumpyExpansionWrangler):
+        DistributedExpansionWranglerMixin, SumpyExpansionWrangler):
     def __init__(
-            self, context, comm, tree_indep, local_traversal, global_traversal,
+            self, actx: PyOpenCLArrayContext,
+            comm, tree_indep, local_traversal, global_traversal,
             dtype, fmm_level_to_order,
             communicate_mpoles_via_allreduce=False,
-            **kwarg):
-        DistributedExpansionWrangler.__init__(
-            self, context, comm, global_traversal, True,
-            communicate_mpoles_via_allreduce=communicate_mpoles_via_allreduce)
+            **kwargs):
         SumpyExpansionWrangler.__init__(
-            self, tree_indep, local_traversal, dtype, fmm_level_to_order, **kwarg)
+            self, tree_indep, local_traversal, dtype, fmm_level_to_order,
+            **kwargs)
+
+        self.comm = comm
+        self.traversal_in_device_memory = True
+        self.global_traversal = global_traversal
+        self.communicate_mpoles_via_allreduce = communicate_mpoles_via_allreduce
 
-    def distribute_source_weights(self, src_weight_vecs, src_idx_all_ranks):
-        src_weight_vecs_host = [src_weight.get() for src_weight in src_weight_vecs]
+    def distribute_source_weights(self,
+            actx: PyOpenCLArrayContext, src_weight_vecs, src_idx_all_ranks):
+        src_weight_vecs_host = [
+            actx.to_numpy(src_weight) for src_weight in src_weight_vecs
+        ]
 
         local_src_weight_vecs_host = super().distribute_source_weights(
-            src_weight_vecs_host, src_idx_all_ranks)
+            actx, src_weight_vecs_host, src_idx_all_ranks)
 
         local_src_weight_vecs_device = [
-            cl.array.to_device(src_weight.queue, local_src_weight)
-            for local_src_weight, src_weight in
-            zip(local_src_weight_vecs_host, src_weight_vecs)]
+            actx.from_numpy(local_src_weight)
+            for local_src_weight in local_src_weight_vecs_host]
 
         return local_src_weight_vecs_device
 
-    def gather_potential_results(self, potentials, tgt_idx_all_ranks):
-        mpi_rank = self.comm.Get_rank()
-
-        potentials_host_vec = [potentials_dev.get() for potentials_dev in potentials]
+    def gather_potential_results(self,
+            actx: PyOpenCLArrayContext, potentials, tgt_idx_all_ranks):
+        potentials_host_vec = [
+            actx.to_numpy(potentials_dev) for potentials_dev in potentials
+        ]
 
         gathered_potentials_host_vec = []
         for potentials_host in potentials_host_vec:
             gathered_potentials_host_vec.append(
-                super().gather_potential_results(potentials_host, tgt_idx_all_ranks))
+                super().gather_potential_results(
+                    actx, potentials_host, tgt_idx_all_ranks))
 
-        if mpi_rank == 0:
+        if self.is_mpi_root:
             from pytools.obj_array import make_obj_array
             return make_obj_array([
-                cl.array.to_device(potentials_dev.queue, gathered_potentials_host)
-                for gathered_potentials_host, potentials_dev in
-                zip(gathered_potentials_host_vec, potentials)])
+                actx.from_numpy(gathered_potentials_host)
+                for gathered_potentials_host in gathered_potentials_host_vec
+            ])
         else:
return None def reorder_sources(self, source_array): - if self.comm.Get_rank() == 0: - return source_array.with_queue(source_array.queue)[ - self.global_traversal.tree.user_source_ids] + if self.is_mpi_root: + return source_array[self.global_traversal.tree.user_source_ids] else: return source_array def reorder_potentials(self, potentials): - if self.comm.Get_rank() == 0: + if self.is_mpi_root: from pytools.obj_array import obj_array_vectorize import numpy as np assert ( @@ -91,8 +99,9 @@ def reorder(x): else: return None - def communicate_mpoles(self, mpole_exps, return_stats=False): - mpole_exps_host = mpole_exps.get() - stats = super().communicate_mpoles(mpole_exps_host, return_stats) + def communicate_mpoles(self, + actx: PyOpenCLArrayContext, mpole_exps, return_stats=False): + mpole_exps_host = actx.to_numpy(mpole_exps) + stats = super().communicate_mpoles(actx, mpole_exps_host, return_stats) mpole_exps[:] = mpole_exps_host return stats diff --git a/sumpy/e2e.py b/sumpy/e2e.py index c66f9c02..af7d7642 100644 --- a/sumpy/e2e.py +++ b/sumpy/e2e.py @@ -24,13 +24,11 @@ import numpy as np import loopy as lp -import sumpy.symbolic as sym -import pymbolic -from loopy.version import MOST_RECENT_LANGUAGE_VERSION -from sumpy.tools import KernelCacheMixin, to_complex_dtype -from sumpy.codegen import register_optimization_preambles from pytools import memoize_method +from sumpy.array_context import PyOpenCLArrayContext, make_loopy_program +from sumpy.codegen import register_optimization_preambles +from sumpy.tools import KernelCacheMixin, to_complex_dtype import logging logger = logging.getLogger(__name__) @@ -49,11 +47,10 @@ """ -# {{{ translation base class +# {{{ E2EBase: base class class E2EBase(KernelCacheMixin, ABC): - def __init__(self, ctx, src_expansion, tgt_expansion, - name=None, device=None): + def __init__(self, src_expansion, tgt_expansion, name=None): """ :arg expansion: a subclass of :class:`sympy.expansion.ExpansionBase` :arg strength_usage: A list of integers indicating which expression @@ -61,33 +58,23 @@ def __init__(self, ctx, src_expansion, tgt_expansion, number of strength arrays that need to be passed. Default: all kernels use the same strength. 
""" - - if device is None: - device = ctx.devices[0] + from sumpy.kernel import ( + TargetTransformationRemover, SourceTransformationRemover) + txr = TargetTransformationRemover() + sxr = SourceTransformationRemover() if src_expansion is tgt_expansion: - from sumpy.kernel import (TargetTransformationRemover, - SourceTransformationRemover) - tgt_expansion = src_expansion = src_expansion.with_kernel( - SourceTransformationRemover()( - TargetTransformationRemover()(src_expansion.kernel))) - + tgt_expansion = src_expansion = ( + src_expansion.with_kernel(sxr(txr(src_expansion.kernel)))) else: + src_expansion = ( + src_expansion.with_kernel(sxr(txr(src_expansion.kernel)))) + tgt_expansion = ( + tgt_expansion.with_kernel(sxr(txr(tgt_expansion.kernel)))) - from sumpy.kernel import (TargetTransformationRemover, - SourceTransformationRemover) - src_expansion = src_expansion.with_kernel( - SourceTransformationRemover()( - TargetTransformationRemover()(src_expansion.kernel))) - tgt_expansion = tgt_expansion.with_kernel( - SourceTransformationRemover()( - TargetTransformationRemover()(tgt_expansion.kernel))) - - self.context = ctx self.src_expansion = src_expansion self.tgt_expansion = tgt_expansion self.name = name or self.default_name - self.device = device if src_expansion.dim != tgt_expansion.dim: raise ValueError("source and target expansions must have " @@ -102,8 +89,8 @@ def default_name(self): @memoize_method def get_translation_loopy_insns(self): - from sumpy.symbolic import make_sym_vector - dvec = make_sym_vector("d", self.dim) + import sumpy.symbolic as sym + dvec = sym.make_sym_vector("d", self.dim) src_coeff_exprs = [ sym.Symbol(f"src_coeff{i}") @@ -132,11 +119,7 @@ def get_translation_loopy_insns(self): ) def get_cache_key(self): - return ( - type(self).__name__, - self.src_expansion, - self.tgt_expansion, - ) + return (type(self).__name__, self.src_expansion, self.tgt_expansion) @abstractmethod def get_kernel(self): @@ -153,25 +136,20 @@ def get_optimized_kernel(self): # }}} -# {{{ translation from "compressed sparse row"-like source box lists +# {{{ E2EFromCSR: translation from "compressed sparse row"-like source box lists class E2EFromCSR(E2EBase): """Implements translation from a "compressed sparse row"-like source box list. """ - def __init__(self, ctx, src_expansion, tgt_expansion, - name=None, device=None): - super().__init__(ctx, src_expansion, tgt_expansion, - name=name, device=device) - @property def default_name(self): return "e2e_from_csr" def get_translation_loopy_insns(self): - from sumpy.symbolic import make_sym_vector - dvec = make_sym_vector("d", self.dim) + import sumpy.symbolic as sym + dvec = sym.make_sym_vector("d", self.dim) src_rscale = sym.Symbol("src_rscale") tgt_rscale = sym.Symbol("tgt_rscale") @@ -210,12 +188,11 @@ def get_kernel(self): # (same for itgt_box, tgt_ibox) from sumpy.tools import gather_loopy_arguments - loopy_knl = lp.make_kernel( - [ - "{[itgt_box]: 0<=itgt_box tgt_ibox = target_boxes[itgt_box] @@ -244,7 +221,7 @@ def get_kernel(self): """ for coeffidx in range(ncoeff_tgt)] + [""" end """], - [ + kernel_data=[ lp.GlobalArg("centers", None, shape="dim, aligned_nboxes"), lp.ValueArg("src_rscale,tgt_rscale", None), lp.GlobalArg("src_box_starts, src_box_lists", @@ -257,15 +234,13 @@ def get_kernel(self): shape=("nsrc_level_boxes", ncoeff_src), offset=lp.auto), lp.GlobalArg("tgt_expansions", None, shape=("ntgt_level_boxes", ncoeff_tgt), offset=lp.auto), - "..." + ... 
] + gather_loopy_arguments([self.src_expansion, self.tgt_expansion]), name=self.name, assumptions="ntgt_boxes>=1", silenced_warnings="write_race(write_expn*)", - default_offset=lp.auto, fixed_parameters={"dim": self.dim}, - lang_version=MOST_RECENT_LANGUAGE_VERSION ) for knl in [self.src_expansion.kernel, self.tgt_expansion.kernel]: @@ -285,7 +260,7 @@ def get_optimized_kernel(self): return knl - def __call__(self, queue, **kwargs): + def __call__(self, actx: PyOpenCLArrayContext, **kwargs): """ :arg src_expansions: :arg src_box_starts: @@ -301,12 +276,17 @@ def __call__(self, queue, **kwargs): tgt_rscale = centers.dtype.type(kwargs.pop("tgt_rscale")) knl = self.get_cached_kernel_executor() + result = actx.call_loopy( + knl, + centers=centers, + src_rscale=src_rscale, tgt_rscale=tgt_rscale, + **kwargs) + + return result["tgt_expansions"] +# }}} - return knl(queue, - centers=centers, - src_rscale=src_rscale, tgt_rscale=tgt_rscale, - **kwargs) +# {{{ M2LUsingTranslationClassesDependentData class M2LUsingTranslationClassesDependentData(E2EFromCSR): """Implements translation from a "compressed sparse row"-like source box @@ -318,8 +298,8 @@ def default_name(self): return "m2l_using_translation_classes_dependent_data" def get_translation_loopy_insns(self, result_dtype): - from sumpy.symbolic import make_sym_vector - dvec = make_sym_vector("d", self.dim) + import sumpy.symbolic as sym + dvec = sym.make_sym_vector("d", self.dim) src_rscale = sym.Symbol("src_rscale") tgt_rscale = sym.Symbol("tgt_rscale") @@ -377,11 +357,13 @@ def get_inner_loopy_kernel(self, result_dtype): ncoeff_src = len(self.src_expansion) ncoeff_tgt = len(self.tgt_expansion) + import pymbolic as prim + domains = [] insns = self.get_translation_loopy_insns(result_dtype) - tgt_coeffs = pymbolic.var("tgt_coeffs") + tgt_coeffs = prim.var("tgt_coeffs") for i in range(ncoeff_tgt): - expr = pymbolic.var(f"tgt_coeff{i}") + expr = prim.var(f"tgt_coeff{i}") insn = lp.Assignment(assignee=tgt_coeffs[i], expression=tgt_coeffs[i] + expr) insns.append(insn) @@ -424,13 +406,12 @@ def get_kernel(self, result_dtype): translation_knl = self.get_inner_loopy_kernel(result_dtype) from sumpy.tools import gather_loopy_arguments - loopy_knl = lp.make_kernel( - [ - "{[itgt_box]: 0<=itgt_box d[idim] = m2l_translation_vectors[idim, \ @@ -589,7 +573,7 @@ def get_kernel(self, result_dtype): ) {id=update,dep=set_d} end """], - [ + kernel_data=[ lp.ValueArg("src_rscale", None), lp.GlobalArg("m2l_translation_classes_dependent_data", None, shape=("ntranslation_classes", @@ -600,16 +584,14 @@ def get_kernel(self, result_dtype): lp.ValueArg("ntranslation_classes", np.int32), lp.ValueArg("ntranslation_vectors", np.int32), lp.ValueArg("translation_classes_level_start", np.int32), - "..." + ... 
] + gather_loopy_arguments([self.src_expansion, self.tgt_expansion]), name=self.name, assumptions="ntranslation_classes>=1", - default_offset=lp.auto, fixed_parameters={ "dim": self.dim, "m2l_translation_classes_dependent_ndata": ( m2l_translation_classes_dependent_ndata)}, - lang_version=MOST_RECENT_LANGUAGE_VERSION ) for expr_knl in [self.src_expansion.kernel, self.tgt_expansion.kernel]: @@ -635,7 +617,7 @@ def get_optimized_kernel(self, result_dtype): return knl - def __call__(self, queue, **kwargs): + def __call__(self, actx: PyOpenCLArrayContext, **kwargs): """ :arg src_rscale: :arg translation_classes_level_start: @@ -653,13 +635,15 @@ def __call__(self, queue, **kwargs): result_dtype = m2l_translation_classes_dependent_data.dtype knl = self.get_cached_kernel_executor(result_dtype=result_dtype) + result = actx.call_loopy( + knl, + src_rscale=src_rscale, + m2l_translation_vectors=m2l_translation_vectors, + m2l_translation_classes_dependent_data=( + m2l_translation_classes_dependent_data), + **kwargs) - return knl(queue, - src_rscale=src_rscale, - m2l_translation_vectors=m2l_translation_vectors, - m2l_translation_classes_dependent_data=( - m2l_translation_classes_dependent_data), - **kwargs) + return result["m2l_translation_classes_dependent_data"] # }}} @@ -689,7 +673,7 @@ def get_kernel(self, result_dtype): result_dtype) from sumpy.tools import gather_loopy_arguments - loopy_knl = lp.make_kernel( + loopy_knl = make_loopy_program( [ "{[isrc_box]: 0<=isrc_box tgt_ibox = target_boxes[itgt_box] @@ -917,7 +904,7 @@ def get_kernel(self): end end """], - [ + kernel_data=[ lp.GlobalArg("target_boxes", None, shape=lp.auto, offset=lp.auto), lp.GlobalArg("centers", None, shape="dim, aligned_nboxes"), @@ -931,13 +918,13 @@ def get_kernel(self): lp.ValueArg("src_base_ibox,tgt_base_ibox", np.int32), lp.ValueArg("ntgt_level_boxes,nsrc_level_boxes", np.int32), lp.ValueArg("aligned_nboxes", np.int32), - "..." + ... ] + gather_loopy_arguments([self.src_expansion, self.tgt_expansion]), name=self.name, assumptions="ntgt_boxes>=1", silenced_warnings="write_race(write_expn*)", fixed_parameters={"dim": self.dim, "nchildren": 2**self.dim}, - lang_version=MOST_RECENT_LANGUAGE_VERSION) + ) for knl in [self.src_expansion.kernel, self.tgt_expansion.kernel]: loopy_knl = knl.prepare_loopy_kernel(loopy_knl) @@ -948,7 +935,7 @@ def get_kernel(self): return loopy_knl - def __call__(self, queue, **kwargs): + def __call__(self, actx: PyOpenCLArrayContext, **kwargs): """ :arg src_expansions: :arg src_box_starts: @@ -957,23 +944,25 @@ def __call__(self, queue, **kwargs): :arg tgt_rscale: :arg centers: """ - knl = self.get_cached_kernel_executor() - centers = kwargs.pop("centers") # "1" may be passed for rscale, which won't have its type # meaningfully inferred. Make the type of rscale explicit. 
src_rscale = centers.dtype.type(kwargs.pop("src_rscale")) tgt_rscale = centers.dtype.type(kwargs.pop("tgt_rscale")) - return knl(queue, - centers=centers, - src_rscale=src_rscale, tgt_rscale=tgt_rscale, - **kwargs) + knl = self.get_cached_kernel_executor() + result = actx.call_loopy( + knl, + centers=centers, + src_rscale=src_rscale, tgt_rscale=tgt_rscale, + **kwargs) + + return result["tgt_expansions"] # }}} -# {{{ translation from a box's parent +# {{{ E2EFromParent: translation from a box's parent class E2EFromParent(E2EBase): @property @@ -992,11 +981,10 @@ def get_kernel(self): # (same for itgt_box, tgt_ibox) from sumpy.tools import gather_loopy_arguments - loopy_knl = lp.make_kernel( - [ - "{[itgt_box]: 0<=itgt_box tgt_ibox = target_boxes[itgt_box] @@ -1023,7 +1011,7 @@ def get_kernel(self): """.format(i=i) for i in range(ncoeffs_tgt)] + [""" end """], - [ + kernel_data=[ lp.GlobalArg("target_boxes", None, shape=lp.auto, offset=lp.auto), lp.GlobalArg("centers", None, shape="dim, naligned_boxes"), @@ -1036,12 +1024,13 @@ def get_kernel(self): shape=("ntgt_level_boxes", ncoeffs_tgt), offset=lp.auto), lp.GlobalArg("src_expansions", None, shape=("nsrc_level_boxes", ncoeffs_src), offset=lp.auto), - "..." + ... ] + gather_loopy_arguments([self.src_expansion, self.tgt_expansion]), - name=self.name, assumptions="ntgt_boxes>=1", + name=self.name, + assumptions="ntgt_boxes>=1", silenced_warnings="write_race(write_expn*)", fixed_parameters={"dim": self.dim, "nchildren": 2**self.dim}, - lang_version=MOST_RECENT_LANGUAGE_VERSION) + ) for knl in [self.src_expansion.kernel, self.tgt_expansion.kernel]: loopy_knl = knl.prepare_loopy_kernel(loopy_knl) @@ -1052,7 +1041,7 @@ def get_kernel(self): return loopy_knl - def __call__(self, queue, **kwargs): + def __call__(self, actx: PyOpenCLArrayContext, **kwargs): """ :arg src_expansions: :arg src_box_starts: @@ -1061,18 +1050,20 @@ def __call__(self, queue, **kwargs): :arg tgt_rscale: :arg centers: """ - knl = self.get_cached_kernel_executor() - centers = kwargs.pop("centers") # "1" may be passed for rscale, which won't have its type # meaningfully inferred. Make the type of rscale explicit. 
src_rscale = centers.dtype.type(kwargs.pop("src_rscale")) tgt_rscale = centers.dtype.type(kwargs.pop("tgt_rscale")) - return knl(queue, - centers=centers, - src_rscale=src_rscale, tgt_rscale=tgt_rscale, - **kwargs) + knl = self.get_cached_kernel_executor() + result = actx.call_loopy( + knl, + centers=centers, + src_rscale=src_rscale, tgt_rscale=tgt_rscale, + **kwargs) + + return result["tgt_expansions"] # }}} diff --git a/sumpy/e2p.py b/sumpy/e2p.py index 55af69b3..8d185698 100644 --- a/sumpy/e2p.py +++ b/sumpy/e2p.py @@ -25,9 +25,11 @@ import numpy as np import loopy as lp -from sumpy.tools import KernelCacheMixin, gather_loopy_arguments +from pytools.obj_array import make_obj_array + +from sumpy.array_context import PyOpenCLArrayContext, make_loopy_program from sumpy.codegen import register_optimization_preambles -from loopy.version import MOST_RECENT_LANGUAGE_VERSION +from sumpy.tools import KernelCacheMixin, gather_loopy_arguments __doc__ = """ @@ -42,11 +44,10 @@ """ -# {{{ E2P base class +# {{{ E2PBase: base class class E2PBase(KernelCacheMixin, ABC): - def __init__(self, ctx, expansion, kernels, - name=None, device=None): + def __init__(self, expansion, kernels, name=None): """ :arg expansion: a subclass of :class:`sympy.expansion.ExpansionBase` :arg strength_usage: A list of integers indicating which expression @@ -55,29 +56,26 @@ def __init__(self, ctx, expansion, kernels, Default: all kernels use the same strength. """ - if device is None: - device = ctx.devices[0] - - from sumpy.kernel import (SourceTransformationRemover, - TargetTransformationRemover) + from sumpy.kernel import ( + SourceTransformationRemover, TargetTransformationRemover) sxr = SourceTransformationRemover() txr = TargetTransformationRemover() - expansion = expansion.with_kernel( - sxr(expansion.kernel)) + expansion = expansion.with_kernel(sxr(expansion.kernel)) kernels = [sxr(knl) for knl in kernels] for knl in kernels: assert txr(knl) == expansion.kernel - self.context = ctx self.expansion = expansion self.kernels = kernels self.name = name or self.default_name - self.device = device self.dim = expansion.dim @property + def nresults(self): + return len(self.kernels) + @abstractmethod def default_name(self): pass @@ -111,7 +109,7 @@ def get_kernel_scaling_assignment(self): # }}} -# {{{ E2P to single box (L2P, likely) +# {{{ E2PFromSingleBox: E2P to single box (L2P, likely) class E2PFromSingleBox(E2PBase): @property @@ -122,7 +120,7 @@ def get_kernel(self): ncoeffs = len(self.expansion) loopy_args = self.get_loopy_args() - loopy_knl = lp.make_kernel( + loopy_knl = make_loopy_program( [ "{[itgt_box]: 0<=itgt_box List[cl.Event]: - return [evt if isinstance(evt, cl.Event) else evt.native_event - for evt in self.events] - - @memoize_method - def result(self): - from boxtree.timing import TimingResult - - if not self.queue.properties & cl.command_queue_properties.PROFILING_ENABLE: - from warnings import warn - warn( - "Profiling was not enabled in the command queue. 
" - "Timing data will not be collected.", - category=UnableToCollectTimingData, - stacklevel=3) - return TimingResult(wall_elapsed=None) - - if self.events: - pyopencl.wait_for_events(self.native_events) - - result = 0 - for event in self.events: - result += ( - (event.profile.end - event.profile.start) - * _SECONDS_PER_NANOSECOND) - - return TimingResult(wall_elapsed=result) - - def done(self): - return all( - event.get_info(cl.event_info.COMMAND_EXECUTION_STATUS) - == cl.command_execution_status.COMPLETE - for event in self.native_events) + return get_opencl_fft_app(self._setup_actx, shape, dtype, inverse=inverse) # }}} @@ -274,7 +207,8 @@ class SumpyExpansionWrangler(ExpansionWranglerInterface): Type for the preprocessed multipole expansion if used for M2L. """ - def __init__(self, tree_indep, traversal, dtype, fmm_level_to_order, + def __init__(self, + tree_indep, traversal, dtype, fmm_level_to_order, source_extra_kwargs=None, kernel_extra_kwargs=None, self_extra_kwargs=None, @@ -282,7 +216,6 @@ def __init__(self, tree_indep, traversal, dtype, fmm_level_to_order, preprocessed_mpole_dtype=None, *, _disable_translation_classes=False): super().__init__(tree_indep, traversal) - self.issued_timing_data_warning = False self.dtype = dtype @@ -322,13 +255,14 @@ def __init__(self, tree_indep, traversal, dtype, fmm_level_to_order, self.supports_translation_classes = False else: if translation_classes_data is None: - with cl.CommandQueue(self.tree_indep.cl_context) as queue: - from boxtree.translation_classes import TranslationClassesBuilder - translation_classes_builder = TranslationClassesBuilder( - queue.context) - translation_classes_data, _ = translation_classes_builder( - queue, traversal, self.tree, - is_translation_per_level=True) + from boxtree.translation_classes import TranslationClassesBuilder + + actx = tree_indep._setup_actx + translation_classes_builder = TranslationClassesBuilder(actx) + translation_classes_data, _ = translation_classes_builder( + actx, traversal, self.tree, is_translation_per_level=True) + translation_classes_data = actx.freeze(translation_classes_data) + self.supports_translation_classes = True self.translation_classes_data = translation_classes_data @@ -354,9 +288,17 @@ def level_to_rscale(self, level): # {{{ data vector utilities + @property + @memoize_method + def tree_level_start_box_nrs(self): + # NOTE: a host version of `level_start_box_nrs` is used repeatedly and + # this simply caches it to avoid repeated transfers + actx = self.tree_indep._setup_actx + return actx.to_numpy(self.tree.level_start_box_nrs) + def _expansions_level_starts(self, order_to_size): return build_csr_level_starts(self.level_orders, order_to_size, - self.tree.level_start_box_nrs) + self.tree_level_start_box_nrs) @memoize_method def multipole_expansions_level_starts(self): @@ -370,9 +312,10 @@ def local_expansions_level_starts(self): @memoize_method def m2l_translation_class_level_start_box_nrs(self): - with cl.CommandQueue(self.tree_indep.cl_context) as queue: - data = self.translation_classes_data - return data.from_sep_siblings_translation_classes_level_starts.get(queue) + actx = self.tree_indep._setup_actx + return actx.to_numpy( + self.translation_classes_data + .from_sep_siblings_translation_classes_level_starts) @memoize_method def m2l_translation_classes_dependent_data_level_starts(self): @@ -386,68 +329,65 @@ def order_to_size(order): return build_csr_level_starts(self.level_orders, order_to_size, level_starts=self.m2l_translation_class_level_start_box_nrs()) - def 
multipole_expansion_zeros(self, template_ary): + def multipole_expansion_zeros(self, actx: PyOpenCLArrayContext) -> Array: """Return an expansions array (which must support addition) capable of holding one multipole or local expansion for every box in the tree. - :arg template_ary: an array (not necessarily of the same shape or dtype as - the one to be created) whose run-time environment - (e.g. :class:`pyopencl.CommandQueue`) the returned array should - reuse. """ - return cl.array.zeros( - template_ary.queue, + return actx.zeros( self.multipole_expansions_level_starts()[-1], dtype=self.dtype) - def local_expansion_zeros(self, template_ary): + def local_expansion_zeros(self, actx) -> Array: """Return an expansions array (which must support addition) capable of holding one multipole or local expansion for every box in the tree. - :arg template_ary: an array (not necessarily of the same shape or dtype as - the one to be created) whose run-time environment - (e.g. :class:`pyopencl.CommandQueue`) the returned array should - reuse. """ - return cl.array.zeros( - template_ary.queue, + return actx.zeros( self.local_expansions_level_starts()[-1], dtype=self.dtype) - def m2l_translation_classes_dependent_data_zeros(self, queue): + def m2l_translation_classes_dependent_data_zeros( + self, actx: PyOpenCLArrayContext): + data_level_starts = ( + self.m2l_translation_classes_dependent_data_level_starts()) + level_start_box_nrs = ( + self.m2l_translation_class_level_start_box_nrs()) + result = [] for level in range(self.tree.nlevels): - expn_start, expn_stop = \ - self.m2l_translation_classes_dependent_data_level_starts()[ - level:level+2] - translation_class_start, translation_class_stop = \ - self.m2l_translation_class_level_start_box_nrs()[level:level+2] - exprs_level = cl.array.zeros(queue, expn_stop - expn_start, - dtype=self.preprocessed_mpole_dtype) - result.append(exprs_level.reshape( - translation_class_stop - translation_class_start, -1)) + expn_start, expn_stop = data_level_starts[level:level + 2] + translation_class_start, translation_class_stop = ( + level_start_box_nrs[level:level + 2]) + + exprs_level = actx.zeros( + expn_stop - expn_start, + dtype=self.preprocessed_mpole_dtype + ).reshape(translation_class_stop - translation_class_start, -1) + result.append(exprs_level) + return result def multipole_expansions_view(self, mpole_exps, level): - expn_start, expn_stop = \ - self.multipole_expansions_level_starts()[level:level+2] - box_start, box_stop = self.tree.level_start_box_nrs[level:level+2] + expn_start, expn_stop = ( + self.multipole_expansions_level_starts()[level:level + 2]) + box_start, box_stop = self.tree_level_start_box_nrs[level:level + 2] return (box_start, mpole_exps[expn_start:expn_stop].reshape(box_stop-box_start, -1)) def local_expansions_view(self, local_exps, level): - expn_start, expn_stop = \ - self.local_expansions_level_starts()[level:level+2] - box_start, box_stop = self.tree.level_start_box_nrs[level:level+2] + expn_start, expn_stop = ( + self.local_expansions_level_starts()[level:level + 2]) + box_start, box_stop = self.tree_level_start_box_nrs[level:level + 2] return (box_start, local_exps[expn_start:expn_stop].reshape(box_stop-box_start, -1)) def m2l_translation_classes_dependent_data_view(self, m2l_translation_classes_dependent_data, level): - translation_class_start, _ = \ - self.m2l_translation_class_level_start_box_nrs()[level:level+2] + translation_class_start, _ = ( + self.m2l_translation_class_level_start_box_nrs()[level:level + 2]) exprs_level = 
m2l_translation_classes_dependent_data[level] return (translation_class_start, exprs_level) @@ -461,49 +401,47 @@ def order_to_size(order): return res return build_csr_level_starts(self.level_orders, order_to_size, - level_starts=self.tree.level_start_box_nrs) + level_starts=self.tree_level_start_box_nrs) + + def m2l_preproc_mpole_expansion_zeros( + self, actx: PyOpenCLArrayContext, template_ary): + level_starts = self.m2l_preproc_mpole_expansions_level_starts() - def m2l_preproc_mpole_expansion_zeros(self, template_ary): result = [] for level in range(self.tree.nlevels): - expn_start, expn_stop = \ - self.m2l_preproc_mpole_expansions_level_starts()[level:level+2] - box_start, box_stop = self.tree.level_start_box_nrs[level:level+2] - exprs_level = cl.array.zeros(template_ary.queue, expn_stop - expn_start, - dtype=self.preprocessed_mpole_dtype) - result.append(exprs_level.reshape(box_stop - box_start, -1)) + expn_start, expn_stop = level_starts[level:level+2] + box_start, box_stop = self.tree_level_start_box_nrs[level:level+2] + + exprs_level = actx.zeros( + expn_stop - expn_start, + dtype=self.preprocessed_mpole_dtype, + ).reshape(box_stop - box_start, -1) + result.append(exprs_level) + return result def m2l_preproc_mpole_expansions_view(self, mpole_exps, level): - box_start, _ = self.tree.level_start_box_nrs[level:level+2] + box_start, _ = self.tree_level_start_box_nrs[level:level+2] return (box_start, mpole_exps[level]) m2l_work_array_view = m2l_preproc_mpole_expansions_view m2l_work_array_zeros = m2l_preproc_mpole_expansion_zeros - m2l_work_array_level_starts = \ - m2l_preproc_mpole_expansions_level_starts + m2l_work_array_level_starts = m2l_preproc_mpole_expansions_level_starts - def output_zeros(self, template_ary): + def output_zeros(self, actx: PyOpenCLArrayContext) -> np.ndarray: """Return a potentials array (which must support addition) capable of holding a potential value for each target in the tree. Note that :func:`drive_fmm` makes no assumptions about *potential* other than that it supports addition--it may consist of potentials, gradients of the potential, or arbitrary other per-target output data. - :arg template_ary: an array (not necessarily of the same shape or dtype as - the one to be created) whose run-time environment - (e.g. :class:`pyopencl.CommandQueue`) the returned array should - reuse. 
""" from pytools.obj_array import make_obj_array return make_obj_array([ - cl.array.zeros( - template_ary.queue, - self.tree.ntargets, - dtype=self.dtype) + actx.zeros(self.tree.ntargets, dtype=self.dtype) for k in self.tree_indep.target_kernels]) def reorder_sources(self, source_array): - return source_array.with_queue(source_array.queue)[self.tree.user_source_ids] + return source_array[self.tree.user_source_ids] def reorder_potentials(self, potentials): from pytools.obj_array import obj_array_vectorize @@ -520,16 +458,18 @@ def reorder(x): @property @memoize_method def max_nsources_in_one_box(self): - with cl.CommandQueue(self.tree_indep.cl_context) as queue: - return int(pyopencl.array.max(self.tree.box_source_counts_nonchild, - queue).get()) + actx = self.tree_indep._setup_actx + return actx.to_numpy( + actx.np.max(self.tree.box_source_counts_nonchild) + ).item() @property @memoize_method def max_ntargets_in_one_box(self): - with cl.CommandQueue(self.tree_indep.cl_context) as queue: - return int(pyopencl.array.max(self.tree.box_target_counts_nonchild, - queue).get()) + actx = self.tree_indep._setup_actx + return actx.to_numpy( + actx.np.max(self.tree.box_target_counts_nonchild) + ).item() # }}} @@ -553,22 +493,29 @@ def box_target_list_kwargs(self): # }}} - def run_opencl_fft(self, queue, input_vec, inverse, wait_for): + def run_opencl_fft(self, actx: PyOpenCLArrayContext, + input_vec, inverse, wait_for): app = self.tree_indep.opencl_fft_app(input_vec.shape, input_vec.dtype, inverse) - return run_opencl_fft(app, queue, input_vec, inverse, wait_for) + evt, result = run_opencl_fft( + actx, app, input_vec, inverse=inverse, wait_for=wait_for) + + from sumpy.tools import get_native_event + input_vec.add_event(get_native_event(evt)) + result.add_event(get_native_event(evt)) + + return result def form_multipoles(self, + actx: PyOpenCLArrayContext, level_start_source_box_nrs, source_boxes, src_weight_vecs): - mpoles = self.multipole_expansion_zeros(src_weight_vecs[0]) + mpoles = self.multipole_expansion_zeros(actx) + level_start_source_box_nrs = actx.to_numpy(level_start_source_box_nrs) kwargs = self.extra_kwargs.copy() kwargs.update(self.box_source_list_kwargs()) - events = [] - queue = src_weight_vecs[0].queue - for lev in range(self.tree.nlevels): p2m = self.tree_indep.p2m(self.level_orders[lev]) start, stop = level_start_source_box_nrs[lev:lev+2] @@ -578,8 +525,8 @@ def form_multipoles(self, level_start_ibox, mpoles_view = self.multipole_expansions_view( mpoles, lev) - evt, (mpoles_res,) = p2m( - queue, + mpoles_res = p2m( + actx, source_boxes=source_boxes[start:stop], centers=self.tree.box_centers, strengths=src_weight_vecs, @@ -588,20 +535,19 @@ def form_multipoles(self, rscale=self.level_to_rscale(lev), **kwargs) - events.append(evt) assert mpoles_res is mpoles_view - return (mpoles, SumpyTimingFuture(queue, events)) + return mpoles def coarsen_multipoles(self, + actx: PyOpenCLArrayContext, level_start_source_parent_box_nrs, source_parent_boxes, mpoles): tree = self.tree - - events = [] - queue = mpoles.queue + level_start_source_parent_box_nrs = ( + actx.to_numpy(level_start_source_parent_box_nrs)) # nlevels-1 is the last valid level index # nlevels-2 is the last valid level that could have children @@ -623,13 +569,13 @@ def coarsen_multipoles(self, self.level_orders[source_level], self.level_orders[target_level]) - source_level_start_ibox, source_mpoles_view = \ - self.multipole_expansions_view(mpoles, source_level) - target_level_start_ibox, target_mpoles_view = \ - 
self.multipole_expansions_view(mpoles, target_level) + source_level_start_ibox, source_mpoles_view = ( + self.multipole_expansions_view(mpoles, source_level)) + target_level_start_ibox, target_mpoles_view = ( + self.multipole_expansions_view(mpoles, target_level)) - evt, (mpoles_res,) = m2m( - queue, + mpoles_res = m2m( + actx, src_expansions=source_mpoles_view, src_base_ibox=source_level_start_ibox, tgt_expansions=target_mpoles_view, @@ -643,28 +589,23 @@ def coarsen_multipoles(self, tgt_rscale=self.level_to_rscale(target_level), **self.kernel_extra_kwargs) - events.append(evt) assert mpoles_res is target_mpoles_view - if events: - mpoles.add_event(events[-1]) - - return (mpoles, SumpyTimingFuture(queue, events)) + return mpoles - def eval_direct(self, target_boxes, source_box_starts, + def eval_direct(self, + actx: PyOpenCLArrayContext, + target_boxes, source_box_starts, source_box_lists, src_weight_vecs): - pot = self.output_zeros(src_weight_vecs[0]) + pot = self.output_zeros(actx) kwargs = self.extra_kwargs.copy() kwargs.update(self.self_extra_kwargs) kwargs.update(self.box_source_list_kwargs()) kwargs.update(self.box_target_list_kwargs()) - events = [] - queue = src_weight_vecs[0].queue - - evt, pot_res = self.tree_indep.p2p()(queue, + pot_res = self.tree_indep.p2p()(actx, target_boxes=target_boxes, source_box_starts=source_box_starts, source_box_lists=source_box_lists, @@ -673,67 +614,65 @@ def eval_direct(self, target_boxes, source_box_starts, max_nsources_in_one_box=self.max_nsources_in_one_box, max_ntargets_in_one_box=self.max_ntargets_in_one_box, **kwargs) - events.append(evt) for pot_i, pot_res_i in zip(pot, pot_res): assert pot_i is pot_res_i - pot_i.add_event(evt) - return (pot, SumpyTimingFuture(queue, events)) + return pot @memoize_method def multipole_to_local_precompute(self): - result = [] - with cl.CommandQueue(self.tree_indep.cl_context) as queue: - m2l_translation_classes_dependent_data = \ - self.m2l_translation_classes_dependent_data_zeros(queue) - for lev in range(self.tree.nlevels): - src_rscale = self.level_to_rscale(lev) - order = self.level_orders[lev] - precompute_kernel = \ - self.tree_indep.m2l_translation_class_dependent_data_kernel( - order, order) - - translation_classes_level_start, \ - m2l_translation_classes_dependent_data_view = \ - self.m2l_translation_classes_dependent_data_view( - m2l_translation_classes_dependent_data, lev) - - ntranslation_classes = \ - m2l_translation_classes_dependent_data_view.shape[0] + actx = self.tree_indep._setup_actx - if ntranslation_classes == 0: - result.append(pyopencl.array.empty_like( - m2l_translation_classes_dependent_data_view)) - continue + result = [] + m2l_translation_classes_dependent_data = ( + self.m2l_translation_classes_dependent_data_zeros(actx)) - data = self.translation_classes_data - m2l_translation_vectors = ( - data.from_sep_siblings_translation_class_to_distance_vector) - - evt, _ = precompute_kernel( - queue, - src_rscale=src_rscale, - translation_classes_level_start=translation_classes_level_start, - ntranslation_classes=ntranslation_classes, - m2l_translation_classes_dependent_data=( - m2l_translation_classes_dependent_data_view), - m2l_translation_vectors=m2l_translation_vectors, - ntranslation_vectors=m2l_translation_vectors.shape[1], - **self.kernel_extra_kwargs + for lev in range(self.tree.nlevels): + src_rscale = self.level_to_rscale(lev) + order = self.level_orders[lev] + precompute_kernel = ( + self.tree_indep.m2l_translation_class_dependent_data_kernel( + order, order) ) - if 
self.tree_indep.m2l_translation.use_fft: - _, m2l_translation_classes_dependent_data_view = \ - self.run_opencl_fft(queue, - m2l_translation_classes_dependent_data_view, - inverse=False, wait_for=[evt]) - result.append(m2l_translation_classes_dependent_data_view) + translation_classes_level_start, \ + m2l_translation_classes_dependent_data_view = \ + self.m2l_translation_classes_dependent_data_view( + m2l_translation_classes_dependent_data, lev) - for lev in range(self.tree.nlevels): - result[lev].finish() + ntranslation_classes = ( + m2l_translation_classes_dependent_data_view.shape[0]) - result = [arr.with_queue(None) for arr in result] + if ntranslation_classes == 0: + result.append(actx.np.zeros_like( + m2l_translation_classes_dependent_data_view)) + continue + + data = self.translation_classes_data + m2l_translation_vectors = ( + data.from_sep_siblings_translation_class_to_distance_vector) + + precompute_kernel( + actx, + src_rscale=src_rscale, + translation_classes_level_start=translation_classes_level_start, + ntranslation_classes=ntranslation_classes, + m2l_translation_classes_dependent_data=( + m2l_translation_classes_dependent_data_view), + m2l_translation_vectors=m2l_translation_vectors, + ntranslation_vectors=m2l_translation_vectors.shape[1], + **self.kernel_extra_kwargs + ) + + if self.tree_indep.m2l_translation.use_fft: + m2l_translation_classes_dependent_data_view = ( + self.run_opencl_fft(actx, + m2l_translation_classes_dependent_data_view, + inverse=False, wait_for=None)) + result.append(m2l_translation_classes_dependent_data_view) + + result = [actx.freeze(arr) for arr in result] return result def _add_m2l_precompute_kwargs(self, kwargs_for_m2l, @@ -758,17 +697,18 @@ def _add_m2l_precompute_kwargs(self, kwargs_for_m2l, self.translation_classes_data.from_sep_siblings_translation_classes def multipole_to_local(self, + actx: PyOpenCLArrayContext, level_start_target_box_nrs, target_boxes, src_box_starts, src_box_lists, mpole_exps): - queue = mpole_exps.queue - local_exps = self.local_expansion_zeros(mpole_exps) + local_exps = self.local_expansion_zeros(actx) + level_start_target_box_nrs = actx.to_numpy(level_start_target_box_nrs) if self.tree_indep.m2l_translation.use_preprocessing: - preprocessed_mpole_exps = \ - self.m2l_preproc_mpole_expansion_zeros(mpole_exps) - m2l_work_array = self.m2l_work_array_zeros(local_exps) + preprocessed_mpole_exps = ( + self.m2l_preproc_mpole_expansion_zeros(actx, mpole_exps)) + m2l_work_array = self.m2l_work_array_zeros(actx, local_exps) mpole_exps_view_func = self.m2l_preproc_mpole_expansions_view local_exps_view_func = self.m2l_work_array_view else: @@ -777,10 +717,6 @@ def multipole_to_local(self, mpole_exps_view_func = self.multipole_expansions_view local_exps_view_func = self.local_expansions_view - preprocess_evts = [] - translate_evts = [] - postprocess_evts = [] - for lev in range(self.tree.nlevels): wait_for = [] @@ -801,25 +737,20 @@ def multipole_to_local(self, # There is no M2L happening in this level continue - evt, _ = preprocess_mpole_kernel( - queue, + preprocess_mpole_kernel( + actx, src_expansions=source_mpoles_view, preprocessed_src_expansions=preprocessed_mpole_exps[lev], src_rscale=self.level_to_rscale(lev), wait_for=wait_for, **self.kernel_extra_kwargs ) - wait_for.append(evt) if self.tree_indep.m2l_translation.use_fft: - evt_fft, preprocessed_mpole_exps[lev] = \ - self.run_opencl_fft(queue, + preprocessed_mpole_exps[lev] = \ + self.run_opencl_fft(actx, preprocessed_mpole_exps[lev], inverse=False, wait_for=wait_for) - 
wait_for.append(get_native_event(evt_fft)) - evt = AggregateProfilingEvent([evt, evt_fft]) - - preprocess_evts.append(evt) order = self.level_orders[lev] m2l = self.tree_indep.m2l(order, order, @@ -851,9 +782,7 @@ def multipole_to_local(self, kwargs["m2l_translation_classes_dependent_data"].size == 0: # There is nothing to do for this level continue - evt, _ = m2l(queue, **kwargs, wait_for=wait_for) - wait_for.append(evt) - translate_evts.append(evt) + m2l(actx, **kwargs, wait_for=wait_for) if self.tree_indep.m2l_translation.use_preprocessing: order = self.level_orders[lev] @@ -873,14 +802,13 @@ def multipole_to_local(self, continue if self.tree_indep.m2l_translation.use_fft: - evt_fft, target_locals_before_postprocessing_view = \ - self.run_opencl_fft(queue, + target_locals_before_postprocessing_view = \ + self.run_opencl_fft(actx, target_locals_before_postprocessing_view, inverse=True, wait_for=wait_for) - wait_for.append(get_native_event(evt_fft)) - evt, _ = postprocess_local_kernel( - queue, + postprocess_local_kernel( + actx, tgt_expansions=target_locals_view, tgt_expansions_before_postprocessing=( target_locals_before_postprocessing_view), @@ -890,27 +818,17 @@ def multipole_to_local(self, **self.kernel_extra_kwargs, ) - if self.tree_indep.m2l_translation.use_fft: - postprocess_evts.append(AggregateProfilingEvent([evt_fft, evt])) - else: - postprocess_evts.append(evt) - - timing_events = preprocess_evts + translate_evts + postprocess_evts - - return (local_exps, SumpyTimingFuture(queue, timing_events)) + return local_exps def eval_multipoles(self, + actx: PyOpenCLArrayContext, target_boxes_by_source_level, source_boxes_by_level, mpole_exps): - pot = self.output_zeros(mpole_exps) + pot = self.output_zeros(actx) kwargs = self.kernel_extra_kwargs.copy() kwargs.update(self.box_target_list_kwargs()) - events = [] - queue = mpole_exps.queue - wait_for = mpole_exps.events - for isrc_level, ssn in enumerate(source_boxes_by_level): if len(target_boxes_by_source_level[isrc_level]) == 0: continue @@ -920,8 +838,8 @@ def eval_multipoles(self, source_level_start_ibox, source_mpoles_view = \ self.multipole_expansions_view(mpole_exps, isrc_level) - evt, pot_res = m2p( - queue, + pot_res = m2p( + actx, src_expansions=source_mpoles_view, src_base_ibox=source_level_start_ibox, @@ -937,33 +855,26 @@ def eval_multipoles(self, wait_for=wait_for, **kwargs) - events.append(evt) - - wait_for = [evt] for pot_i, pot_res_i in zip(pot, pot_res): assert pot_i is pot_res_i - if events: - for pot_i in pot: - pot_i.add_event(events[-1]) - - return (pot, SumpyTimingFuture(queue, events)) + return pot def form_locals(self, + actx: PyOpenCLArrayContext, level_start_target_or_target_parent_box_nrs, target_or_target_parent_boxes, starts, lists, src_weight_vecs): - local_exps = self.local_expansion_zeros(src_weight_vecs[0]) + local_exps = self.local_expansion_zeros(actx) + level_start_target_or_target_parent_box_nrs = ( + actx.to_numpy(level_start_target_or_target_parent_box_nrs)) kwargs = self.extra_kwargs.copy() kwargs.update(self.box_source_list_kwargs()) - events = [] - queue = src_weight_vecs[0].queue - for lev in range(self.tree.nlevels): - start, stop = \ - level_start_target_or_target_parent_box_nrs[lev:lev+2] + start, stop = ( + level_start_target_or_target_parent_box_nrs[lev:lev+2]) if start == stop: continue @@ -972,8 +883,8 @@ def form_locals(self, target_level_start_ibox, target_local_exps_view = \ self.local_expansions_view(local_exps, lev) - evt, (result,) = p2l( - queue, + result = p2l( + actx, 
target_boxes=target_or_target_parent_boxes[start:stop], source_box_starts=starts[start:stop+1], source_box_lists=lists, @@ -986,23 +897,22 @@ def form_locals(self, rscale=self.level_to_rscale(lev), **kwargs) - events.append(evt) assert result is target_local_exps_view - return (local_exps, SumpyTimingFuture(queue, events)) + return local_exps def refine_locals(self, + actx: PyOpenCLArrayContext, level_start_target_or_target_parent_box_nrs, target_or_target_parent_boxes, local_exps): - - events = [] - queue = local_exps.queue + level_start_target_or_target_parent_box_nrs = ( + actx.to_numpy(level_start_target_or_target_parent_box_nrs)) for target_lev in range(1, self.tree.nlevels): - start, stop = level_start_target_or_target_parent_box_nrs[ - target_lev:target_lev+2] + start, stop = ( + level_start_target_or_target_parent_box_nrs[target_lev:target_lev+2]) if start == stop: continue @@ -1016,7 +926,7 @@ def refine_locals(self, target_level_start_ibox, target_local_exps_view = \ self.local_expansions_view(local_exps, target_lev) - evt, (local_exps_res,) = l2l(queue, + local_exps_res = l2l(actx, src_expansions=source_local_exps_view, src_base_ibox=source_level_start_ibox, tgt_expansions=target_local_exps_view, @@ -1030,24 +940,20 @@ def refine_locals(self, tgt_rscale=self.level_to_rscale(target_lev), **self.kernel_extra_kwargs) - events.append(evt) assert local_exps_res is target_local_exps_view - for evt in events: - local_exps.add_event(evt) + return local_exps - return (local_exps, SumpyTimingFuture(queue, events)) - - def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps): - pot = self.output_zeros(local_exps) + def eval_locals(self, + actx: PyOpenCLArrayContext, + level_start_target_box_nrs, target_boxes, local_exps): + pot = self.output_zeros(actx) + level_start_target_box_nrs = actx.to_numpy(level_start_target_box_nrs) kwargs = self.kernel_extra_kwargs.copy() kwargs.update(self.box_target_list_kwargs()) - events = [] - queue = local_exps.queue - for lev in range(self.tree.nlevels): start, stop = level_start_target_box_nrs[lev:lev+2] if start == stop: @@ -1058,8 +964,8 @@ def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps): source_level_start_ibox, source_local_exps_view = \ self.local_expansions_view(local_exps, lev) - evt, pot_res = l2p( - queue, + pot_res = l2p( + actx, src_expansions=source_local_exps_view, src_base_ibox=source_level_start_ibox, @@ -1071,14 +977,13 @@ def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps): rscale=self.level_to_rscale(lev), **kwargs) - events.append(evt) for pot_i, pot_res_i in zip(pot, pot_res): assert pot_i is pot_res_i - return (pot, SumpyTimingFuture(queue, events)) + return pot - def finalize_potentials(self, potentials, template_ary): + def finalize_potentials(self, actx: PyOpenCLArrayContext, potentials): return potentials # }}} diff --git a/sumpy/p2e.py b/sumpy/p2e.py index acc7e726..5f7526c7 100644 --- a/sumpy/p2e.py +++ b/sumpy/p2e.py @@ -22,8 +22,8 @@ import numpy as np import loopy as lp -from loopy.version import MOST_RECENT_LANGUAGE_VERSION +from sumpy.array_context import PyOpenCLArrayContext, make_loopy_program from sumpy.tools import KernelCacheMixin, KernelComputation from sumpy.codegen import register_optimization_preambles @@ -42,7 +42,7 @@ """ -# {{{ P2E base class +# {{{ P2EBase: base class class P2EBase(KernelCacheMixin, KernelComputation): """Common input processing for kernel computations. @@ -50,8 +50,7 @@ class P2EBase(KernelCacheMixin, KernelComputation): .. 
automethod:: __init__ """ - def __init__(self, ctx, expansion, kernels=None, - name=None, device=None, strength_usage=None): + def __init__(self, expansion, kernels=None, name=None, strength_usage=None): """ :arg expansion: a subclass of :class:`sumpy.expansion.ExpansionBase` :arg kernels: if not provided, the kernel of the *expansion* is used. @@ -79,10 +78,10 @@ def __init__(self, ctx, expansion, kernels=None, assert txr(knl) == knl assert sxr(knl) == expansion.kernel - KernelComputation.__init__(self, ctx=ctx, target_kernels=[], + KernelComputation.__init__(self, target_kernels=[], source_kernels=kernels, strength_usage=strength_usage, value_dtypes=None, - name=name, device=device) + name=name) self.expansion = expansion self.dim = expansion.dim @@ -122,25 +121,31 @@ def get_optimized_kernel(self, sources_is_obj_array, centers_is_obj_array): knl = register_optimization_preambles(knl, self.device) return knl - def __call__(self, queue, **kwargs): + def __call__(self, actx: PyOpenCLArrayContext, **kwargs): from sumpy.tools import is_obj_array_like sources = kwargs.pop("sources") centers = kwargs.pop("centers") - knl = self.get_cached_kernel_executor( - sources_is_obj_array=is_obj_array_like(sources), - centers_is_obj_array=is_obj_array_like(centers)) # "1" may be passed for rscale, which won't have its type # meaningfully inferred. Make the type of rscale explicit. dtype = centers[0].dtype if is_obj_array_like(centers) else centers.dtype rscale = dtype.type(kwargs.pop("rscale")) - return knl(queue, sources=sources, centers=centers, rscale=rscale, **kwargs) + knl = self.get_cached_kernel_executor( + sources_is_obj_array=is_obj_array_like(sources), + centers_is_obj_array=is_obj_array_like(centers)) + + result = actx.call_loopy( + knl, + sources=sources, centers=centers, rscale=rscale, + **kwargs) + + return result["tgt_expansions"] # }}} -# {{{ P2E from single box (P2M, likely) +# {{{ P2EFromSingleBox: P2E from single box (P2M, likely) class P2EFromSingleBox(P2EBase): """ @@ -155,7 +160,7 @@ def get_kernel(self): ncoeffs = len(self.expansion) loopy_args = self.get_loopy_args() - loopy_knl = lp.make_kernel([ + loopy_knl = make_loopy_program([ "{[isrc_box]: 0 <= isrc_box < nsrc_boxes}", "{[isrc]: isrc_start <= isrc < isrc_end}", "{[idim]: 0 <= idim < dim}", @@ -212,11 +217,9 @@ def get_kernel(self): name=self.name, assumptions="nsrc_boxes>=1", silenced_warnings="write_race(write_expn*)", - default_offset=lp.auto, fixed_parameters={ "dim": self.dim, "nstrengths": self.strength_count, - "ncoeffs": ncoeffs}, - lang_version=MOST_RECENT_LANGUAGE_VERSION) + "ncoeffs": ncoeffs}) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") loopy_knl = lp.tag_inames(loopy_knl, "istrength*:unr") @@ -233,7 +236,7 @@ def get_optimized_kernel(self, sources_is_obj_array, centers_is_obj_array): knl = lp.split_iname(knl, "isrc_box", 16, outer_tag="g.0") return knl - def __call__(self, queue, **kwargs): + def __call__(self, actx: PyOpenCLArrayContext, **kwargs): """ :arg source_boxes: an array of integer indices into *box_source_starts* and *box_source_counts_nonchild*. @@ -253,12 +256,12 @@ def __call__(self, queue, **kwargs): :returns: an array of *tgt_expansions*. """ - return super().__call__(queue, **kwargs) + return super().__call__(actx, **kwargs) # }}} -# {{{ P2E from CSR-like interaction list +# {{{ P2EFromCSR: P2E from CSR-like interaction list class P2EFromCSR(P2EBase): """ @@ -293,7 +296,7 @@ def get_kernel(self): ... 
]) - loopy_knl = lp.make_kernel( + loopy_knl = make_loopy_program( [ "{[itgt_box]: 0 <= itgt_box < ntgt_boxes}", "{[isrc_box]: isrc_box_start <= isrc_box < isrc_box_stop}", @@ -341,15 +344,13 @@ def get_kernel(self): dep=update_result:init_coeffs} end """], - arguments, + kernel_data=arguments, name=self.name, assumptions="ntgt_boxes>=1", silenced_warnings="write_race(write_expn*)", - default_offset=lp.auto, fixed_parameters={"dim": self.dim, "nstrengths": self.strength_count, - "ncoeffs": ncoeffs}, - lang_version=MOST_RECENT_LANGUAGE_VERSION) + "ncoeffs": ncoeffs}) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") loopy_knl = lp.tag_inames(loopy_knl, "istrength*:unr") @@ -366,7 +367,7 @@ def get_optimized_kernel(self, sources_is_obj_array, centers_is_obj_array): knl = lp.split_iname(knl, "itgt_box", 16, outer_tag="g.0") return knl - def __call__(self, queue, **kwargs): + def __call__(self, actx: PyOpenCLArrayContext, **kwargs): """ :arg target_boxes: array of integer indices into *source_box_starts* and *centers*. @@ -385,7 +386,7 @@ def __call__(self, queue, **kwargs): :arg tgt_base_ibox: see :meth:`P2EFromSingleBox.__call__`. :arg tgt_expansion: see :meth:`P2EFromSingleBox.__call__`. """ - return super().__call__(queue, **kwargs) + return super().__call__(actx, **kwargs) # }}} diff --git a/sumpy/p2p.py b/sumpy/p2p.py index 9c4302d6..c2f9ae26 100644 --- a/sumpy/p2p.py +++ b/sumpy/p2p.py @@ -25,11 +25,15 @@ import numpy as np import loopy as lp -from loopy.version import MOST_RECENT_LANGUAGE_VERSION -from sumpy.tools import ( - KernelComputation, KernelCacheMixin, is_obj_array_like) +from pytools.obj_array import make_obj_array + +from sumpy.array_context import PyOpenCLArrayContext, make_loopy_program from sumpy.codegen import register_optimization_preambles +from sumpy.tools import KernelComputation, KernelCacheMixin, is_obj_array_like + +import logging +logger = logging.getLogger(__name__) __doc__ = """ @@ -49,11 +53,11 @@ # LATER: # - Optimization for source == target (postpone) -# {{{ p2p base class +# {{{ P2PBase: base class class P2PBase(KernelCacheMixin, KernelComputation): - def __init__(self, ctx, target_kernels, exclude_self, strength_usage=None, - value_dtypes=None, name=None, device=None, source_kernels=None): + def __init__(self, target_kernels, exclude_self, strength_usage=None, + value_dtypes=None, name=None, source_kernels=None): """ :arg target_kernels: list of :class:`sumpy.kernel.Kernel` instances with only target derivatives. @@ -65,8 +69,9 @@ def __init__(self, ctx, target_kernels, exclude_self, strength_usage=None, Default: all kernels use the same strength. 
""" from pytools import single_valued - from sumpy.kernel import (TargetTransformationRemover, - SourceTransformationRemover) + from sumpy.kernel import ( + TargetTransformationRemover, SourceTransformationRemover) + txr = TargetTransformationRemover() sxr = SourceTransformationRemover() @@ -83,23 +88,19 @@ def __init__(self, ctx, target_kernels, exclude_self, strength_usage=None, base_target_kernel = single_valued(txr(knl) for knl in target_kernels) assert base_source_kernel == base_target_kernel - KernelComputation.__init__(self, ctx=ctx, target_kernels=target_kernels, + KernelComputation.__init__(self, target_kernels=target_kernels, source_kernels=source_kernels, strength_usage=strength_usage, - value_dtypes=value_dtypes, name=name, device=device) - - import pyopencl as cl - self.is_gpu = not (self.device.type & cl.device_type.CPU) + value_dtypes=value_dtypes, name=name) self.exclude_self = exclude_self - - self.dim = single_valued(knl.dim for knl in - list(self.target_kernels) + list(self.source_kernels)) + self.dim = single_valued([ + knl.dim for knl in self.target_kernels + self.source_kernels + ]) def get_cache_key(self): return (type(self).__name__, tuple(self.target_kernels), self.exclude_self, tuple(self.strength_usage), tuple(self.value_dtypes), - tuple(self.source_kernels), - self.device.hashable_model_and_version_identifier) + tuple(self.source_kernels)) def get_loopy_insns_and_result_names(self): from sumpy.symbolic import make_sym_vector @@ -192,14 +193,13 @@ def get_optimized_kernel(self, targets_is_obj_array, sources_is_obj_array): enforce_variable_access_ordered="no_check") knl = register_optimization_preambles(knl, self.device) - return knl # }}} -# {{{ P2P point-interaction calculation +# {{{ P2P: point-interaction calculation class P2P(P2PBase): """Direct applier for P2P interactions.""" @@ -219,7 +219,7 @@ def get_kernel(self): shape="nresults, ntargets", dim_tags="sep,C") ]) - loopy_knl = lp.make_kernel([""" + loopy_knl = make_loopy_program([""" {[itgt, isrc, idim]: \ 0 <= itgt < ntargets and \ 0 <= isrc < nsources and \ @@ -238,15 +238,14 @@ def get_kernel(self): simul_reduce(sum, isrc, pair_result_{iknl}) {{inames=itgt}} """ for iknl in range(len(self.target_kernels))] + ["end"], - arguments, + kernel_data=arguments, assumptions="nsources>=1 and ntargets>=1", name=self.name, - default_offset=lp.auto, fixed_parameters={ "dim": self.dim, "nstrengths": self.strength_count, "nresults": len(self.target_kernels)}, - lang_version=MOST_RECENT_LANGUAGE_VERSION) + ) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") @@ -255,18 +254,28 @@ def get_kernel(self): return loopy_knl - def __call__(self, queue, targets, sources, strength, **kwargs): + def __call__(self, actx: PyOpenCLArrayContext, + targets, sources, strength, **kwargs): knl = self.get_cached_kernel_executor( targets_is_obj_array=is_obj_array_like(targets), sources_is_obj_array=is_obj_array_like(sources)) - return knl(queue, sources=sources, targets=targets, strength=strength, - **kwargs) + from sumpy.codegen import register_optimization_preambles + knl = register_optimization_preambles(knl, actx.queue.device) + + result = actx.call_loopy( + knl, + sources=sources, + targets=targets, + strength=strength, + **kwargs) + + return make_obj_array([result[f"result_s{i}"] for i in range(self.nresults)]) # }}} -# {{{ P2P matrix writer +# {{{ P2PMatrixGenerator: matrix writer class P2PMatrixGenerator(P2PBase): """Generator for P2P interaction matrix entries.""" @@ -287,7 +296,7 @@ def get_kernel(self): for i, dtype in 
enumerate(self.value_dtypes) ]) - loopy_knl = lp.make_kernel([""" + loopy_knl = make_loopy_program([""" {[itgt, isrc, idim]: \ 0 <= itgt < ntargets and \ 0 <= isrc < nsources and \ @@ -308,7 +317,7 @@ def get_kernel(self): assumptions="nsources>=1 and ntargets>=1", name=self.name, fixed_parameters={"dim": self.dim}, - lang_version=MOST_RECENT_LANGUAGE_VERSION) + ) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") @@ -317,17 +326,21 @@ def get_kernel(self): return loopy_knl - def __call__(self, queue, targets, sources, **kwargs): + def __call__(self, actx: PyOpenCLArrayContext, targets, sources, **kwargs): knl = self.get_cached_kernel_executor( targets_is_obj_array=is_obj_array_like(targets), sources_is_obj_array=is_obj_array_like(sources)) - return knl(queue, sources=sources, targets=targets, **kwargs) + from sumpy.codegen import register_optimization_preambles + knl = register_optimization_preambles(knl, actx.queue.device) + + result = actx.call_loopy(knl, sources=sources, targets=targets, **kwargs) + return make_obj_array([result[f"result_{i}"] for i in range(self.nresults)]) # }}} -# {{{ P2P matrix subset generator +# {{{ P2PMatrixSubsetGenerator: matrix subset generator class P2PMatrixSubsetGenerator(P2PBase): """Generator for a subset of P2P interaction matrix entries. @@ -359,7 +372,7 @@ def get_kernel(self): for i, dtype in enumerate(self.value_dtypes) ]) - loopy_knl = lp.make_kernel( + loopy_knl = make_loopy_program( "{[imat, idim]: 0 <= imat < nresult and 0 <= idim < dim}", self.get_kernel_scaling_assignments() # NOTE: itgt, isrc need to always be defined in case a statement @@ -387,7 +400,7 @@ def get_kernel(self): silenced_warnings="write_race(write_p2p*)", name=self.name, fixed_parameters={"dim": self.dim}, - lang_version=MOST_RECENT_LANGUAGE_VERSION) + ) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") loopy_knl = lp.add_dtypes( @@ -415,7 +428,8 @@ def get_optimized_kernel(self, targets_is_obj_array, sources_is_obj_array): return knl - def __call__(self, queue, targets, sources, tgtindices, srcindices, **kwargs): + def __call__(self, actx: PyOpenCLArrayContext, + targets, sources, tgtindices, srcindices, **kwargs): """Evaluate a subset of the P2P matrix interactions. 
:arg targets: target point coordinates, which can be an object @@ -435,24 +449,31 @@ def __call__(self, queue, targets, sources, tgtindices, srcindices, **kwargs): targets_is_obj_array=is_obj_array_like(targets), sources_is_obj_array=is_obj_array_like(sources)) - return knl(queue, - targets=targets, - sources=sources, - tgtindices=tgtindices, - srcindices=srcindices, **kwargs) + from sumpy.codegen import register_optimization_preambles + knl = register_optimization_preambles(knl, actx.queue.device) + + result = actx.call_loopy( + knl, + targets=targets, + sources=sources, + tgtindices=tgtindices, + srcindices=srcindices, **kwargs) + + return make_obj_array([result[f"result_{i}"] for i in range(self.nresults)]) # }}} -# {{{ P2P from CSR-like interaction list +# {{{ P2PFromCSR: P2P from CSR-like interaction list class P2PFromCSR(P2PBase): @property def default_name(self): return "p2p_from_csr" - def get_kernel(self, max_nsources_in_one_box, max_ntargets_in_one_box, - work_items_per_group=32): + def get_kernel(self, + max_nsources_in_one_box: int, max_ntargets_in_one_box: int, *, + work_items_per_group: int = 32, is_gpu: bool = False): loopy_insns, result_names = self.get_loopy_insns_and_result_names() arguments = self.get_default_src_tgt_arguments() \ + [ @@ -473,7 +494,7 @@ def get_kernel(self, max_nsources_in_one_box, max_ntargets_in_one_box, lp.GlobalArg("result", None, shape="noutputs, ntargets", dim_tags="sep,C"), lp.TemporaryVariable("tgt_center", shape=(self.dim,)), - "..." + ... ] domains = [ @@ -485,7 +506,7 @@ def get_kernel(self, max_nsources_in_one_box, max_ntargets_in_one_box, tgt_outer_limit = (max_ntargets_in_one_box - 1) // work_items_per_group - if self.is_gpu: + if is_gpu: arguments += [ lp.TemporaryVariable("local_isrc", shape=(self.dim, max_nsources_in_one_box)), @@ -508,7 +529,7 @@ def get_kernel(self, max_nsources_in_one_box, max_ntargets_in_one_box, # There are two algorithms here because pocl-pthread 1.9 miscompiles # the "gpu" kernel with prefetching. 
- if self.is_gpu: + if is_gpu: instructions = (self.get_kernel_scaling_assignments() + [""" for itgt_box @@ -629,13 +650,15 @@ def get_kernel(self, max_nsources_in_one_box, max_ntargets_in_one_box, end """]) - loopy_knl = lp.make_kernel( + loopy_knl = make_loopy_program( domains, instructions, - arguments, + kernel_data=arguments, assumptions="ntgt_boxes>=1", name=self.name, - silenced_warnings=["write_race(write_csr*)", "write_race(prefetch_src)", + silenced_warnings=[ + "write_race(write_csr*)", + "write_race(prefetch_src)", "write_race(prefetch_charge)"], fixed_parameters={ "dim": self.dim, @@ -645,7 +668,7 @@ def get_kernel(self, max_nsources_in_one_box, max_ntargets_in_one_box, "work_items_per_group": work_items_per_group, "tgt_outer_limit": tgt_outer_limit, "noutputs": len(self.target_kernels)}, - lang_version=MOST_RECENT_LANGUAGE_VERSION) + ) loopy_knl = lp.add_dtypes( loopy_knl, {"nsources": np.int32, "ntargets": np.int32}) @@ -660,11 +683,15 @@ def get_kernel(self, max_nsources_in_one_box, max_ntargets_in_one_box, return loopy_knl - def get_optimized_kernel(self, max_nsources_in_one_box, - max_ntargets_in_one_box, dtype_size): - if not self.is_gpu: + def get_optimized_kernel(self, + max_nsources_in_one_box: int, + max_ntargets_in_one_box: int, + dtype_size: int, + local_mem_size: int, + is_gpu: bool): + if not is_gpu: knl = self.get_kernel(max_nsources_in_one_box, - max_ntargets_in_one_box) + max_ntargets_in_one_box, is_gpu=is_gpu) knl = lp.split_iname(knl, "itgt_box", 4, outer_tag="g.0") knl = self._allow_redundant_execution_of_knl_scaling(knl) else: @@ -673,11 +700,12 @@ def get_optimized_kernel(self, max_nsources_in_one_box, (self.dim + self.strength_count) * dtype_size # multiplying by 2 here to make sure at least 2 work groups # can be scheduled at the same time for latency hiding - nprefetch = (2 * total_local_mem - 1) // self.device.local_mem_size + 1 + nprefetch = (2 * total_local_mem - 1) // local_mem_size + 1 knl = self.get_kernel(max_nsources_in_one_box, max_ntargets_in_one_box, - work_items_per_group=work_items_per_group) + work_items_per_group=work_items_per_group, + is_gpu=is_gpu) knl = lp.tag_inames(knl, {"itgt_box": "g.0", "inner": "l.0"}) knl = lp.set_temporary_address_space(knl, ["local_isrc", "local_isrc_strength"], lp.AddressSpace.LOCAL) @@ -720,14 +748,15 @@ def get_optimized_kernel(self, max_nsources_in_one_box, enforce_variable_access_ordered="no_check") knl = register_optimization_preambles(knl, self.device) - return knl - def __call__(self, queue, **kwargs): + def __call__(self, actx: PyOpenCLArrayContext, **kwargs): + from sumpy.array_context import is_cl_cpu max_nsources_in_one_box = kwargs.pop("max_nsources_in_one_box") max_ntargets_in_one_box = kwargs.pop("max_ntargets_in_one_box") - if self.is_gpu: + is_gpu = not is_cl_cpu(actx) + if is_gpu: dtype_size = kwargs.get("sources")[0].dtype.alignment else: dtype_size = None @@ -736,9 +765,14 @@ def __call__(self, queue, **kwargs): max_nsources_in_one_box=max_nsources_in_one_box, max_ntargets_in_one_box=max_ntargets_in_one_box, dtype_size=dtype_size, - ) + local_mem_size=actx.queue.device.local_mem_size, + is_gpu=is_gpu) + + from sumpy.codegen import register_optimization_preambles + knl = register_optimization_preambles(knl, actx.queue.device) - return knl(queue, **kwargs) + result = actx.call_loopy(knl, **kwargs) + return make_obj_array([result[f"result_s{i}"] for i in range(self.nresults)]) # }}} diff --git a/sumpy/qbx.py b/sumpy/qbx.py index 9c903ab1..d9ee7cd5 100644 --- a/sumpy/qbx.py +++ b/sumpy/qbx.py 
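Before moving on to qbx.py: taken together, the p2p.py changes above remove the PyOpenCL context/queue from both construction and evaluation, and each __call__ now returns an object array of per-kernel results instead of an (event, results) pair. A minimal sketch of the resulting calling convention (not part of the patch; actx, sources, targets and weights are assumed to already exist, with actx a sumpy PyOpenCLArrayContext and the arrays living on it):

    from sumpy import P2P
    from sumpy.kernel import LaplaceKernel

    knl = LaplaceKernel(2)
    # no cl.Context argument anymore; the device is taken from actx at call time
    p2p = P2P((knl,), exclude_self=False)
    # returns an object array with one entry per target kernel
    pot, = p2p(actx, targets, sources, (weights,))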
@@ -23,16 +23,16 @@ THE SOFTWARE. """ +from typing import Tuple, Union import numpy as np import loopy as lp -from loopy.version import MOST_RECENT_LANGUAGE_VERSION -import sumpy.symbolic as sym + from pytools import memoize_method -from pymbolic import parse, var +from pytools.obj_array import make_obj_array -from sumpy.tools import ( - KernelComputation, KernelCacheMixin, is_obj_array_like) +from sumpy.array_context import PyOpenCLArrayContext, make_loopy_program, is_cl_cpu +from sumpy.tools import KernelComputation, KernelCacheMixin, is_obj_array_like import logging logger = logging.getLogger(__name__) @@ -51,7 +51,7 @@ """ -def stringify_expn_index(i): +def stringify_expn_index(i: Union[Tuple[int, ...], int]) -> str: if isinstance(i, tuple): return "_".join(stringify_expn_index(i_i) for i_i in i) else: @@ -64,18 +64,21 @@ def stringify_expn_index(i): # {{{ layer potential computation -# {{{ base class +# {{{ LayerPotentialBase: base class class LayerPotentialBase(KernelCacheMixin, KernelComputation): - def __init__(self, ctx, expansion, strength_usage=None, - value_dtypes=None, name=None, device=None, + def __init__(self, expansion, strength_usage=None, + value_dtypes=None, name=None, source_kernels=None, target_kernels=None): from pytools import single_valued - KernelComputation.__init__(self, ctx=ctx, target_kernels=target_kernels, - strength_usage=strength_usage, source_kernels=source_kernels, - value_dtypes=value_dtypes, name=name, device=device) + KernelComputation.__init__(self, + target_kernels=target_kernels, + source_kernels=source_kernels, + strength_usage=strength_usage, + value_dtypes=value_dtypes, + name=name) self.dim = single_valued(knl.dim for knl in self.target_kernels) self.expansion = expansion @@ -83,8 +86,7 @@ def __init__(self, ctx, expansion, strength_usage=None, def get_cache_key(self): return (type(self).__name__, self.expansion, tuple(self.target_kernels), tuple(self.source_kernels), tuple(self.strength_usage), - tuple(self.value_dtypes), - self.device.hashable_model_and_version_identifier) + tuple(self.value_dtypes)) def _expand(self, sac, avec, bvec, rscale, isrc): from sumpy.symbolic import PymbolicToSympyMapper @@ -108,6 +110,7 @@ def _evaluate(self, sac, avec, bvec, rscale, expansion_nr, coefficients): coefficients = [tgt_knl.postprocess_at_target(coeff, bvec) for coeff in coefficients] + import sumpy.symbolic as sym assigned_coeffs = [ sym.Symbol( sac.assign_unique( @@ -129,12 +132,14 @@ def get_loopy_insns_and_result_names(self): logger.info("compute expansion expressions: start") + import sumpy.symbolic as sym + import pymbolic as prim rscale = sym.Symbol("rscale") - isrc_sym = var("isrc") + isrc_sym = prim.var("isrc") coefficients = self._expand(sac, avec, bvec, rscale, isrc_sym) result_names = [self._evaluate(sac, avec, bvec, rscale, i, coefficients) - for i in range(len(self.target_kernels))] + for i in range(self.nresults)] logger.info("compute expansion expressions: done") @@ -155,10 +160,12 @@ def get_loopy_insns_and_result_names(self): return loopy_insns, result_names def get_strength_or_not(self, isrc, kernel_idx): - return var(f"strength_{self.strength_usage[kernel_idx]}_isrc") + import pymbolic as prim + return prim.var(f"strength_{self.strength_usage[kernel_idx]}_isrc") def get_kernel_exprs(self, result_names): - exprs = [var(name) for i, name in enumerate(result_names)] + import pymbolic as prim + exprs = [prim.var(name) for i, name in enumerate(result_names)] return [lp.Assignment(id=None, assignee=f"pair_result_{i}", @@ -184,12 +191,15 @@ 
def get_default_src_tgt_arguments(self): def get_kernel(self): raise NotImplementedError - def get_optimized_kernel(self, - targets_is_obj_array, sources_is_obj_array, centers_is_obj_array, + def get_optimized_kernel(self, *, + is_cpu: bool, + targets_is_obj_array: bool, + sources_is_obj_array: bool, + centers_is_obj_array: bool, # Used by pytential to override the name of the loop to be # parallelized. In the case of QBX, that's the loop over QBX # targets (not global targets). - itgt_name="itgt"): + itgt_name: str = "itgt"): # FIXME specialize/tune for GPU/CPU loopy_knl = self.get_kernel() @@ -200,9 +210,7 @@ def get_optimized_kernel(self, if centers_is_obj_array: loopy_knl = lp.tag_array_axes(loopy_knl, "center", "sep,C") - import pyopencl as cl - dev = self.context.devices[0] - if dev.type & cl.device_type.CPU: + if is_cpu: loopy_knl = lp.split_iname(loopy_knl, itgt_name, 16, outer_tag="g.0", inner_tag="l.0") loopy_knl = lp.split_iname(loopy_knl, "isrc", 256) @@ -210,8 +218,11 @@ def get_optimized_kernel(self, ["isrc_outer", f"{itgt_name}_inner"]) else: from warnings import warn - warn(f"don't know how to tune layer potential computation for '{dev}'") + warn( + "Do not know how to tune layer potential computation for " + "non-CPU targets") loopy_knl = lp.split_iname(loopy_knl, itgt_name, 128, outer_tag="g.0") + loopy_knl = self._allow_redundant_execution_of_knl_scaling(loopy_knl) return loopy_knl @@ -219,7 +230,7 @@ def get_optimized_kernel(self, # }}} -# {{{ direct applier +# {{{ LayerPotential: direct applier class LayerPotential(LayerPotentialBase): """Direct applier for the layer potential. @@ -246,7 +257,7 @@ def get_kernel(self): for i in range(len(self.target_kernels)) ]) - loopy_knl = lp.make_kernel([""" + loopy_knl = make_loopy_program([""" {[itgt, isrc, idim]: \ 0 <= itgt < ntargets and \ 0 <= isrc < nsources and \ @@ -267,13 +278,12 @@ def get_kernel(self): """.format(i=iknl) for iknl in range(len(self.target_kernels))] + ["end"], - arguments, + kernel_data=arguments, name=self.name, assumptions="ntargets>=1 and nsources>=1", - default_offset=lp.auto, silenced_warnings="write_race(write_lpot*)", fixed_parameters={"dim": self.dim}, - lang_version=MOST_RECENT_LANGUAGE_VERSION) + ) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") for knl in self.target_kernels + self.source_kernels: @@ -281,7 +291,8 @@ def get_kernel(self): return loopy_knl - def __call__(self, queue, targets, sources, centers, strengths, expansion_radii, + def __call__(self, actx: PyOpenCLArrayContext, + targets, sources, centers, strengths, expansion_radii, **kwargs): """ :arg strengths: are required to have area elements and quadrature weights @@ -289,6 +300,7 @@ def __call__(self, queue, targets, sources, centers, strengths, expansion_radii, """ knl = self.get_cached_kernel_executor( + is_cpu=is_cl_cpu(actx), targets_is_obj_array=is_obj_array_like(targets), sources_is_obj_array=is_obj_array_like(sources), centers_is_obj_array=is_obj_array_like(centers)) @@ -296,13 +308,20 @@ def __call__(self, queue, targets, sources, centers, strengths, expansion_radii, for i, dens in enumerate(strengths): kwargs[f"strength_{i}"] = dens - return knl(queue, sources=sources, targets=targets, center=centers, - expansion_radii=expansion_radii, **kwargs) + result = actx.call_loopy( + knl, + sources=sources, + targets=targets, + center=centers, + expansion_radii=expansion_radii, + **kwargs) + + return make_obj_array([result[f"result_{i}"] for i in range(self.nresults)]) # }}} -# {{{ matrix writer +# {{{ 
LayerPotentialMatrixGenerator: matrix writer class LayerPotentialMatrixGenerator(LayerPotentialBase): """Generator for layer potential matrix entries.""" @@ -326,7 +345,7 @@ def get_kernel(self): for i, dtype in enumerate(self.value_dtypes) ]) - loopy_knl = lp.make_kernel([""" + loopy_knl = make_loopy_program([""" {[itgt, isrc, idim]: \ 0 <= itgt < ntargets and \ 0 <= isrc < nsources and \ @@ -345,12 +364,11 @@ def get_kernel(self): """.format(i=iknl) for iknl in range(len(self.target_kernels))] + ["end"], - arguments, + kernel_data=arguments, name=self.name, assumptions="ntargets>=1 and nsources>=1", - default_offset=lp.auto, fixed_parameters={"dim": self.dim}, - lang_version=MOST_RECENT_LANGUAGE_VERSION) + ) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") for expn in self.source_kernels + self.target_kernels: @@ -358,19 +376,28 @@ def get_kernel(self): return loopy_knl - def __call__(self, queue, targets, sources, centers, expansion_radii, **kwargs): + def __call__(self, actx: PyOpenCLArrayContext, + targets, sources, centers, expansion_radii, **kwargs): knl = self.get_cached_kernel_executor( + is_cpu=is_cl_cpu(actx), targets_is_obj_array=is_obj_array_like(targets), sources_is_obj_array=is_obj_array_like(sources), centers_is_obj_array=is_obj_array_like(centers)) - return knl(queue, sources=sources, targets=targets, center=centers, - expansion_radii=expansion_radii, **kwargs) + result = actx.call_loopy( + knl, + sources=sources, + targets=targets, + center=centers, + expansion_radii=expansion_radii, + **kwargs) + + return make_obj_array([result[f"result_{i}"] for i in range(self.nresults)]) # }}} -# {{{ matrix subset generator +# {{{ LayerPotentialMatrixSubsetGenerator: matrix subset generator class LayerPotentialMatrixSubsetGenerator(LayerPotentialBase): """Generator for a subset of the layer potential matrix entries. @@ -401,7 +428,7 @@ def get_kernel(self): for i, dtype in enumerate(self.value_dtypes) ]) - loopy_knl = lp.make_kernel([ + loopy_knl = make_loopy_program([ "{[imat, idim]: 0 <= imat < nresult and 0 <= idim < dim}" ], self.get_kernel_scaling_assignments() @@ -424,13 +451,12 @@ def get_kernel(self): """.format(i=iknl) for iknl in range(len(self.target_kernels))] + ["end"], - arguments, + kernel_data=arguments, name=self.name, assumptions="nresult>=1", - default_offset=lp.auto, silenced_warnings="write_race(write_lpot*)", fixed_parameters={"dim": self.dim}, - lang_version=MOST_RECENT_LANGUAGE_VERSION) + ) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") loopy_knl = lp.add_dtypes( @@ -456,8 +482,9 @@ def get_optimized_kernel(self, loopy_knl = self._allow_redundant_execution_of_knl_scaling(loopy_knl) return loopy_knl - def __call__(self, queue, targets, sources, centers, expansion_radii, - tgtindices, srcindices, **kwargs): + def __call__(self, actx: PyOpenCLArrayContext, + targets, sources, centers, expansion_radii, + tgtindices, srcindices, **kwargs): """Evaluate a subset of the QBX matrix interactions. 
:arg targets: target point coordinates, which can be an object @@ -484,13 +511,16 @@ def __call__(self, queue, targets, sources, centers, expansion_radii, sources_is_obj_array=is_obj_array_like(sources), centers_is_obj_array=is_obj_array_like(centers)) - return knl(queue, - sources=sources, - targets=targets, - center=centers, - expansion_radii=expansion_radii, - tgtindices=tgtindices, - srcindices=srcindices, **kwargs) + result = actx.call_loopy( + knl, + sources=sources, + targets=targets, + center=centers, + expansion_radii=expansion_radii, + tgtindices=tgtindices, + srcindices=srcindices, **kwargs) + + return make_obj_array([result[f"result_{i}"] for i in range(self.nresults)]) # }}} @@ -608,80 +638,84 @@ def __init__(self, data_args, dim, density_var_name, @property @memoize_method def density(self): + import pymbolic as prim self.arguments[self.density_var_name] = \ lp.GlobalArg(self.density_var_name, self.density_dtype, shape="ntargets", order="C") - return parse(f"{self.density_var_name}[itgt]") + return prim.parse(f"{self.density_var_name}[itgt]") @property @memoize_method def density_prime(self): + import pymbolic as prim prime_var_name = f"{self.density_var_name}_prime" self.arguments[prime_var_name] = ( lp.GlobalArg(prime_var_name, self.density_dtype, shape="ntargets", order="C")) - return parse(f"{prime_var_name}[itgt]") + return prim.parse(f"{prime_var_name}[itgt]") @property @memoize_method def side(self): + import pymbolic as prim self.arguments["side"] = ( lp.GlobalArg("side", self.geometry_dtype, shape="ntargets")) - return parse("side[itgt]") + return prim.parse("side[itgt]") @property @memoize_method def normal(self): + import pymbolic as prim self.arguments["normal"] = ( lp.GlobalArg("normal", self.geometry_dtype, shape=("ntargets", self.dim), order="C")) - from pytools.obj_array import make_obj_array return make_obj_array([ - parse(f"normal[itgt, {i}]") + prim.parse(f"normal[itgt, {i}]") for i in range(self.dim)]) @property @memoize_method def tangent(self): + import pymbolic as prim self.arguments["tangent"] = ( lp.GlobalArg("tangent", self.geometry_dtype, shape=("ntargets", self.dim), order="C")) - from pytools.obj_array import make_obj_array return make_obj_array([ - parse(f"tangent[itgt, {i}]") + prim.parse(f"tangent[itgt, {i}]") for i in range(self.dim)]) @property @memoize_method def mean_curvature(self): + import pymbolic as prim self.arguments["mean_curvature"] = ( lp.GlobalArg("mean_curvature", self.geometry_dtype, shape="ntargets", order="C")) - return parse("mean_curvature[itgt]") + return prim.parse("mean_curvature[itgt]") @property @memoize_method def src_derivative_dir(self): + import pymbolic as prim self.arguments["src_derivative_dir"] = ( lp.GlobalArg("src_derivative_dir", self.geometry_dtype, shape=("ntargets", self.dim), order="C")) - from pytools.obj_array import make_obj_array return make_obj_array([ - parse(f"src_derivative_dir[itgt, {i}]") + prim.parse(f"src_derivative_dir[itgt, {i}]") for i in range(self.dim)]) @property @memoize_method def tgt_derivative_dir(self): + import pymbolic as prim self.arguments["tgt_derivative_dir"] = ( lp.GlobalArg("tgt_derivative_dir", self.geometry_dtype, shape=("ntargets", self.dim), order="C")) - from pytools.obj_array import make_obj_array return make_obj_array([ - parse(f"tgt_derivative_dir[itgt, {i}]") + prim.parse(f"tgt_derivative_dir[itgt, {i}]") for i in range(self.dim)]) # }}} diff --git a/sumpy/tools.py b/sumpy/tools.py index 13acbbab..6776ede7 100644 --- a/sumpy/tools.py +++ b/sumpy/tools.py @@ -38,6 
+38,7 @@ from pytools.tag import Tag, tag_dataclass import sumpy.symbolic as sym +from sumpy.array_context import PyOpenCLArrayContext, make_loopy_program if TYPE_CHECKING: import numpy @@ -205,30 +206,6 @@ def build_matrix(op, dtype=None, shape=None): return mat -def vector_to_device(queue, vec): - from pyopencl.array import to_device - from pytools.obj_array import obj_array_vectorize - - def to_dev(ary): - return to_device(queue, ary) - - return obj_array_vectorize(to_dev, vec) - - -def vector_from_device(queue, vec): - from pytools.obj_array import obj_array_vectorize - - def from_dev(ary): - from numbers import Number - if isinstance(ary, (np.number, Number)): - # zero, most likely - return ary - - return ary.get(queue=queue) - - return obj_array_vectorize(from_dev, vec) - - def _merge_kernel_arguments(dictionary, arg): # Check for strict equality until there's a usecase if dictionary.setdefault(arg.name, arg) != arg: @@ -280,13 +257,12 @@ class KernelComputation(ABC): .. automethod:: get_kernel """ - def __init__(self, ctx: Any, + def __init__(self, target_kernels: List["Kernel"], source_kernels: List["Kernel"], strength_usage: Optional[List[int]] = None, value_dtypes: Optional[List["numpy.dtype[Any]"]] = None, - name: Optional[str] = None, - device: Optional[Any] = None) -> None: + name: Optional[str] = None) -> None: """ :arg target_kernels: list of :class:`~sumpy.kernel.Kernel` instances, with :class:`sumpy.kernel.DirectionalTargetDerivative` as @@ -327,12 +303,6 @@ def __init__(self, ctx: Any, # }}} - if device is None: - device = ctx.devices[0] - - self.context = ctx - self.device = device - self.source_kernels = tuple(source_kernels) self.target_kernels = tuple(target_kernels) self.value_dtypes = value_dtypes @@ -341,6 +311,10 @@ def __init__(self, ctx: Any, self.name = name or self.default_name + @property + def nresults(self): + return len(self.target_kernels) + @property @abstractmethod def default_name(self): @@ -734,7 +708,6 @@ def loopy_fft(shape, inverse, complex_dtype, index_dtype=None, factors.append(N1) nfft = n - broadcast_dims = tuple(var(f"j{d}") for d in range(len(shape) - 1)) domains = [ @@ -864,12 +837,11 @@ def loopy_fft(shape, inverse, complex_dtype, index_dtype=None, else: name = f"fft_{n}" - knl = lp.make_kernel( + knl = make_loopy_program( domains, insns, kernel_data=kernel_data, name=name, fixed_parameters=fixed_parameters, - lang_version=lp.MOST_RECENT_LANGUAGE_VERSION, index_dtype=index_dtype, ) @@ -928,7 +900,7 @@ def _get_fft_backend(queue: "pyopencl.CommandQueue") -> FFTBackend: def get_opencl_fft_app( - queue: "pyopencl.CommandQueue", + actx: PyOpenCLArrayContext, shape: Tuple[int, ...], dtype: "numpy.dtype[Any]", inverse: bool) -> Any: @@ -938,22 +910,24 @@ def get_opencl_fft_app( assert dtype.type in (np.float32, np.float64, np.complex64, np.complex128) - backend = _get_fft_backend(queue) + backend = _get_fft_backend(actx.queue) if backend == FFTBackend.loopy: return loopy_fft(shape, inverse=inverse, complex_dtype=dtype.type), backend elif backend == FFTBackend.pyvkfft: from pyvkfft.opencl import VkFFTApp - app = VkFFTApp(shape=shape, dtype=dtype, queue=queue, ndim=1, inplace=False) + app = VkFFTApp( + shape=shape, dtype=dtype, + queue=actx.queue, ndim=1, inplace=False) return app, backend else: raise RuntimeError(f"Unsupported FFT backend {backend}") def run_opencl_fft( + actx: PyOpenCLArrayContext, fft_app: Tuple[Any, FFTBackend], - queue: "pyopencl.CommandQueue", - input_vec: Any, + input_vec: Any, *, inverse: bool = False, wait_for: 
List["pyopencl.Event"] = None) -> Tuple["pyopencl.Event", Any]: """Runs an FFT on input_vec and returns a :class:`MarkerBasedProfilingEvent` @@ -964,15 +938,15 @@ def run_opencl_fft( app, backend = fft_app if backend == FFTBackend.loopy: - evt, (output_vec,) = app(queue, y=input_vec, wait_for=wait_for) - return (evt, output_vec) + evt, output_vec = app(actx.queue, y=input_vec, wait_for=wait_for) + return (evt, output_vec["x"]) elif backend == FFTBackend.pyvkfft: if wait_for is None: wait_for = [] import pyopencl as cl - import pyopencl.array as cla + queue = actx.queue if queue.device.platform.name == "NVIDIA CUDA": # NVIDIA OpenCL gives wrong event profile values with wait_for # Not passing wait_for will wait for all events queued before @@ -988,7 +962,7 @@ def run_opencl_fft( if app.inplace: raise RuntimeError("inplace fft is not supported") else: - output_vec = cla.empty_like(input_vec, queue) + output_vec = actx.np.empty_like(input_vec) # FIXME: use the public API once # https://github.com/vincefn/pyvkfft/pull/17 is in @@ -999,7 +973,7 @@ def run_opencl_fft( meth = _vkfft_opencl.fft meth(app.app, int(input_vec.data.int_ptr), - int(output_vec.data.int_ptr), int(queue.int_ptr)) + int(output_vec.data.int_ptr), int(actx.queue.int_ptr)) if queue.device.platform.name == "NVIDIA CUDA": end_evt = cl.enqueue_marker(queue) diff --git a/sumpy/toys.py b/sumpy/toys.py index c71f6107..09b72da2 100644 --- a/sumpy/toys.py +++ b/sumpy/toys.py @@ -25,26 +25,23 @@ THE SOFTWARE. """ -from typing import Sequence, Union, Optional, TYPE_CHECKING - -from pytools import memoize_method from numbers import Number from functools import partial -from sumpy.kernel import TargetTransformationRemover +from typing import Any, Sequence, Union, Optional, TYPE_CHECKING -if TYPE_CHECKING: - from sumpy.kernel import Kernel - from sumpy.visualization import FieldPlotter - import pyopencl # noqa: F401 +import numpy as np -import numpy as np # noqa: F401 -import loopy as lp # noqa: F401 -import pyopencl as cl -import pyopencl.array +from pytools import memoize_method +from sumpy.kernel import TargetTransformationRemover +from sumpy.array_context import PyOpenCLArrayContext import logging logger = logging.getLogger(__name__) +if TYPE_CHECKING: + from sumpy.kernel import Kernel + from sumpy.visualization import FieldPlotter + __doc__ = """ This module provides a convenient interface for numerical experiments with @@ -97,27 +94,29 @@ class ToyContext: .. 
automethod:: __init__ """ - def __init__(self, cl_context: pyopencl.Context, kernel: Kernel, + def __init__(self, kernel: "Kernel", mpole_expn_class=None, local_expn_class=None, expansion_factory=None, extra_source_kwargs=None, - extra_kernel_kwargs=None, m2l_use_fft=None): - self.cl_context = cl_context - self.queue = cl.CommandQueue(self.cl_context) + extra_kernel_kwargs=None, + m2l_use_fft=None): self.kernel = kernel - self.no_target_deriv_kernel = TargetTransformationRemover()(kernel) if expansion_factory is None: from sumpy.expansion import DefaultExpansionFactory expansion_factory = DefaultExpansionFactory() + if mpole_expn_class is None: - mpole_expn_class = \ - expansion_factory.get_multipole_expansion_class(kernel) + mpole_expn_class = ( + expansion_factory.get_multipole_expansion_class(kernel)) + if local_expn_class is None: - from sumpy.expansion.m2l import (NonFFTM2LTranslationClassFactory, - FFTM2LTranslationClassFactory) + from sumpy.expansion.m2l import ( + FFTM2LTranslationClassFactory, + NonFFTM2LTranslationClassFactory) + if m2l_use_fft: m2l_translation_class_factory = FFTM2LTranslationClassFactory() else: @@ -151,40 +150,40 @@ def __init__(self, cl_context: pyopencl.Context, kernel: Kernel, @memoize_method def get_p2p(self): from sumpy.p2p import P2P - return P2P(self.cl_context, (self.kernel,), exclude_self=False) + return P2P((self.kernel,), exclude_self=False) @memoize_method def get_p2m(self, order): from sumpy import P2EFromSingleBox - return P2EFromSingleBox(self.cl_context, + return P2EFromSingleBox( self.mpole_expn_class(self.no_target_deriv_kernel, order), kernels=(self.kernel,)) @memoize_method def get_p2l(self, order): from sumpy import P2EFromSingleBox - return P2EFromSingleBox(self.cl_context, + return P2EFromSingleBox( self.local_expn_class(self.no_target_deriv_kernel, order), kernels=(self.kernel,)) @memoize_method def get_m2p(self, order): from sumpy import E2PFromSingleBox - return E2PFromSingleBox(self.cl_context, + return E2PFromSingleBox( self.mpole_expn_class(self.no_target_deriv_kernel, order), (self.kernel,)) @memoize_method def get_l2p(self, order): from sumpy import E2PFromSingleBox - return E2PFromSingleBox(self.cl_context, + return E2PFromSingleBox( self.local_expn_class(self.no_target_deriv_kernel, order), (self.kernel,)) @memoize_method def get_m2m(self, from_order, to_order): from sumpy import E2EFromCSR - return E2EFromCSR(self.cl_context, + return E2EFromCSR( self.mpole_expn_class(self.no_target_deriv_kernel, from_order), self.mpole_expn_class(self.no_target_deriv_kernel, to_order)) @@ -205,14 +204,14 @@ def get_m2l(self, from_order, to_order): m2l_class = M2LUsingTranslationClassesDependentData else: m2l_class = E2EFromCSR - return m2l_class(self.cl_context, + return m2l_class( self.mpole_expn_class(self.no_target_deriv_kernel, from_order), self.local_expn_class(self.no_target_deriv_kernel, to_order)) @memoize_method def get_m2l_translation_class_dependent_data_kernel(self, from_order, to_order): from sumpy import M2LGenerateTranslationClassesDependentData - return M2LGenerateTranslationClassesDependentData(self.cl_context, + return M2LGenerateTranslationClassesDependentData( self.mpole_expn_class(self.no_target_deriv_kernel, from_order), self.local_expn_class(self.no_target_deriv_kernel, to_order)) @@ -225,21 +224,21 @@ def get_m2l_expansion_size(self, from_order, to_order): @memoize_method def get_m2l_preprocess_mpole_kernel(self, from_order, to_order): from sumpy import M2LPreprocessMultipole - return 
M2LPreprocessMultipole(self.cl_context, + return M2LPreprocessMultipole( self.mpole_expn_class(self.no_target_deriv_kernel, from_order), self.local_expn_class(self.no_target_deriv_kernel, to_order)) @memoize_method def get_m2l_postprocess_local_kernel(self, from_order, to_order): from sumpy import M2LPostprocessLocal - return M2LPostprocessLocal(self.cl_context, + return M2LPostprocessLocal( self.mpole_expn_class(self.no_target_deriv_kernel, from_order), self.local_expn_class(self.no_target_deriv_kernel, to_order)) @memoize_method def get_l2l(self, from_order, to_order): from sumpy import E2EFromCSR - return E2EFromCSR(self.cl_context, + return E2EFromCSR( self.local_expn_class(self.no_target_deriv_kernel, from_order), self.local_expn_class(self.no_target_deriv_kernel, to_order)) @@ -248,60 +247,57 @@ def get_l2l(self, from_order, to_order): # {{{ helpers -def _p2e(psource, center, rscale, order, p2e, expn_class, expn_kwargs): +def _p2e(actx, psource, center, rscale, order, p2e, expn_class, expn_kwargs): toy_ctx = psource.toy_ctx - queue = toy_ctx.queue - source_boxes = cl.array.to_device( - queue, np.array([0], dtype=np.int32)) - box_source_starts = cl.array.to_device( - queue, np.array([0], dtype=np.int32)) - box_source_counts_nonchild = cl.array.to_device( - queue, np.array([psource.points.shape[-1]], dtype=np.int32)) + source_boxes = actx.from_numpy( + np.array([0], dtype=np.int32)) + box_source_starts = actx.from_numpy( + np.array([0], dtype=np.int32)) + box_source_counts_nonchild = actx.from_numpy( + np.array([psource.points.shape[-1]], dtype=np.int32)) center = np.asarray(center) - centers = cl.array.to_device( - queue, + centers = actx.from_numpy( np.array(center, dtype=np.float64).reshape(toy_ctx.kernel.dim, 1)) - evt, (coeffs,) = p2e(toy_ctx.queue, + coeffs, = p2e( + actx, source_boxes=source_boxes, box_source_starts=box_source_starts, box_source_counts_nonchild=box_source_counts_nonchild, centers=centers, - sources=cl.array.to_device(queue, psource.points), - strengths=(cl.array.to_device(queue, psource.weights),), + sources=actx.from_numpy(psource.points), + strengths=(actx.from_numpy(psource.weights),), rscale=rscale, nboxes=1, tgt_base_ibox=0, **toy_ctx.extra_source_and_kernel_kwargs) - return expn_class(toy_ctx, center, rscale, order, coeffs[0].get(queue), + return expn_class( + toy_ctx, center, rscale, order, + actx.to_numpy(coeffs[0]), derived_from=psource, **expn_kwargs) -def _e2p(psource, targets, e2p): +def _e2p(actx, psource, targets, e2p): toy_ctx = psource.toy_ctx - queue = toy_ctx.queue ntargets = targets.shape[-1] - boxes = cl.array.to_device( - queue, np.array([0], dtype=np.int32)) - box_target_starts = cl.array.to_device( - queue, np.array([0], dtype=np.int32)) - box_target_counts_nonchild = cl.array.to_device( - queue, np.array([ntargets], dtype=np.int32)) - - centers = cl.array.to_device( - queue, + boxes = actx.from_numpy( + np.array([0], dtype=np.int32)) + box_target_starts = actx.from_numpy( + np.array([0], dtype=np.int32)) + box_target_counts_nonchild = actx.from_numpy( + np.array([ntargets], dtype=np.int32)) + + centers = actx.from_numpy( np.array(psource.center, dtype=np.float64).reshape(toy_ctx.kernel.dim, 1)) from pytools.obj_array import make_obj_array - from sumpy.tools import vector_to_device - - coeffs = cl.array.to_device(queue, np.array([psource.coeffs])) - evt, (pot,) = e2p( - toy_ctx.queue, + coeffs = actx.from_numpy(np.array([psource.coeffs])) + pot, = e2p( + actx, src_expansions=coeffs, src_base_ibox=0, target_boxes=boxes, @@ -309,26 +305,25 
@@ def _e2p(psource, targets, e2p): box_target_counts_nonchild=box_target_counts_nonchild, centers=centers, rscale=psource.rscale, - targets=vector_to_device(queue, make_obj_array(targets)), + targets=actx.from_numpy(make_obj_array(targets)), **toy_ctx.extra_kernel_kwargs) - return pot.get(queue) + return actx.to_numpy(pot) -def _e2e(psource, to_center, to_rscale, to_order, e2e, expn_class, expn_kwargs, +def _e2e(actx: PyOpenCLArrayContext, + psource, to_center, to_rscale, to_order, e2e, expn_class, expn_kwargs, extra_kernel_kwargs): toy_ctx = psource.toy_ctx - queue = toy_ctx.queue - target_boxes = cl.array.to_device( - queue, np.array([1], dtype=np.int32)) - src_box_starts = cl.array.to_device( - queue, np.array([0, 1], dtype=np.int32)) - src_box_lists = cl.array.to_device( - queue, np.array([0], dtype=np.int32)) + target_boxes = actx.from_numpy( + np.array([1], dtype=np.int32)) + src_box_starts = actx.from_numpy( + np.array([0, 1], dtype=np.int32)) + src_box_lists = actx.from_numpy( + np.array([0], dtype=np.int32)) - centers = cl.array.to_device( - queue, + centers = actx.from_numpy( np.array( [ # box 0: source @@ -340,9 +335,9 @@ def _e2e(psource, to_center, to_rscale, to_order, e2e, expn_class, expn_kwargs, dtype=np.float64).T.copy() ) - coeffs = cl.array.to_device(queue, np.array([psource.coeffs])) + coeffs = actx.from_numpy(np.array([psource.coeffs])) args = { - "queue": toy_ctx.queue, + "actx": actx, "src_expansions": coeffs, "src_base_ibox": 0, "tgt_base_ibox": 0, @@ -357,20 +352,19 @@ def _e2e(psource, to_center, to_rscale, to_order, e2e, expn_class, expn_kwargs, **toy_ctx.extra_kernel_kwargs, } - evt, (to_coeffs,) = e2e(**args) - + to_coeffs, = e2e(**args) return expn_class( - toy_ctx, to_center, to_rscale, to_order, to_coeffs[1].get(queue), + toy_ctx, to_center, to_rscale, to_order, + actx.to_numpy(to_coeffs[1]), derived_from=psource, **expn_kwargs) -def _m2l(psource, to_center, to_rscale, to_order, e2e, expn_class, expn_kwargs, +def _m2l(actx: PyOpenCLArrayContext, + psource, to_center, to_rscale, to_order, e2e, expn_class, expn_kwargs, translation_classes_kwargs): toy_ctx = psource.toy_ctx - queue = toy_ctx.queue - - coeffs = cl.array.to_device(queue, np.array([psource.coeffs])) + coeffs = actx.from_numpy(np.array([psource.coeffs])) m2l_use_translation_classes_dependent_data = \ toy_ctx.use_translation_classes_dependent_data() @@ -381,36 +375,36 @@ def _m2l(psource, to_center, to_rscale, to_order, e2e, expn_class, expn_kwargs, expn_size = translation_classes_kwargs["m2l_expn_size"] # Preprocess the mpole expansion - preprocessed_src_expansions = cl.array.zeros(queue, (1, expn_size), - dtype=np.complex128) - evt, _ = preprocess_kernel(queue, + preprocessed_src_expansions = actx.zeros((1, expn_size), dtype=np.complex128) + preprocess_kernel( + actx, src_expansions=coeffs, preprocessed_src_expansions=preprocessed_src_expansions, src_rscale=np.float64(psource.rscale), **toy_ctx.extra_kernel_kwargs) if toy_ctx.use_fft: - from sumpy.tools import (run_opencl_fft, get_opencl_fft_app, - get_native_event) - fft_app = get_opencl_fft_app(queue, (expn_size,), + from sumpy.tools import get_opencl_fft_app, run_opencl_fft + + fft_app = get_opencl_fft_app(actx, (expn_size,), dtype=preprocessed_src_expansions.dtype, inverse=False) - ifft_app = get_opencl_fft_app(queue, (expn_size,), + ifft_app = get_opencl_fft_app(actx, (expn_size,), dtype=preprocessed_src_expansions.dtype, inverse=True) - evt, preprocessed_src_expansions = run_opencl_fft(fft_app, queue, - preprocessed_src_expansions, 
inverse=False, wait_for=[evt]) + preprocessed_src_expansions = run_opencl_fft(actx, fft_app, + preprocessed_src_expansions, inverse=False) # Compute translation classes data - m2l_translation_classes_lists = cl.array.to_device(queue, - np.array([0], dtype=np.int32)) + m2l_translation_classes_lists = ( + actx.from_numpy(np.array([0], dtype=np.int32))) dist = np.array(to_center - psource.center, dtype=np.float64) dim = toy_ctx.kernel.dim - m2l_translation_vectors = cl.array.to_device(queue, dist.reshape(dim, 1)) - m2l_translation_classes_dependent_data = cl.array.zeros(queue, - (1, expn_size), dtype=np.complex128) + m2l_translation_vectors = actx.from_numpy(dist.reshape(dim, 1)) + m2l_translation_classes_dependent_data = ( + actx.zeros((1, expn_size), dtype=np.complex128)) - evt, _ = data_kernel( - queue, + data_kernel( + actx, src_rscale=np.float64(psource.rscale), ntranslation_classes=1, translation_classes_level_start=0, @@ -418,15 +412,15 @@ def _m2l(psource, to_center, to_rscale, to_order, e2e, expn_class, expn_kwargs, m2l_translation_classes_dependent_data=( m2l_translation_classes_dependent_data), ntranslation_vectors=1, - wait_for=[get_native_event(evt)], **toy_ctx.extra_kernel_kwargs) if toy_ctx.use_fft: - evt, m2l_translation_classes_dependent_data = run_opencl_fft(fft_app, - queue, m2l_translation_classes_dependent_data, inverse=False, - wait_for=[evt]) + m2l_translation_classes_dependent_data = run_opencl_fft( + actx, fft_app, + m2l_translation_classes_dependent_data, + inverse=False) - ret = _e2e(psource, to_center, to_rscale, to_order, + ret = _e2e(actx, psource, to_center, to_rscale, to_order, e2e, expn_class, expn_kwargs, { "src_expansions": preprocessed_src_expansions, @@ -438,28 +432,32 @@ def _m2l(psource, to_center, to_rscale, to_order, e2e, expn_class, expn_kwargs, ) # Postprocess the local expansion - local_before = cl.array.to_device(queue, np.array([ret.coeffs])) - to_coeffs = cl.array.zeros(queue, (1, len(data_kernel.tgt_expansion)), - dtype=coeffs.dtype) + local_before = actx.from_numpy(np.array([ret.coeffs])) + to_coeffs = actx.zeros((1, len(data_kernel.tgt_expansion)), + dtype=coeffs.dtype) if toy_ctx.use_fft: - evt, local_before = run_opencl_fft(ifft_app, queue, - local_before, inverse=True, wait_for=[get_native_event(evt)]) + evt, local_before = run_opencl_fft( + actx, ifft_app, + local_before, inverse=True) - evt, _ = postprocess_kernel(queue=queue, + postprocess_kernel( + actx, tgt_expansions_before_postprocessing=local_before, tgt_expansions=to_coeffs, src_rscale=np.float64(psource.rscale), tgt_rscale=np.float64(to_rscale), - wait_for=[get_native_event(evt)], **toy_ctx.extra_kernel_kwargs) return expn_class( - toy_ctx, to_center, to_rscale, to_order, to_coeffs.get(queue)[0], + toy_ctx, to_center, to_rscale, to_order, + actx.to_numpy(to_coeffs)[0], derived_from=psource, **expn_kwargs) else: - ret = _e2e(psource, to_center, to_rscale, to_order, e2e, expn_class, - expn_kwargs, {}) + ret = _e2e( + actx, + psource, to_center, to_rscale, to_order, e2e, expn_class, + expn_kwargs, {}) return ret @@ -488,7 +486,7 @@ class PotentialSource: def __init__(self, toy_ctx: ToyContext): self.toy_ctx = toy_ctx - def eval(self, targets: np.ndarray) -> np.ndarray: + def eval(self, actx: PyOpenCLArrayContext, targets: np.ndarray) -> np.ndarray: """ :param targets: An array of shape ``(dim, ntargets)``. :returns: an array of shape ``(ntargets,)``. 
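The surrounding toys.py hunks apply the same migration to the expansion toys: every helper that launches device work takes the array context as its first argument, host/device transfers go through actx.from_numpy/actx.to_numpy, and eval() now also receives the array context. A rough sketch of the user-facing flow under the new signatures (illustrative only; actx, psource and targets are assumed to exist, with psource an existing PointSources and targets a numpy array of shape (dim, ntargets)):

    from sumpy.toys import multipole_expand, local_expand, l_inf

    mexp = multipole_expand(actx, psource, [0.0, 0.0], order=4)   # order is keyword-only now
    lexp = local_expand(actx, mexp, [3.0, 0.0], order=3)
    pot = lexp.eval(actx, targets)                                # returns a numpy array
    err = l_inf(actx, lexp - psource, 1.0, center=lexp.center)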
@@ -513,8 +511,7 @@ def __sub__(self, other: Union[Number, np.number, PotentialSource]) -> PotentialSource: return self.__add__(-other) - def __rsub__(self, - other: Union[Number, np.number, PotentialSource] + def __rsub__(self, other: Union[Number, np.number, PotentialSource] ) -> PotentialSource: return (-self).__add__(other) @@ -541,7 +538,7 @@ def __init__(self, toy_ctx: ToyContext, value): super().__init__(toy_ctx) self.value = np.array(value) - def eval(self, targets: np.ndarray) -> np.ndarray: + def eval(self, actx: PyOpenCLArrayContext, targets: np.ndarray) -> np.ndarray: pot = np.empty(targets.shape[-1], dtype=self.value.dtype) pot.fill(self.value) return pot @@ -555,13 +552,14 @@ class OneOnBallPotential(PotentialSource): .. automethod:: __init__ """ + def __init__(self, toy_ctx: ToyContext, center: np.ndarray, radius: float) -> None: super().__init__(toy_ctx) self.center = np.asarray(center) self.radius = radius - def eval(self, targets: np.ndarray) -> np.ndarray: + def eval(self, actx: PyOpenCLArrayContext, targets: np.ndarray) -> np.ndarray: dist_vec = targets - self.center[:, np.newaxis] return (np.sum(dist_vec**2, axis=0) < self.radius**2).astype(np.float64) @@ -572,6 +570,7 @@ class HalfspaceOnePotential(PotentialSource): .. automethod:: __init__ """ + def __init__(self, toy_ctx: ToyContext, center: np.ndarray, axis: int, side: int = 1) -> None: super().__init__(toy_ctx) @@ -579,7 +578,7 @@ def __init__(self, toy_ctx: ToyContext, center: np.ndarray, self.axis = axis self.side = side - def eval(self, targets: np.ndarray) -> np.ndarray: + def eval(self, actx: PyOpenCLArrayContext, targets: np.ndarray) -> np.ndarray: return ( (self.side*(targets[self.axis] - self.center[self.axis])) >= 0 ).astype(np.float64) @@ -605,16 +604,15 @@ def __init__(self, self.weights = weights self._center = center - def eval(self, targets: np.ndarray) -> np.ndarray: - queue = self.toy_ctx.queue - evt, (potential,) = self.toy_ctx.get_p2p()( - queue, - cl.array.to_device(queue, targets), - cl.array.to_device(queue, self.points), - [cl.array.to_device(queue, self.weights)], + def eval(self, actx: PyOpenCLArrayContext, targets: np.ndarray) -> np.ndarray: + potential, = self.toy_ctx.get_p2p()( + actx, + actx.from_numpy(targets), + actx.from_numpy(self.points), + [actx.from_numpy(self.weights)], **self.toy_ctx.extra_source_and_kernel_kwargs) - return potential.get(queue) + return actx.to_numpy(potential) @property def center(self): @@ -640,6 +638,7 @@ class ExpansionPotentialSource(PotentialSource): .. automethod:: __init__ """ + def __init__(self, toy_ctx, center, rscale, order, coeffs, derived_from, radius=None, expn_style=None, text_kwargs=None): super().__init__(toy_ctx) @@ -664,8 +663,8 @@ class MultipoleExpansion(ExpansionPotentialSource): Inherits from :class:`ExpansionPotentialSource`. """ - def eval(self, targets: np.ndarray) -> np.ndarray: - return _e2p(self, targets, self.toy_ctx.get_m2p(self.order)) + def eval(self, actx: PyOpenCLArrayContext, targets: np.ndarray) -> np.ndarray: + return _e2p(actx, self, targets, self.toy_ctx.get_m2p(self.order)) class LocalExpansion(ExpansionPotentialSource): @@ -673,8 +672,8 @@ class LocalExpansion(ExpansionPotentialSource): Inherits from :class:`ExpansionPotentialSource`. 
""" - def eval(self, targets: np.ndarray) -> np.ndarray: - return _e2p(self, targets, self.toy_ctx.get_l2p(self.order)) + def eval(self, actx: PyOpenCLArrayContext, targets: np.ndarray) -> np.ndarray: + return _e2p(actx, self, targets, self.toy_ctx.get_l2p(self.order)) class PotentialExpressionNode(PotentialSource): @@ -707,10 +706,10 @@ class Sum(PotentialExpressionNode): Inherits from :class:`PotentialExpressionNode`. """ - def eval(self, targets: np.ndarray) -> np.ndarray: + def eval(self, actx: PyOpenCLArrayContext, targets: np.ndarray) -> np.ndarray: result = 0 for psource in self.psources: - result = result + psource.eval(targets) + result = result + psource.eval(actx, targets) return result @@ -720,31 +719,36 @@ class Product(PotentialExpressionNode): Inherits from :class:`PotentialExpressionNode`. """ - def eval(self, targets: np.ndarray) -> np.ndarray: + def eval(self, actx: PyOpenCLArrayContext, targets: np.ndarray) -> np.ndarray: result = 1 for psource in self.psources: - result = result * psource.eval(targets) + result = result * psource.eval(actx, targets) return result # }}} -def multipole_expand(psource: PotentialSource, - center: np.ndarray, order: Optional[int] = None, - rscale: float = 1, **expn_kwargs) -> MultipoleExpansion: +def multipole_expand( + actx: PyOpenCLArrayContext, + psource: PotentialSource, + center: np.ndarray, *, + order: Optional[int] = None, + rscale: float = 1, + **expn_kwargs: Any) -> MultipoleExpansion: if isinstance(psource, PointSources): if order is None: raise ValueError("order may not be None") - return _p2e(psource, center, rscale, order, psource.toy_ctx.get_p2m(order), + return _p2e(actx, + psource, center, rscale, order, psource.toy_ctx.get_p2m(order), MultipoleExpansion, expn_kwargs) elif isinstance(psource, MultipoleExpansion): if order is None: order = psource.order - return _e2e(psource, center, rscale, order, + return _e2e(actx, psource, center, rscale, order, psource.toy_ctx.get_m2m(psource.order, order), MultipoleExpansion, expn_kwargs, {}) @@ -753,14 +757,18 @@ def multipole_expand(psource: PotentialSource, def local_expand( + actx: PyOpenCLArrayContext, psource: PotentialSource, - center: np.ndarray, order: Optional[int] = None, rscale: Number = 1, - **expn_kwargs) -> LocalExpansion: + center: np.ndarray, *, + order: Optional[int] = None, + rscale: float = 1, + **expn_kwargs: Any) -> LocalExpansion: if isinstance(psource, PointSources): if order is None: raise ValueError("order may not be None") - return _p2e(psource, center, rscale, order, psource.toy_ctx.get_p2l(order), + return _p2e(actx, + psource, center, rscale, order, psource.toy_ctx.get_p2l(order), LocalExpansion, expn_kwargs) elif isinstance(psource, MultipoleExpansion): @@ -785,7 +793,7 @@ def local_expand( translation_classes_kwargs["m2l_expn_size"] = \ toy_ctx.get_m2l_expansion_size(psource.order, order) - return _m2l(psource, center, rscale, order, + return _m2l(actx, psource, center, rscale, order, toy_ctx.get_m2l(psource.order, order), LocalExpansion, expn_kwargs, translation_classes_kwargs) @@ -794,7 +802,7 @@ def local_expand( if order is None: order = psource.order - return _e2e(psource, center, rscale, order, + return _e2e(actx, psource, center, rscale, order, psource.toy_ctx.get_l2l(psource.order, order), LocalExpansion, expn_kwargs, {}) @@ -802,9 +810,12 @@ def local_expand( raise TypeError(f"do not know how to expand '{type(psource).__name__}'") -def logplot(fp: FieldPlotter, psource: PotentialSource, **kwargs) -> None: +def logplot( + actx: 
PyOpenCLArrayContext, + fp: "FieldPlotter", + psource: PotentialSource, **kwargs) -> None: fp.show_scalar_in_matplotlib( - np.log10(np.abs(psource.eval(fp.points) + 1e-15)), **kwargs) + np.log10(np.abs(psource.eval(actx, fp.points) + 1e-15)), **kwargs) def combine_inner_outer( @@ -852,7 +863,7 @@ def combine_halfspace_and_outer( psource_outer, radius, center) -def l_inf(psource: PotentialSource, radius: float, +def l_inf(actx: PyOpenCLArrayContext, psource: PotentialSource, radius: float, center: Optional[np.ndarray] = None, npoints: int = 100, debug: bool = False) -> np.number: if center is None: @@ -862,7 +873,7 @@ def l_inf(psource: PotentialSource, radius: float, from sumpy.visualization import FieldPlotter fp = FieldPlotter(center, extent=2*radius, npoints=npoints) - z = restr.eval(fp.points) + z = restr.eval(actx, fp.points) if debug: fp.show_scalar_in_matplotlib( @@ -877,8 +888,8 @@ def l_inf(psource: PotentialSource, radius: float, # {{{ schematic visualization def draw_box(el, eh, **kwargs): - import matplotlib.pyplot as pt import matplotlib.patches as mpatches + import matplotlib.pyplot as pt from matplotlib.path import Path pathdata = [ diff --git a/test/test_distributed.py b/test/test_distributed.py index dd21b224..e436fca3 100644 --- a/test/test_distributed.py +++ b/test/test_distributed.py @@ -21,10 +21,21 @@ """ import os +import pytest from functools import partial -import pyopencl as cl + import numpy as np -import pytest + +from arraycontext import pytest_generate_tests_for_array_contexts +from sumpy.array_context import ( # noqa: F401 + PytestPyOpenCLArrayContextFactory, _acf) + +import logging +logger = logging.getLogger(__name__) + +pytest_generate_tests = pytest_generate_tests_for_array_contexts([ + PytestPyOpenCLArrayContextFactory, + ]) # Note: Do not import mpi4py.MPI object at the module level, because OpenMPI does not # support recursive invocations. 
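In the test updates that follow, boxtree's builders and drivers are likewise handed the array context rather than a context/queue pair, and potentials come back as actx arrays. Schematically (a sketch only, assuming boxtree's builders accept these array-context signatures as the hunks below call them; actx, sources, targets, weights and a SumpyExpansionWrangler named wrangler are assumed to be set up as in those hunks):

    from boxtree import TreeBuilder
    from boxtree.traversal import FMMTraversalBuilder
    from boxtree.fmm import drive_fmm

    tree, _ = TreeBuilder(actx)(actx, sources, targets=targets,
                                max_particles_in_box=30, debug=True)
    trav, _ = FMMTraversalBuilder(actx)(actx, tree, debug=True)
    pot, = drive_fmm(actx, wrangler, (weights,))   # drive_fmm now takes actx first
    pot_host = actx.to_numpy(pot)                  # transfer back for comparison/plotting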
@@ -51,14 +62,13 @@ def _test_against_single_rank( set_cache_dir(mpi_rank) # Configure array context - cl_context = cl.create_some_context() - queue = cl.CommandQueue(cl_context) + actx = _acf() def fmm_level_to_order(base_kernel, kernel_arg_set, tree, level): return max(level, 3) from boxtree.traversal import FMMTraversalBuilder - traversal_builder = FMMTraversalBuilder(cl_context, well_sep_is_n_away=2) + traversal_builder = FMMTraversalBuilder(actx, well_sep_is_n_away=2) from sumpy.kernel import LaplaceKernel from sumpy.expansion import DefaultExpansionFactory @@ -72,34 +82,30 @@ def fmm_level_to_order(base_kernel, kernel_arg_set, tree, level): from sumpy.fmm import SumpyTreeIndependentDataForWrangler tree_indep = SumpyTreeIndependentDataForWrangler( - cl_context, multipole_expansion_factory, local_expansion_factory, [kernel]) + actx, multipole_expansion_factory, local_expansion_factory, [kernel]) global_tree_dev = None - sources_weights = cl.array.empty(queue, 0, dtype=dtype) + sources_weights = actx.empty(0, dtype=dtype) if mpi_rank == 0: # Generate random particles and source weights from boxtree.tools import make_normal_particle_array as p_normal - sources = p_normal(queue, nsources, dims, dtype, seed=15) - targets = p_normal(queue, ntargets, dims, dtype, seed=18) + sources = p_normal(actx, nsources, dims, dtype, seed=15) + targets = p_normal(actx, ntargets, dims, dtype, seed=18) # FIXME: Use arraycontext instead of raw PyOpenCL arrays - from pyopencl.clrandom import PhiloxGenerator - rng = PhiloxGenerator(cl_context, seed=20) - sources_weights = rng.uniform(queue, nsources, dtype=np.float64) - - rng = PhiloxGenerator(cl_context, seed=22) - target_radii = rng.uniform( - queue, ntargets, a=0, b=0.05, dtype=np.float64) + rng = np.random.default_rng(20) + sources_weights = actx.from_numpy(rng.random(nsources, dtype=np.float64)) + target_radii = actx.from_numpy(0.05 * rng.random(ntargets, dtype=np.float64)) # Build the tree and interaction lists from boxtree import TreeBuilder - tb = TreeBuilder(cl_context) + tb = TreeBuilder(actx) global_tree_dev, _ = tb( - queue, sources, targets=targets, target_radii=target_radii, + actx, sources, targets=targets, target_radii=target_radii, stick_out_factor=0.25, max_particles_in_box=30, debug=True) - global_trav_dev, _ = traversal_builder(queue, global_tree_dev, debug=True) + global_trav_dev, _ = traversal_builder(actx, global_tree_dev, debug=True) from sumpy.fmm import SumpyExpansionWrangler wrangler = SumpyExpansionWrangler(tree_indep, global_trav_dev, dtype, @@ -107,32 +113,29 @@ def fmm_level_to_order(base_kernel, kernel_arg_set, tree, level): # Compute FMM with one MPI rank from boxtree.fmm import drive_fmm - shmem_potential = drive_fmm(wrangler, [sources_weights]) + shmem_potential = drive_fmm(actx, wrangler, [sources_weights]) # Compute FMM using the distributed implementation def wrangler_factory(local_traversal, global_traversal): from sumpy.distributed import DistributedSumpyExpansionWrangler return DistributedSumpyExpansionWrangler( - cl_context, comm, tree_indep, local_traversal, global_traversal, dtype, + actx, comm, tree_indep, local_traversal, global_traversal, dtype, fmm_level_to_order, communicate_mpoles_via_allreduce=communicate_mpoles_via_allreduce) from boxtree.distributed import DistributedFMMRunner distribued_fmm_info = DistributedFMMRunner( - queue, global_tree_dev, traversal_builder, wrangler_factory, comm=comm) + actx, global_tree_dev, traversal_builder, wrangler_factory, comm=comm) - timing_data = {} - distributed_potential 
= distribued_fmm_info.drive_dfmm( - [sources_weights], timing_data=timing_data) - assert timing_data + distributed_potential = distribued_fmm_info.drive_dfmm(actx, [sources_weights]) if mpi_rank == 0: assert shmem_potential.shape == (1,) assert distributed_potential.shape == (1,) - shmem_potential = shmem_potential[0].get() - distributed_potential = distributed_potential[0].get() + shmem_potential = actx.to_numpy(shmem_potential[0]) + distributed_potential = actx.to_numpy(distributed_potential[0]) error = (np.linalg.norm(distributed_potential - shmem_potential, ord=np.inf) / np.linalg.norm(shmem_potential, ord=np.inf)) diff --git a/test/test_fmm.py b/test/test_fmm.py index cf0c6a23..f9b7715e 100644 --- a/test/test_fmm.py +++ b/test/test_fmm.py @@ -132,12 +132,12 @@ def _test_sumpy_fmm(actx_factory, knl, local_expn_class, mpole_expn_class, dtype = np.float64 from boxtree.tools import make_normal_particle_array as p_normal - sources = p_normal(actx.queue, nsources, knl.dim, dtype, seed=15) + sources = p_normal(actx, nsources, knl.dim, dtype, seed=15) if 1: offset = np.zeros(knl.dim) offset[0] = 0.1 - targets = offset + p_normal(actx.queue, ntargets, knl.dim, dtype, seed=18) + targets = offset + p_normal(actx, ntargets, knl.dim, dtype, seed=18) del offset else: @@ -148,20 +148,19 @@ def _test_sumpy_fmm(actx_factory, knl, local_expn_class, mpole_expn_class, targets = make_obj_array([fp.points[i] for i in range(knl.dim)]) from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - - tree, _ = tb(actx.queue, sources, targets=targets, + tb = TreeBuilder(actx) + tree, _ = tb(actx, sources, targets=targets, max_particles_in_box=30, debug=True) from boxtree.traversal import FMMTraversalBuilder - tbuild = FMMTraversalBuilder(actx.context) - trav, _ = tbuild(actx.queue, tree, debug=True) + tbuild = FMMTraversalBuilder(actx) + trav, _ = tbuild(actx, tree, debug=True) # {{{ plot tree if 0: - host_tree = tree.get(actx.queue) - host_trav = trav.get(actx.queue) + host_tree = actx.to_numpy(tree) + host_trav = actx.to_numpy(trav) if 0: logger.info("src_box: %s", host_tree.find_box_nr_for_source(403)) @@ -223,7 +222,7 @@ def _test_sumpy_fmm(actx_factory, knl, local_expn_class, mpole_expn_class, knl, local_expn_class)() tree_indep = SumpyTreeIndependentDataForWrangler( - actx.context, + actx, partial(mpole_expn_class, knl), partial(local_expn_class, knl, m2l_translation=m2l_translation), target_kernels) @@ -242,12 +241,11 @@ def fmm_level_to_order(kernel, kernel_args, tree, lev): from boxtree.fmm import drive_fmm - pot, = drive_fmm(wrangler, (weights,)) + pot, = drive_fmm(actx, wrangler, (weights,)) from sumpy import P2P - p2p = P2P(actx.context, target_kernels, exclude_self=False) - evt, (ref_pot,) = p2p(actx.queue, targets, sources, (weights,), - **extra_kwargs) + p2p = P2P(target_kernels, exclude_self=False) + ref_pot, = p2p(actx, targets, sources, (weights,), **extra_kwargs) pot = actx.to_numpy(pot) ref_pot = actx.to_numpy(ref_pot) @@ -281,21 +279,19 @@ def test_coeff_magnitude_rscale(actx_factory, knl): from boxtree.tools import make_normal_particle_array as p_normal - sources = p_normal(actx.queue, nsources, knl.dim, dtype, seed=15) + sources = p_normal(actx, nsources, knl.dim, dtype, seed=15) offset = np.zeros(knl.dim) offset[0] = 0.1 - targets = offset + p_normal(actx.queue, ntargets, knl.dim, dtype, seed=18) + targets = offset + p_normal(actx, ntargets, knl.dim, dtype, seed=18) from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - - tree, _ = tb(actx.queue, sources, 
targets=targets, - max_particles_in_box=30, debug=True) + tb = TreeBuilder(actx) + tree, _ = tb(actx, sources, targets=targets, max_particles_in_box=30, debug=True) from boxtree.traversal import FMMTraversalBuilder - tbuild = FMMTraversalBuilder(actx.context) - trav, _ = tbuild(actx.queue, tree, debug=True) + tbuild = FMMTraversalBuilder(actx) + trav, _ = tbuild(actx, tree, debug=True) rng = np.random.default_rng(31) weights = actx.from_numpy(rng.random(nsources, dtype=np.float64)) @@ -314,7 +310,7 @@ def test_coeff_magnitude_rscale(actx_factory, knl): target_kernels = [knl] tree_indep = SumpyTreeIndependentDataForWrangler( - actx.context, + actx, partial(mpole_expn_class, knl), partial(local_expn_class, knl), target_kernels) @@ -327,9 +323,10 @@ def fmm_level_to_order(kernel, kernel_args, tree, lev): kernel_extra_kwargs=extra_kwargs) weights = wrangler.reorder_sources(weights) - (weights,) = wrangler.distribute_source_weights((weights,), None) + (weights,) = wrangler.distribute_source_weights(actx, (weights,), None) - local_result, _ = wrangler.form_locals( + local_result = wrangler.form_locals( + actx, trav.level_start_target_or_target_parent_box_nrs, trav.target_or_target_parent_boxes, trav.from_sep_bigger_starts, @@ -369,22 +366,20 @@ def test_unified_single_and_double(actx_factory, visualize=False): from boxtree.tools import make_normal_particle_array as p_normal - sources = p_normal(actx.queue, nsources, knl.dim, dtype, seed=15) + sources = p_normal(actx, nsources, knl.dim, dtype, seed=15) offset = np.zeros(knl.dim) offset[0] = 0.1 - targets = offset + p_normal(actx.queue, ntargets, knl.dim, dtype, seed=18) + targets = offset + p_normal(actx, ntargets, knl.dim, dtype, seed=18) del offset from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - - tree, _ = tb(actx.queue, sources, targets=targets, - max_particles_in_box=30, debug=True) + tb = TreeBuilder(actx) + tree, _ = tb(actx, sources, targets=targets, max_particles_in_box=30, debug=True) from boxtree.traversal import FMMTraversalBuilder - tbuild = FMMTraversalBuilder(actx.context) - trav, _ = tbuild(actx.queue, tree, debug=True) + tbuild = FMMTraversalBuilder(actx) + trav, _ = tbuild(actx, tree, debug=True) rng = np.random.default_rng(44) weights = ( @@ -412,9 +407,9 @@ def test_unified_single_and_double(actx_factory, visualize=False): for source_kernels, strength_usage in zip(source_kernel_vecs, strength_usages): source_extra_kwargs = {} if deriv_knl in source_kernels: - source_extra_kwargs["dir_vec"] = dir_vec + source_extra_kwargs["dir_vec"] = actx.from_numpy(dir_vec) tree_indep = SumpyTreeIndependentDataForWrangler( - actx.context, + actx, partial(mpole_expn_class, knl), partial(local_expn_class, knl), target_kernels=target_kernels, source_kernels=source_kernels, @@ -425,7 +420,7 @@ def test_unified_single_and_double(actx_factory, visualize=False): from boxtree.fmm import drive_fmm - pot = drive_fmm(wrangler, weights) + pot = drive_fmm(actx, wrangler, weights) results.append(np.array([actx.to_numpy(pot[0]), actx.to_numpy(pot[1])])) ref_pot = results[0] + results[1] @@ -462,16 +457,15 @@ def test_sumpy_fmm_timing_data_collection(ctx_factory, use_fft, visualize=False) mpole_expn_class = VolumeTaylorMultipoleExpansion order = 1 - sources = p_normal(actx.queue, nsources, knl.dim, dtype, seed=15) + sources = p_normal(actx, nsources, knl.dim, dtype, seed=15) from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - - tree, _ = tb(actx.queue, sources, max_particles_in_box=30, debug=True) + tb = TreeBuilder(actx) + 
tree, _ = tb(actx, sources, max_particles_in_box=30, debug=True) from boxtree.traversal import FMMTraversalBuilder - tbuild = FMMTraversalBuilder(actx.context) - trav, _ = tbuild(actx.queue, tree, debug=True) + tbuild = FMMTraversalBuilder(actx) + trav, _ = tbuild(actx, tree, debug=True) rng = np.random.default_rng(44) weights = actx.from_numpy(rng.random(nsources, dtype=np.float64)) @@ -489,7 +483,7 @@ def test_sumpy_fmm_timing_data_collection(ctx_factory, use_fft, visualize=False) knl, local_expn_class)() tree_indep = SumpyTreeIndependentDataForWrangler( - actx.context, + actx, partial(mpole_expn_class, knl), partial(local_expn_class, knl, m2l_translation=m2l_translation), target_kernels) @@ -498,11 +492,7 @@ def test_sumpy_fmm_timing_data_collection(ctx_factory, use_fft, visualize=False) fmm_level_to_order=lambda kernel, kernel_args, tree, lev: order) from boxtree.fmm import drive_fmm - timing_data = {} - pot, = drive_fmm(wrangler, (weights,), timing_data=timing_data) - logger.info("timing_data:\n%s", timing_data) - - assert timing_data + pot, = drive_fmm(actx, wrangler, (weights,)) def test_sumpy_fmm_exclude_self(actx_factory, visualize=False): @@ -521,16 +511,16 @@ def test_sumpy_fmm_exclude_self(actx_factory, visualize=False): mpole_expn_class = VolumeTaylorMultipoleExpansion order = 10 - sources = p_normal(actx.queue, nsources, knl.dim, dtype, seed=15) + sources = p_normal(actx, nsources, knl.dim, dtype, seed=15) from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) + tb = TreeBuilder(actx) - tree, _ = tb(actx.queue, sources, max_particles_in_box=30, debug=True) + tree, _ = tb(actx, sources, max_particles_in_box=30, debug=True) from boxtree.traversal import FMMTraversalBuilder - tbuild = FMMTraversalBuilder(actx.context) - trav, _ = tbuild(actx.queue, tree, debug=True) + tbuild = FMMTraversalBuilder(actx) + trav, _ = tbuild(actx, tree, debug=True) rng = np.random.default_rng(44) weights = actx.from_numpy(rng.random(nsources, dtype=np.float64)) @@ -541,7 +531,7 @@ def test_sumpy_fmm_exclude_self(actx_factory, visualize=False): target_kernels = [knl] tree_indep = SumpyTreeIndependentDataForWrangler( - actx.context, + actx, partial(mpole_expn_class, knl), partial(local_expn_class, knl), target_kernels, @@ -553,12 +543,11 @@ def test_sumpy_fmm_exclude_self(actx_factory, visualize=False): from boxtree.fmm import drive_fmm - pot, = drive_fmm(wrangler, (weights,)) + pot, = drive_fmm(actx, wrangler, (weights,)) from sumpy import P2P - p2p = P2P(actx.context, target_kernels, exclude_self=True) - evt, (ref_pot,) = p2p(actx.queue, sources, sources, (weights,), - **self_extra_kwargs) + p2p = P2P(target_kernels, exclude_self=True) + ref_pot, = p2p(actx, sources, sources, (weights,), **self_extra_kwargs) pot = actx.to_numpy(pot) ref_pot = actx.to_numpy(ref_pot) @@ -589,17 +578,15 @@ def test_sumpy_axis_source_derivative(actx_factory, visualize=False): mpole_expn_class = VolumeTaylorMultipoleExpansion order = 10 - sources = p_normal(actx.queue, nsources, knl.dim, dtype, seed=15) + sources = p_normal(actx, nsources, knl.dim, dtype, seed=15) from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) - - tree, _ = tb(actx.queue, sources, - max_particles_in_box=30, debug=True) + tb = TreeBuilder(actx) + tree, _ = tb(actx, sources, max_particles_in_box=30, debug=True) from boxtree.traversal import FMMTraversalBuilder - tbuild = FMMTraversalBuilder(actx.context) - trav, _ = tbuild(actx.queue, tree, debug=True) + tbuild = FMMTraversalBuilder(actx) + trav, _ = tbuild(actx, tree, 
debug=True) rng = np.random.default_rng(12) weights = actx.from_numpy(rng.random(nsources, dtype=np.float64)) @@ -614,7 +601,7 @@ def test_sumpy_axis_source_derivative(actx_factory, visualize=False): (AxisTargetDerivative(0, knl), knl), (knl, AxisSourceDerivative(0, knl))]: tree_indep = SumpyTreeIndependentDataForWrangler( - actx.context, + actx, partial(mpole_expn_class, knl), partial(local_expn_class, knl), target_kernels=[tgt_knl], @@ -627,7 +614,7 @@ def test_sumpy_axis_source_derivative(actx_factory, visualize=False): from boxtree.fmm import drive_fmm - pot, = drive_fmm(wrangler, (weights,)) + pot, = drive_fmm(actx, wrangler, (weights,)) pots.append(actx.to_numpy(pot)) rel_err = la.norm(pots[0] + pots[1]) / la.norm(pots[0]) @@ -657,17 +644,17 @@ def test_sumpy_target_point_multiplier(actx_factory, deriv_axes, visualize=False mpole_expn_class = VolumeTaylorMultipoleExpansion order = 5 - sources = p_normal(actx.queue, nsources, knl.dim, dtype, seed=15) + sources = p_normal(actx, nsources, knl.dim, dtype, seed=15) from boxtree import TreeBuilder - tb = TreeBuilder(actx.context) + tb = TreeBuilder(actx) - tree, _ = tb(actx.queue, sources, + tree, _ = tb(actx, sources, max_particles_in_box=30, debug=True) from boxtree.traversal import FMMTraversalBuilder - tbuild = FMMTraversalBuilder(actx.context) - trav, _ = tbuild(actx.queue, tree, debug=True) + tbuild = FMMTraversalBuilder(actx) + trav, _ = tbuild(actx, tree, debug=True) rng = np.random.default_rng(12) weights = actx.from_numpy(rng.random(nsources, dtype=np.float64)) @@ -683,7 +670,7 @@ def test_sumpy_target_point_multiplier(actx_factory, deriv_axes, visualize=False tgt_knls[1] = AxisTargetDerivative(axis, tgt_knls[1]) tree_indep = SumpyTreeIndependentDataForWrangler( - actx.context, + actx, partial(mpole_expn_class, knl), partial(local_expn_class, knl), target_kernels=tgt_knls, @@ -696,7 +683,7 @@ def test_sumpy_target_point_multiplier(actx_factory, deriv_axes, visualize=False from boxtree.fmm import drive_fmm - pot0, pot1, pot2 = drive_fmm(wrangler, (weights,)) + pot0, pot1, pot2 = drive_fmm(actx, wrangler, (weights,)) pot0, pot1, pot2 = actx.to_numpy(pot0), actx.to_numpy(pot1), actx.to_numpy(pot2) if deriv_axes == (0,): ref_pot = pot1 * actx.to_numpy(sources[0]) + pot2 diff --git a/test/test_kernels.py b/test/test_kernels.py index 8f0a98ef..3d2babae 100644 --- a/test/test_kernels.py +++ b/test/test_kernels.py @@ -20,8 +20,8 @@ THE SOFTWARE. 
""" -import pytest import sys +import pytest from functools import partial import numpy as np @@ -74,9 +74,7 @@ def test_p2p(actx_factory, exclude_self): from sumpy.p2p import P2P lknl = LaplaceKernel(dimensions) - knl = P2P(actx.context, - [lknl, AxisTargetDerivative(0, lknl)], - exclude_self=exclude_self) + knl = P2P([lknl, AxisTargetDerivative(0, lknl)], exclude_self=exclude_self) rng = np.random.default_rng(42) targets = rng.random(size=(dimensions, n)) @@ -89,8 +87,8 @@ def test_p2p(actx_factory, exclude_self): extra_kwargs["target_to_source"] = ( actx.from_numpy(np.arange(n, dtype=np.int32))) - evt, (potential, x_derivative) = knl( - actx.queue, + potential, _ = knl( + actx, actx.from_numpy(targets), actx.from_numpy(sources), [actx.from_numpy(strengths)], @@ -184,11 +182,11 @@ def test_p2e_multiple(actx_factory, base_knl, expn_class): rscale = 0.5 # pick something non-1 # apply p2e at the same time - p2e = P2EFromSingleBox(actx.context, expn, + p2e = P2EFromSingleBox(expn, kernels=source_kernels, strength_usage=[0, 1]) - evt, (mpoles,) = p2e(actx.queue, + mpoles = p2e(actx, source_boxes=source_boxes, box_source_starts=box_source_starts, box_source_counts_nonchild=box_source_counts_nonchild, @@ -200,7 +198,6 @@ def test_p2e_multiple(actx_factory, base_knl, expn_class): rscale=rscale, dir_vec=dir_vec, **extra_kwargs) - actual_result = actx.to_numpy(mpoles) # apply p2e separately @@ -210,10 +207,10 @@ def test_p2e_multiple(actx_factory, base_knl, expn_class): if isinstance(source_kernel, DirectionalSourceDerivative): extra_source_kwargs["dir_vec"] = dir_vec - p2e = P2EFromSingleBox(actx.context, expn, + p2e = P2EFromSingleBox(expn, kernels=[source_kernel], strength_usage=[i]) - evt, (mpoles,) = p2e(actx.queue, + mpoles = p2e(actx, source_boxes=source_boxes, box_source_starts=box_source_starts, box_source_counts_nonchild=box_source_counts_nonchild, @@ -295,9 +292,9 @@ def test_p2e2p(actx_factory, base_knl, expn_class, order, with_source_derivative expn = expn_class(knl, order=order) from sumpy import P2EFromSingleBox, E2PFromSingleBox, P2P - p2e = P2EFromSingleBox(actx.context, expn, kernels=[knl]) - e2p = E2PFromSingleBox(actx.context, expn, kernels=target_kernels) - p2p = P2P(actx.context, target_kernels, exclude_self=False) + p2e = P2EFromSingleBox(expn, kernels=[knl]) + e2p = E2PFromSingleBox(expn, kernels=target_kernels) + p2p = P2P(target_kernels, exclude_self=False) from pytools.convergence import EOCRecorder eoc_rec_pot = EOCRecorder() @@ -349,7 +346,7 @@ def test_p2e2p(actx_factory, base_knl, expn_class, order, with_source_derivative # {{{ apply p2e - evt, (mpoles,) = p2e(actx.queue, + mpoles = p2e(actx, source_boxes=source_boxes, box_source_starts=box_source_starts, box_source_counts_nonchild=box_source_counts_nonchild, @@ -371,8 +368,8 @@ def test_p2e2p(actx_factory, base_knl, expn_class, order, with_source_derivative box_target_counts_nonchild = ( actx.from_numpy(np.array([ntargets], dtype=np.int32))) - evt, (pot, grad_x, ) = e2p( - actx.queue, + pot, grad_x = e2p( + actx, src_expansions=mpoles, src_base_ibox=0, target_boxes=source_boxes, @@ -389,8 +386,8 @@ def test_p2e2p(actx_factory, base_knl, expn_class, order, with_source_derivative # {{{ compute (direct) reference solution - evt, (pot_direct, grad_x_direct, ) = p2p( - actx.queue, + pot_direct, grad_x_direct = p2p( + actx, targets, sources, (strengths,), **extra_source_kwargs) pot_direct = actx.to_numpy(pot_direct) @@ -566,7 +563,6 @@ def test_translations(actx_factory, knl, local_expn_class, mpole_expn_class, 
m2l_translation = m2l_factory.get_m2l_translation_class(knl, local_expn_class)() toy_ctx = t.ToyContext( - actx.context, kernel=knl, local_expn_class=partial(local_expn_class, m2l_translation=m2l_translation), @@ -575,7 +571,7 @@ def test_translations(actx_factory, knl, local_expn_class, mpole_expn_class, ) p = t.PointSources(toy_ctx, sources, weights=strengths) - p2p = p.eval(targets) + p2p = p.eval(actx, targets) m1_rscale = 0.5 m2_rscale = 0.25 @@ -583,27 +579,32 @@ def test_translations(actx_factory, knl, local_expn_class, mpole_expn_class, l2_rscale = 0.25 for order in orders: - print(centers[:, 0].shape) - p2m = t.multipole_expand(p, centers[:, 0], order, m1_rscale) - p2m2p = p2m.eval(targets) + logger.info("Centers: %s", centers[:, 0].shape) + + p2m = t.multipole_expand(actx, p, centers[:, 0], + order=order, rscale=m1_rscale) + p2m2p = p2m.eval(actx, targets) err = la.norm((p2m2p - p2p) / res**2) err = err / (la.norm(p2p) / res**2) pconv_verifier_p2m2p.add_data_point(order, err) - p2m2m = t.multipole_expand(p2m, centers[:, 1], order, m2_rscale) - p2m2m2p = p2m2m.eval(targets) + p2m2m = t.multipole_expand(actx, p2m, centers[:, 1], + order=order, rscale=m2_rscale) + p2m2m2p = p2m2m.eval(actx, targets) err = la.norm((p2m2m2p - p2p)/res**2) err = err / (la.norm(p2p) / res**2) pconv_verifier_p2m2m2p.add_data_point(order, err) - p2m2m2l = t.local_expand(p2m2m, centers[:, 2], order, l1_rscale) - p2m2m2l2p = p2m2m2l.eval(targets) + p2m2m2l = t.local_expand(actx, p2m2m, centers[:, 2], + order=order, rscale=l1_rscale) + p2m2m2l2p = p2m2m2l.eval(actx, targets) err = la.norm((p2m2m2l2p - p2p)/res**2) err = err / (la.norm(p2p) / res**2) pconv_verifier_p2m2m2l2p.add_data_point(order, err) - p2m2m2l2l = t.local_expand(p2m2m2l, centers[:, 3], order, l2_rscale) - p2m2m2l2l2p = p2m2m2l2l.eval(targets) + p2m2m2l2l = t.local_expand(actx, p2m2m2l, centers[:, 3], + order=order, rscale=l2_rscale) + p2m2m2l2l2p = p2m2m2l2l.eval(actx, targets) err = la.norm((p2m2m2l2l2p - p2p)/res**2) err = err / (la.norm(p2p) / res**2) pconv_verifier_full.add_data_point(order, err) @@ -796,7 +797,6 @@ def test_m2m_compressed_error_helmholtz(actx_factory, dim, order): for i, (mpole_expn_class, local_expn_class) in \ enumerate(zip(mpole_expn_classes, local_expn_classes)): tctx = toys.ToyContext( - actx.context, knl, extra_kernel_kwargs=extra_kernel_kwargs, local_expn_class=local_expn_class, @@ -808,15 +808,15 @@ def test_m2m_compressed_error_helmholtz(actx_factory, dim, order): np.ones(sources.shape[-1]) ) - mexp = toys.multipole_expand(pt_src, + mexp = toys.multipole_expand(actx, pt_src, center=mpole_center.reshape(dim), order=order, rscale=h) - mexp2 = toys.multipole_expand(mexp, + mexp2 = toys.multipole_expand(actx, mexp, center=second_center.reshape(dim), order=order, rscale=h) - m2m_vals[i] = mexp2.eval(targets) + m2m_vals[i] = mexp2.eval(actx, targets) err = np.linalg.norm(m2m_vals[1] - m2m_vals[0]) \ / np.linalg.norm(m2m_vals[1]) diff --git a/test/test_matrixgen.py b/test/test_matrixgen.py index 4baf3031..08d6e12d 100644 --- a/test/test_matrixgen.py +++ b/test/test_matrixgen.py @@ -70,9 +70,9 @@ def _build_subset_indices(actx, ntargets, nsources, factor): rng = np.random.default_rng() if abs(factor - 1.0) > 1.0e-14: tgtindices = rng.choice(tgtindices, - size=int(factor * ntargets), replace=False) + size=int(factor * ntargets), replace=False) srcindices = rng.choice(srcindices, - size=int(factor * nsources), replace=False) + size=int(factor * nsources), replace=False) else: rng.shuffle(tgtindices) 
rng.shuffle(srcindices) @@ -111,60 +111,62 @@ def test_qbx_direct(actx_factory, factor, lpot_id, visualize=False): expn = LineTaylorLocalExpansion(knl, order) from sumpy.qbx import LayerPotential - lpot = LayerPotential(actx.context, expansion=expn, source_kernels=(knl,), - target_kernels=(base_knl,)) + lpot = LayerPotential( + expansion=expn, + source_kernels=(knl,), + target_kernels=(base_knl,)) from sumpy.qbx import LayerPotentialMatrixGenerator - mat_gen = LayerPotentialMatrixGenerator(actx.context, - expansion=expn, - source_kernels=(knl,), - target_kernels=(base_knl,)) + mat_gen = LayerPotentialMatrixGenerator( + expansion=expn, + source_kernels=(knl,), + target_kernels=(base_knl,)) from sumpy.qbx import LayerPotentialMatrixSubsetGenerator - blk_gen = LayerPotentialMatrixSubsetGenerator(actx.context, - expansion=expn, - source_kernels=(knl,), - target_kernels=(base_knl,)) + blk_gen = LayerPotentialMatrixSubsetGenerator( + expansion=expn, + source_kernels=(knl,), + target_kernels=(base_knl,)) for n in [200, 300, 400]: - targets, sources, centers, expansion_radii, sigma = \ - _build_geometry(actx, n, n, mode_nr, target_radius=1.2) + targets, sources, centers, expansion_radii, sigma = ( + _build_geometry(actx, n, n, mode_nr, target_radius=1.2)) h = 2 * np.pi / n strengths = (sigma * h,) - tgtindices, srcindices = _build_subset_indices(actx, - ntargets=n, nsources=n, factor=factor) + tgtindices, srcindices = ( + _build_subset_indices(actx, ntargets=n, nsources=n, factor=factor)) extra_kwargs = {} if lpot_id == 2: from pytools.obj_array import make_obj_array extra_kwargs["dsource_vec"] = ( - actx.from_numpy(make_obj_array(np.ones((ndim, n)))) - ) + actx.from_numpy(make_obj_array(np.ones((ndim, n)))) + ) - _, (result_lpot,) = lpot(actx.queue, - targets=targets, - sources=sources, - centers=centers, - expansion_radii=expansion_radii, - strengths=strengths, **extra_kwargs) + result_lpot, = lpot(actx, + targets=targets, + sources=sources, + centers=centers, + expansion_radii=expansion_radii, + strengths=strengths, **extra_kwargs) result_lpot = actx.to_numpy(result_lpot) - _, (mat,) = mat_gen(actx.queue, - targets=targets, - sources=sources, - centers=centers, - expansion_radii=expansion_radii, **extra_kwargs) + mat, = mat_gen(actx, + targets=targets, + sources=sources, + centers=centers, + expansion_radii=expansion_radii, **extra_kwargs) mat = actx.to_numpy(mat) result_mat = mat @ actx.to_numpy(strengths[0]) - _, (blk,) = blk_gen(actx.queue, - targets=targets, - sources=sources, - centers=centers, - expansion_radii=expansion_radii, - tgtindices=tgtindices, - srcindices=srcindices, **extra_kwargs) + blk, = blk_gen(actx, + targets=targets, + sources=sources, + centers=centers, + expansion_radii=expansion_radii, + tgtindices=tgtindices, + srcindices=srcindices, **extra_kwargs) blk = actx.to_numpy(blk) tgtindices = actx.to_numpy(tgtindices) @@ -201,14 +203,15 @@ def test_p2p_direct(actx_factory, exclude_self, factor, lpot_id, visualize=False raise ValueError(f"unknown lpot_id: '{lpot_id}'") from sumpy.p2p import P2P - lpot = P2P(actx.context, [lknl], exclude_self=exclude_self) + lpot = P2P(target_kernels=[lknl], exclude_self=exclude_self) from sumpy.p2p import P2PMatrixGenerator - mat_gen = P2PMatrixGenerator(actx.context, [lknl], exclude_self=exclude_self) + mat_gen = P2PMatrixGenerator( + target_kernels=[lknl], exclude_self=exclude_self) from sumpy.p2p import P2PMatrixSubsetGenerator blk_gen = P2PMatrixSubsetGenerator( - actx.context, [lknl], exclude_self=exclude_self) + target_kernels=[lknl], 
exclude_self=exclude_self) for n in [200, 300, 400]: targets, sources, _, _, sigma = ( @@ -216,8 +219,8 @@ def test_p2p_direct(actx_factory, exclude_self, factor, lpot_id, visualize=False h = 2 * np.pi / n strengths = (sigma * h,) - tgtindices, srcindices = _build_subset_indices(actx, - ntargets=n, nsources=n, factor=factor) + tgtindices, srcindices = ( + _build_subset_indices(actx, ntargets=n, nsources=n, factor=factor)) extra_kwargs = {} if exclude_self: @@ -227,25 +230,25 @@ def test_p2p_direct(actx_factory, exclude_self, factor, lpot_id, visualize=False if lpot_id == 2: from pytools.obj_array import make_obj_array extra_kwargs["dsource_vec"] = ( - actx.from_numpy(make_obj_array(np.ones((ndim, n))))) + actx.from_numpy(make_obj_array(np.ones((ndim, n))))) - _, (result_lpot,) = lpot(actx.queue, + result_lpot, = lpot(actx, targets=targets, sources=sources, strength=strengths, **extra_kwargs) result_lpot = actx.to_numpy(result_lpot) - _, (mat,) = mat_gen(actx.queue, - targets=targets, - sources=sources, **extra_kwargs) + mat, = mat_gen(actx, + targets=targets, + sources=sources, **extra_kwargs) mat = actx.to_numpy(mat) result_mat = mat @ actx.to_numpy(strengths[0]) - _, (blk,) = blk_gen(actx.queue, - targets=targets, - sources=sources, - tgtindices=tgtindices, - srcindices=srcindices, **extra_kwargs) + blk, = blk_gen(actx, + targets=targets, + sources=sources, + tgtindices=tgtindices, + srcindices=srcindices, **extra_kwargs) blk = actx.to_numpy(blk) tgtindices = actx.to_numpy(tgtindices) diff --git a/test/test_misc.py b/test/test_misc.py index 98f07da1..3b5fecfc 100644 --- a/test/test_misc.py +++ b/test/test_misc.py @@ -115,7 +115,7 @@ def test_pde_check_kernels(actx_factory, knl_info, order=5): actx = actx_factory() dim = knl_info.kernel.dim - tctx = t.ToyContext(actx.context, knl_info.kernel, + tctx = t.ToyContext(knl_info.kernel, extra_source_kwargs=knl_info.extra_kwargs) rng = np.random.default_rng(42) @@ -130,7 +130,7 @@ def test_pde_check_kernels(actx_factory, knl_info, order=5): for h in [0.1, 0.05, 0.025]: cp = CalculusPatch(np.array([1, 0, 0])[:dim], h=h, order=order) - pot = pt_src.eval(cp.points) + pot = pt_src.eval(actx, cp.points) pde = knl_info.pde_func(cp, pot) @@ -312,7 +312,7 @@ def test_toy_p2e2e2p(actx_factory, case): from sumpy.expansion import VolumeTaylorExpansionFactory actx = actx_factory() - ctx = t.ToyContext(actx.context, + ctx = t.ToyContext( LaplaceKernel(dim), expansion_factory=VolumeTaylorExpansionFactory(), m2l_use_fft=case.m2l_use_fft) @@ -320,12 +320,12 @@ def test_toy_p2e2e2p(actx_factory, case): errors = [] src_pot = t.PointSources(ctx, src, weights=np.array([1.])) - pot_actual = src_pot.eval(tgt).item() + pot_actual = src_pot.eval(actx, tgt).item() for order in ORDERS_P2E2E2P: - expn = case.expansion1(src_pot, case.center1, order=order) - expn2 = case.expansion2(expn, case.center2, order=order) - pot_p2e2e2p = expn2.eval(tgt).item() + expn = case.expansion1(actx, src_pot, case.center1, order=order) + expn2 = case.expansion2(actx, expn, case.center2, order=order) + pot_p2e2e2p = expn2.eval(actx, tgt).item() errors.append(np.abs(pot_actual - pot_p2e2e2p)) conv_factor = approx_convergence_factor(1 + np.array(ORDERS_P2E2E2P), errors) diff --git a/test/test_qbx.py b/test/test_qbx.py index 3ad439b9..e50e0a08 100644 --- a/test/test_qbx.py +++ b/test/test_qbx.py @@ -63,7 +63,7 @@ def test_direct_qbx_vs_eigval(actx_factory, expn_class, visualize=False): from sumpy.qbx import LayerPotential - lpot = LayerPotential(actx.context, + lpot = LayerPotential( 
expansion=expn_class(lknl, order), target_kernels=(lknl,), source_kernels=(lknl,)) @@ -94,13 +94,14 @@ def test_direct_qbx_vs_eigval(actx_factory, expn_class, visualize=False): expansion_radii = actx.from_numpy(radius * np.ones(n)) strengths = (actx.from_numpy(sigma * h),) - evt, (result_qbx,) = lpot( - actx.queue, + result_qbx, = lpot( + actx, targets, sources, centers, strengths, expansion_radii=expansion_radii) result_qbx = actx.to_numpy(result_qbx) - eocrec.add_data_point(h, np.max(np.abs(result_ref - result_qbx))) + error = np.linalg.norm(result_ref - result_qbx, np.inf) + eocrec.add_data_point(h, error) logger.info("eoc:\n%s", eocrec) @@ -133,10 +134,14 @@ def test_direct_qbx_vs_eigval_with_tgt_deriv( from sumpy.qbx import LayerPotential - lpot_dx = LayerPotential(actx.context, expansion=expn_class(lknl, order), - target_kernels=(AxisTargetDerivative(0, lknl),), source_kernels=(lknl,)) - lpot_dy = LayerPotential(actx.context, expansion=expn_class(lknl, order), - target_kernels=(AxisTargetDerivative(1, lknl),), source_kernels=(lknl,)) + lpot_dx = LayerPotential( + expansion=expn_class(lknl, order), + target_kernels=(AxisTargetDerivative(0, lknl),), + source_kernels=(lknl,)) + lpot_dy = LayerPotential( + expansion=expn_class(lknl, order), + target_kernels=(AxisTargetDerivative(1, lknl),), + source_kernels=(lknl,)) mode_nr = 15 @@ -165,12 +170,12 @@ def test_direct_qbx_vs_eigval_with_tgt_deriv( expansion_radii = actx.from_numpy(radius * np.ones(n)) strengths = (actx.from_numpy(sigma * h),) - evt, (result_qbx_dx,) = lpot_dx( - actx.queue, + result_qbx_dx, = lpot_dx( + actx, targets, sources, centers, strengths, expansion_radii=expansion_radii) - evt, (result_qbx_dy,) = lpot_dy( - actx.queue, + result_qbx_dy, = lpot_dy( + actx, targets, sources, centers, strengths, expansion_radii=expansion_radii) @@ -180,7 +185,8 @@ def test_direct_qbx_vs_eigval_with_tgt_deriv( normals = unit_circle result_qbx = normals[0] * result_qbx_dx + normals[1] * result_qbx_dy - eocrec.add_data_point(h, np.max(np.abs(result_ref - result_qbx))) + error = np.linalg.norm(result_ref - result_qbx, np.inf) + eocrec.add_data_point(h, error) if expn_class is not LineTaylorLocalExpansion: logger.info("eoc:\n%s", eocrec) diff --git a/test/test_tools.py b/test/test_tools.py index 09c88b78..035607ab 100644 --- a/test/test_tools.py +++ b/test/test_tools.py @@ -90,7 +90,7 @@ def test_fft(actx_factory, size): out = fft(inp) fft_func = loopy_fft(inp.shape, inverse=False, complex_dtype=inp.dtype.type) - evt, (out_dev,) = fft_func(actx.queue, y=inp_dev) + out_dev = actx.call_loopy(fft_func, y=inp_dev)["x"] assert np.allclose(actx.to_numpy(out_dev), out)
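
Note (illustrative, not part of the patch): the hunks above migrate sumpy's computational kernels to a single calling convention in which wrappers are constructed without a CL context, take the array context as their first call argument, and return their results directly instead of an (event, results) pair. Below is a minimal sketch of the new P2P usage; the kernel choice, point counts, and array-context construction are assumptions made for the example, not code from this patch.

    import numpy as np
    import pyopencl as cl

    from sumpy import P2P
    from sumpy.kernel import LaplaceKernel
    from sumpy.array_context import PyOpenCLArrayContext

    # Illustrative setup; any PyOpenCL-based array context should do.
    cl_context = cl.create_some_context()
    queue = cl.CommandQueue(cl_context)
    actx = PyOpenCLArrayContext(queue, force_device_scalars=True)

    lknl = LaplaceKernel(2)
    p2p = P2P([lknl], exclude_self=False)   # no CL context in the constructor

    rng = np.random.default_rng(7)
    targets = actx.from_numpy(rng.random(size=(2, 50)))
    sources = actx.from_numpy(rng.random(size=(2, 50)))
    weights = actx.from_numpy(rng.random(50))

    # Array context first; potentials are returned directly.
    pot, = p2p(actx, targets, sources, (weights,))
    print(actx.to_numpy(pot).shape)  # (50,)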
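
The QBX LayerPotential follows the same pattern (compare the test_qbx and test_matrixgen hunks). A sketch under the same assumptions, with made-up unit-circle geometry standing in for the test fixtures:

    import numpy as np
    import pyopencl as cl

    from sumpy.array_context import PyOpenCLArrayContext
    from sumpy.kernel import LaplaceKernel
    from sumpy.expansion.local import LineTaylorLocalExpansion
    from sumpy.qbx import LayerPotential

    cl_context = cl.create_some_context()
    queue = cl.CommandQueue(cl_context)
    actx = PyOpenCLArrayContext(queue, force_device_scalars=True)

    lknl = LaplaceKernel(2)
    lpot = LayerPotential(
        expansion=LineTaylorLocalExpansion(lknl, 4),
        source_kernels=(lknl,),
        target_kernels=(lknl,))

    # Illustrative geometry: sources and targets on the unit circle, QBX
    # centers pulled inside the curve by the expansion radius.
    n = 200
    t = np.linspace(0, 2*np.pi, n, endpoint=False)
    unit = np.array([np.cos(t), np.sin(t)])
    radius = 0.1

    sources = actx.from_numpy(unit)
    targets = actx.from_numpy(unit)
    centers = actx.from_numpy((1 - radius) * unit)
    expansion_radii = actx.from_numpy(radius * np.ones(n))

    sigma = np.cos(3*t)
    strengths = (actx.from_numpy(sigma * 2*np.pi/n),)

    # Same convention: array context first, results returned directly.
    result, = lpot(actx, targets, sources, centers, strengths,
                   expansion_radii=expansion_radii)
    print(actx.to_numpy(result).shape)  # (200,)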