From f3e3b531fe4c0d33d70928b101fb5d445e4174a8 Mon Sep 17 00:00:00 2001 From: PeixuanZuo <94887879+PeixuanZuo@users.noreply.github.com> Date: Tue, 20 Feb 2024 10:31:39 +0800 Subject: [PATCH 01/68] Update build directory clean up stage for python package pipeline (#19553) Fix to make clean up stage take effect. If the `SourceFolder ` is empty, the task deletes files from the root folder of the repository as though [$(Build.SourcesDirectory)](https://learn.microsoft.com/en-us/azure/devops/pipelines/build/variables) was specified. --- .../component-governance-component-detection-steps.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/component-governance-component-detection-steps.yml b/tools/ci_build/github/azure-pipelines/templates/component-governance-component-detection-steps.yml index c2ef565a6e9ee..f1418e75bffa2 100644 --- a/tools/ci_build/github/azure-pipelines/templates/component-governance-component-detection-steps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/component-governance-component-detection-steps.yml @@ -5,10 +5,12 @@ parameters: default: 'succeeded' # could be 'ci_only', 'always', 'succeeded' steps: -- ${{ if eq(variables['System.TeamProject'], 'Lotus') }}: +- ${{ if eq(variables['System.TeamProject'], 'Lotus') }}: - task: DeleteFiles@1 inputs: - contents: $(Build.BinariesDirectory)/* + SourceFolder: '$(Build.BinariesDirectory)' + contents: | + **/* displayName: 'Clean up build directory' - task: ms.vss-governance-buildtask.governance-build-task-component-detection.ComponentGovernanceComponentDetection@0 From e832562d70685ffeaab7e3bfa20cd5e9aec916a3 Mon Sep 17 00:00:00 2001 From: Markus Tavenrath Date: Tue, 20 Feb 2024 09:06:03 +0100 Subject: [PATCH 02/68] Fix invalid usage of designated initializers. (#19497) ### Description I've replaces all ocurances of C++ designated initializers in the CUDA NHWC Tests by member initialization. ### Motivation and Context C++ designated initializers have been introduced in C++ 20. Yet GCC accepts designated initializers in C++17 which is the standard used to compile onnxruntime. Yet MSVC is standard conform and accepts this feature starting C++20 which leads to compile failures on Windows without this change. --- .../test/providers/cuda/nhwc/conv_test.cc | 23 +++++++--- .../cuda/nhwc/conv_transpose_test.cc | 40 +++++++++------- .../providers/cuda/nhwc/nhwc_cuda_helper.h | 6 ++- .../test/providers/cuda/nhwc/norm_test.cc | 7 ++- .../test/providers/cuda/nhwc/pool_test.cc | 46 ++++++++++--------- 5 files changed, 72 insertions(+), 50 deletions(-) diff --git a/onnxruntime/test/providers/cuda/nhwc/conv_test.cc b/onnxruntime/test/providers/cuda/nhwc/conv_test.cc index 13d4546d669e3..b6a760f7041ad 100644 --- a/onnxruntime/test/providers/cuda/nhwc/conv_test.cc +++ b/onnxruntime/test/providers/cuda/nhwc/conv_test.cc @@ -9,8 +9,8 @@ namespace test { template struct ConvOp { - const std::vector input_dims; - const std::vector kernel_shape; + std::vector input_dims; + std::vector kernel_shape; int64_t channels; int64_t group = 1; bool bias = false; @@ -52,20 +52,31 @@ struct ConvOp { }; TYPED_TEST(CudaNhwcTypedTest, ConvNhwcBias) { - auto op = ConvOp{.input_dims = {1, 16, 64, 64}, .kernel_shape = {3, 3}, .channels = 16, .bias = true}; + auto op = ConvOp{}; + op.input_dims = {1, 16, 64, 64}; + op.kernel_shape = {3, 3}; + op.channels = 16; + op.bias = true; MAKE_PROVIDERS_EPS_TYPE(TypeParam) } TYPED_TEST(CudaNhwcTypedTest, ConvNhwcGroupNoBias) { - auto op = ConvOp{.input_dims = {1, 16, 64, 64}, .kernel_shape = {3, 3}, .channels = 16, .group = 4}; + auto op = ConvOp{}; + op.input_dims = {1, 16, 64, 64}; + op.kernel_shape = {3, 3}; + op.channels = 16; + op.group = 4; MAKE_PROVIDERS_EPS_TYPE(TypeParam) } TYPED_TEST(CudaNhwcTypedTest, ConvNhwcPadding) { - auto op = - ConvOp{.input_dims = {2, 4, 64, 64}, .kernel_shape = {3, 3}, .channels = 4, .padding = {4, 4, 4, 4}}; + auto op = ConvOp{}; + op.input_dims = {2, 4, 64, 64}; + op.kernel_shape = {3, 3}; + op.channels = 4; + op.padding = {4, 4, 4, 4}; MAKE_PROVIDERS_EPS_TYPE(TypeParam) } diff --git a/onnxruntime/test/providers/cuda/nhwc/conv_transpose_test.cc b/onnxruntime/test/providers/cuda/nhwc/conv_transpose_test.cc index 6514feadf0ff7..786b2cb4cedc4 100644 --- a/onnxruntime/test/providers/cuda/nhwc/conv_transpose_test.cc +++ b/onnxruntime/test/providers/cuda/nhwc/conv_transpose_test.cc @@ -9,8 +9,8 @@ namespace test { template struct ConvTransposeOp { - const std::vector input_dims; - const std::vector kernel_shape; + std::vector input_dims; + std::vector kernel_shape; int64_t channels; int64_t group = 1; bool bias = false; @@ -60,15 +60,21 @@ struct ConvTransposeOp { }; TYPED_TEST(CudaNhwcTypedTest, ConvTransposeNhwcGroupNoBias) { - auto op = - ConvTransposeOp{.input_dims = {8, 8, 32, 32}, .kernel_shape = {3, 3}, .channels = 16, .group = 4}; + auto op = ConvTransposeOp{}; + op.input_dims = {8, 8, 32, 32}; + op.kernel_shape = {3, 3}; + op.channels = 16; + op.group = 4; MAKE_PROVIDERS_EPS_TYPE(TypeParam) } TYPED_TEST(CudaNhwcTypedTest, ConvTransposeNhwcBias) { - auto op = - ConvTransposeOp{.input_dims = {1, 8, 80, 80}, .kernel_shape = {5, 5}, .channels = 16, .bias = true}; + auto op = ConvTransposeOp{}; + op.input_dims = {1, 8, 80, 80}; + op.kernel_shape = {5, 5}; + op.channels = 16; + op.bias = true; if (HasCudaEnvironment(800)) { MAKE_PROVIDERS_EPS(1e-2) @@ -78,21 +84,23 @@ TYPED_TEST(CudaNhwcTypedTest, ConvTransposeNhwcBias) { } TYPED_TEST(CudaNhwcTypedTest, ConvTransposeNhwcPad) { - auto op = ConvTransposeOp{.input_dims = {1, 16, 8, 8}, - .kernel_shape = {3, 3}, - .channels = 32, - .padding = {2, 2, 2, 2}, - .output_padding = {}}; + auto op = ConvTransposeOp{}; + op.input_dims = {1, 16, 8, 8}; + op.kernel_shape = {3, 3}; + op.channels = 32; + op.padding = {2, 2, 2, 2}; + op.output_padding = {}; MAKE_PROVIDERS_EPS_TYPE(TypeParam) } TYPED_TEST(CudaNhwcTypedTest, ConvTransposeNhwcOutPad) { - auto op = ConvTransposeOp{.input_dims = {1, 32, 8, 8}, - .kernel_shape = {3, 3}, - .channels = 32, - .strides = {2, 2}, - .output_padding = {1, 1, 1, 1}}; + auto op = ConvTransposeOp{}; + op.input_dims = {1, 32, 8, 8}; + op.kernel_shape = {3, 3}; + op.channels = 32; + op.strides = {2, 2}; + op.output_padding = {1, 1, 1, 1}; MAKE_PROVIDERS_EPS_TYPE(TypeParam) } diff --git a/onnxruntime/test/providers/cuda/nhwc/nhwc_cuda_helper.h b/onnxruntime/test/providers/cuda/nhwc/nhwc_cuda_helper.h index 2c942bb790096..82b6a286409cd 100644 --- a/onnxruntime/test/providers/cuda/nhwc/nhwc_cuda_helper.h +++ b/onnxruntime/test/providers/cuda/nhwc/nhwc_cuda_helper.h @@ -16,11 +16,13 @@ #define MAKE_PROVIDERS_EPS(eps) \ std::vector> execution_providers; \ - OrtCUDAProviderOptionsV2 nhwc = {.prefer_nhwc = true}; \ + OrtCUDAProviderOptionsV2 nhwc{}; \ + nhwc.prefer_nhwc = true; \ execution_providers.push_back(CudaExecutionProviderWithOptions(&nhwc)); \ \ double error_tolerance = eps; \ - OrtCUDAProviderOptionsV2 nchw = {.prefer_nhwc = false}; \ + OrtCUDAProviderOptionsV2 nchw{}; \ + nchw.prefer_nhwc = false; \ auto source_ep = CudaExecutionProviderWithOptions(&nchw); \ auto test = op.get_test(); \ test->CompareEPs(std::move(source_ep), execution_providers, error_tolerance); diff --git a/onnxruntime/test/providers/cuda/nhwc/norm_test.cc b/onnxruntime/test/providers/cuda/nhwc/norm_test.cc index 52da8ba557c2d..40f69e3bd5b4f 100644 --- a/onnxruntime/test/providers/cuda/nhwc/norm_test.cc +++ b/onnxruntime/test/providers/cuda/nhwc/norm_test.cc @@ -9,7 +9,7 @@ namespace test { template struct BatchNormOp { - const std::vector input_dims; + std::vector input_dims; std::unique_ptr get_test() { // create rand inputs @@ -40,9 +40,8 @@ struct BatchNormOp { }; TYPED_TEST(CudaNhwcTypedTest, BatchNormNhwc) { - auto op = BatchNormOp{ - .input_dims = {4, 16, 64, 64}, - }; + auto op = BatchNormOp{}; + op.input_dims = {4, 16, 64, 64}; MAKE_PROVIDERS() } diff --git a/onnxruntime/test/providers/cuda/nhwc/pool_test.cc b/onnxruntime/test/providers/cuda/nhwc/pool_test.cc index e0d59901da80c..426170b9588f1 100644 --- a/onnxruntime/test/providers/cuda/nhwc/pool_test.cc +++ b/onnxruntime/test/providers/cuda/nhwc/pool_test.cc @@ -9,9 +9,9 @@ namespace test { template struct PoolOp { - const std::string pooling_type; - const std::vector input_dims; - const std::vector kernel_shape; + std::string pooling_type; + std::vector input_dims; + std::vector kernel_shape; int64_t channels; int64_t group = 1; std::vector strides = {1, 1}; @@ -41,22 +41,21 @@ struct PoolOp { }; TYPED_TEST(CudaNhwcTypedTest, AveragePoolNhwc) { - auto op = PoolOp{ - .pooling_type = "AveragePool", - .input_dims = {1, 16, 64, 64}, - .kernel_shape = {3, 3}, - .channels = 16, - }; + auto op = PoolOp{}; + op.pooling_type = "AveragePool"; + op.input_dims = {1, 16, 64, 64}; + op.kernel_shape = {3, 3}; + op.channels = 16; + MAKE_PROVIDERS() } TYPED_TEST(CudaNhwcTypedTest, MaxPoolNhwc) { - auto op = PoolOp{ - .pooling_type = "MaxPool", - .input_dims = {1, 16, 64, 64}, - .kernel_shape = {3, 3}, - .channels = 16, - }; + auto op = PoolOp{}; + op.pooling_type = "MaxPool"; + op.input_dims = {1, 16, 64, 64}; + op.kernel_shape = {3, 3}; + op.channels = 16; MAKE_PROVIDERS() } @@ -72,21 +71,24 @@ TYPED_TEST(CudaNhwcTypedTest, GlobalMaxPoolNhwc) { test->AddOutput("Y", output_dims, output_data); std::vector> execution_providers; - OrtCUDAProviderOptionsV2 nhwc = {.prefer_nhwc = true}; + OrtCUDAProviderOptionsV2 nhwc{}; + nhwc.prefer_nhwc = true; execution_providers.push_back(CudaExecutionProviderWithOptions(&nhwc)); double error_tolerance = 1e-3; - OrtCUDAProviderOptionsV2 nchw = {.prefer_nhwc = false}; + OrtCUDAProviderOptionsV2 nchw{}; + nchw.prefer_nhwc = false; auto source_ep = CudaExecutionProviderWithOptions(&nchw); test->CompareEPs(std::move(source_ep), execution_providers, error_tolerance); } TYPED_TEST(CudaNhwcTypedTest, AveragePoolNhwcPad) { - auto op = PoolOp{.pooling_type = "AveragePool", - .input_dims = {1, 16, 64, 64}, - .kernel_shape = {3, 3}, - .channels = 16, - .padding = {2, 2, 2, 2}}; + auto op = PoolOp{}; + op.pooling_type = "AveragePool"; + op.input_dims = {1, 16, 64, 64}; + op.kernel_shape = {3, 3}; + op.channels = 16; + op.padding = {2, 2, 2, 2}; MAKE_PROVIDERS() } From 7efb0dbe12cf8736d97dcc3b8f41eb96c5c34719 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= Date: Tue, 20 Feb 2024 17:22:44 +0100 Subject: [PATCH 03/68] add option DefaultTensorType to specify the default tensor type to quantize (#19455) ### Description The current quantization tool relies on shape inference to provide the type of every intermediate tensor, then the tool knows which type it must dequantize into (float32, float16). However, this information is not available if shape inference fails. That happens every time the model include an operator from a custom domain such as com.microsoft. This PR introduces an extra option `DefaultTensorType` as a fall back when the quantizer cannot find the type it needs. ### Motivation and Context This fixes issue #19409. --- .../tools/quantization/onnx_quantizer.py | 25 ++++- .../tools/transformers/quantize_helper.py | 3 +- .../test_quantizer_shape_inference.py | 92 +++++++++++++++++++ 3 files changed, 115 insertions(+), 5 deletions(-) create mode 100644 onnxruntime/test/python/quantization/test_quantizer_shape_inference.py diff --git a/onnxruntime/python/tools/quantization/onnx_quantizer.py b/onnxruntime/python/tools/quantization/onnx_quantizer.py index ecfbaa569ca0a..9450426f12444 100644 --- a/onnxruntime/python/tools/quantization/onnx_quantizer.py +++ b/onnxruntime/python/tools/quantization/onnx_quantizer.py @@ -385,7 +385,7 @@ def add_new_nodes(self, nodes): def quantize_model(self): if self.has_QDQ_nodes(): logging.warning( - "Please check if the model is already quantized." + "Please check if the model is already quantized. " "Note you don't need to quantize a QAT model. OnnxRuntime support to run QAT model directly." ) @@ -442,6 +442,23 @@ def is_valid_quantize_weight(self, weight_name): return False return self.parent.is_valid_quantize_weight(weight_name) + def _get_default_tensor_type(self, tensor_name): + if "DefaultTensorType" in self.extra_options: + logging.info( + "get_tensor_type returns DefaultTensorType for tensor name %r, use %d", + tensor_name, + self.extra_options["DefaultTensorType"], + ) + return self.extra_options["DefaultTensorType"] + raise RuntimeError( + f"Unable to find data type for weight_name={tensor_name!r}. " + f"shape_inference failed to return a type probably this node is " + f"from a different domain or using an input produced by such an operator. " + f"This may happen if you quantize a model already quantized. " + f"You may use extra_options `DefaultTensorType` to indicate " + f"the default weight type, usually `onnx.TensorProto.FLOAT`." + ) + def get_tensor_type(self, tensor_name, mandatory=False): weight = find_by_name(tensor_name, self.model.initializer()) if weight is not None: @@ -450,11 +467,11 @@ def get_tensor_type(self, tensor_name, mandatory=False): vi = self.value_infos[tensor_name] if vi.type.HasField("tensor_type"): if mandatory and vi.type.tensor_type.elem_type == 0: - raise RuntimeError(f"Unable to find data type for weight_name={tensor_name!r}") + return self._get_default_tensor_type(tensor_name) return vi.type.tensor_type.elem_type if (not self.enable_subgraph_quantization) or (self.parent is None): if mandatory: - raise RuntimeError(f"Unable to find data type for weight_name={tensor_name!r}") + return self._get_default_tensor_type(tensor_name) return None otype = self.parent.is_valid_quantize_weight(tensor_name) if otype is not None: @@ -464,7 +481,7 @@ def get_tensor_type(self, tensor_name, mandatory=False): if res is not None: return res if mandatory: - raise RuntimeError(f"Unable to find data type for weight_name={tensor_name!r}") + return self._get_default_tensor_type(tensor_name) return None def is_float_tensor(self, tensor_name): diff --git a/onnxruntime/python/tools/transformers/quantize_helper.py b/onnxruntime/python/tools/transformers/quantize_helper.py index a449e881ad361..6a25196dbc24c 100644 --- a/onnxruntime/python/tools/transformers/quantize_helper.py +++ b/onnxruntime/python/tools/transformers/quantize_helper.py @@ -7,7 +7,7 @@ import logging import os -import onnx # noqa: F401 +import onnx import torch from transformers.modeling_utils import Conv1D @@ -69,6 +69,7 @@ def quantize_onnx_model(onnx_model_path, quantized_model_path, use_external_data onnx_model_path, quantized_model_path, use_external_data_format=use_external_data_format, + extra_options={"DefaultTensorType": onnx.TensorProto.FLOAT}, ) logger.info(f"quantized model saved to:{quantized_model_path}") # TODO: inlcude external data in total model size. diff --git a/onnxruntime/test/python/quantization/test_quantizer_shape_inference.py b/onnxruntime/test/python/quantization/test_quantizer_shape_inference.py new file mode 100644 index 0000000000000..2b5d1f36070e5 --- /dev/null +++ b/onnxruntime/test/python/quantization/test_quantizer_shape_inference.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. +# -------------------------------------------------------------------------- +import unittest + +import numpy as np +import onnx +import onnx.helper as oh +import onnx.numpy_helper as onh + +from onnxruntime.quantization.onnx_quantizer import ONNXQuantizer +from onnxruntime.quantization.quant_utils import QuantizationMode, QuantType + + +class TestQuantizerShapeInference(unittest.TestCase): + def test_com_microsoft(self): + model = oh.make_model( + oh.make_graph( + [ + oh.make_node("MatMul", ["X", "W1"], ["T1"]), + oh.make_node("FusedMatMul", ["T1", "W2"], ["T2"], domain="com.microsoft"), + oh.make_node("MatMul", ["T2", "W3"], ["T3"]), + oh.make_node("MatMul", ["T3", "W4"], ["Y"]), + ], + "name", + [oh.make_tensor_value_info("X", onnx.TensorProto.FLOAT, [1, 4])], + [oh.make_tensor_value_info("Y", onnx.TensorProto.FLOAT, [1, 4])], + [ + onh.from_array(np.random.randn(4, 4).astype(np.float32), "W1"), + onh.from_array(np.random.randn(4, 4).astype(np.float32), "W2"), + onh.from_array(np.random.randn(4, 4).astype(np.float32), "W3"), + onh.from_array(np.random.randn(4, 4).astype(np.float32), "W4"), + ], + ), + opset_imports=[oh.make_opsetid("", 18), oh.make_opsetid("com.microsoft", 1)], + ) + model_shaped = onnx.shape_inference.infer_shapes(model) + shaped_results = set(t.name for t in model_shaped.graph.value_info) + # every result after T1 depends on T2 coming from a node com.microsoft, + # shape_inference cannot go beyond this point + self.assertEqual(shaped_results, {"T1"}) + + # first try: checks it raises an exception + quantizer = ONNXQuantizer( + model, + False, # per_channel + False, # reduce_range + QuantizationMode.IntegerOps, # mode + False, # static + QuantType.QInt8, # weight_type, + QuantType.QUInt8, # dynamic activation only supports uint8 + None, + [], # nodes_to_quantize, + [], # nodes_to_exclude + ["MatMul"], # op_types_to_quantize, + {"MatMulConstBOnly": True}, # extra_options, + # {'DefaultTensorType': 1, } + ) + + with self.assertRaises(RuntimeError) as e: + quantizer.quantize_model() + self.assertIn("Unable to find data type for weight_name=", str(e)) + + # second try: checks it works + quantizer = ONNXQuantizer( + model, + False, # per_channel + False, # reduce_range + QuantizationMode.IntegerOps, # mode + False, # static + QuantType.QInt8, # weight_type, + QuantType.QUInt8, # dynamic activation only supports uint8 + None, + [], # nodes_to_quantize, + [], # nodes_to_exclude + ["MatMul"], # op_types_to_quantize, + { + "MatMulConstBOnly": True, + "DefaultTensorType": 1, + }, + ) + + model = quantizer.quantize_model() + ops = {n.op_type for n in model.graph.node} + self.assertEqual(ops, {"Cast", "FusedMatMul", "MatMulInteger", "DynamicQuantizeLinear", "Mul"}) + + +if __name__ == "__main__": + unittest.main(verbosity=2) From 1b48054e1b7991ccef664fbedd659ec95d0e7ca7 Mon Sep 17 00:00:00 2001 From: Jiajie Hu Date: Wed, 21 Feb 2024 01:24:34 +0800 Subject: [PATCH 04/68] [js/webgpu] Create Split indices helpers by rank, not by shape (#19554) ### Description This is required to make shape uniforms really work. ### Motivation and Context The bug was unveiled in a model with multiple Split nodes. The later nodes would try to reuse a previous pipeline cache, while the old shapes were hardcoded as constants in cache. --- js/web/lib/wasm/jsep/webgpu/ops/split.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/split.ts b/js/web/lib/wasm/jsep/webgpu/ops/split.ts index 14d6f37927590..a09ac78b17006 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/split.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/split.ts @@ -68,7 +68,7 @@ const createSplitProgramInfo = (inputs: readonly TensorView[], attributes: Split const dataType = inputs[0].dataType; const axis = ShapeUtil.normalizeAxis(attributes.axis, inputShape.length); const outputs = new Array(attributes.numOutputs); - const input = inputVariable('input', dataType, inputShape); + const input = inputVariable('input', dataType, inputShape.length); const sizeInSplitAxis = new Array(attributes.numOutputs); const outputsTensorInfo: TensorInfo[] = []; const outputShapes: number[][] = []; @@ -80,7 +80,7 @@ const createSplitProgramInfo = (inputs: readonly TensorView[], attributes: Split const outputShape = inputShape.slice(); outputShape[attributes.axis] = attributes.splitSizes[i]; outputShapes.push(outputShape); - outputs[i] = outputVariable(`output${i}`, dataType, outputShape); + outputs[i] = outputVariable(`output${i}`, dataType, outputShape.length); outputsTensorInfo.push({dims: outputShapes[i], dataType: inputs[0].dataType}); } programUniforms.push( From 3c49aacd5667b320a4e02626a176098f7423d7c0 Mon Sep 17 00:00:00 2001 From: Sheil Kumar Date: Tue, 20 Feb 2024 13:13:40 -0800 Subject: [PATCH 05/68] Disable __cpuid check on arm64 builds as intrinsic is not available (#19574) Disable __cpuid check on arm64 builds as intrinsic is not available Motivation Breaking the arm64 build. Co-authored-by: Sheil Kumar --- winml/lib/Api/HardwareCoreEnumerator.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/winml/lib/Api/HardwareCoreEnumerator.cpp b/winml/lib/Api/HardwareCoreEnumerator.cpp index fa069c7fb66a7..b6b44690f4f6c 100644 --- a/winml/lib/Api/HardwareCoreEnumerator.cpp +++ b/winml/lib/Api/HardwareCoreEnumerator.cpp @@ -84,6 +84,7 @@ uint32_t HardwareCoreEnumerator::DefaultIntraOpNumThreads() { // # of logical cores = # of P cores x 2 (if hyper threading is enabled) + # of E cores + # of Soc Cores. auto cores = GetNumberOPhysicalAndEngineeringCores(); +#if !defined(_M_ARM64) && !defined(__aarch64__) const int kVendorID_Intel[3] = {0x756e6547, 0x6c65746e, 0x49656e69}; // "GenuntelineI" int regs_leaf0[4]; int regs_leaf7[4]; @@ -100,6 +101,7 @@ uint32_t HardwareCoreEnumerator::DefaultIntraOpNumThreads() { // On Intel Hybrid processors, numSocCores == cores.Num2CacheCores return cores.PhysicalCores - cores.Num2CacheCores; } +#endif return cores.PhysicalCores; } From ec9c8cbdc9686ccda6553674d6aab61cfd245cf0 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Wed, 21 Feb 2024 07:40:35 +1000 Subject: [PATCH 06/68] Use xcode parallel build flags to speed up iOS CI that is timing out (#19570) ### Description Provide specific xcodebuild flags instead of depending on cmake to do the right thing. This built in just over an hour with a ccache miss. Previous CIs with a ccache miss were timing out after 150 minutes. ### Motivation and Context --- tools/ci_build/build.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 244bebd81474d..5b715bb29e5a1 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1631,9 +1631,11 @@ def generate_build_tree( [ *temp_cmake_args, f"-DCMAKE_BUILD_TYPE={config}", - f"-DCMAKE_PREFIX_PATH={build_dir}/{config}/installed" - if preinstalled_dir.exists() and not (args.arm64 or args.arm64ec or args.arm) - else "", + ( + f"-DCMAKE_PREFIX_PATH={build_dir}/{config}/installed" + if preinstalled_dir.exists() and not (args.arm64 or args.arm64ec or args.arm) + else "" + ), ], cwd=config_build_dir, cuda_home=cuda_home, @@ -1667,8 +1669,11 @@ def build_targets(args, cmake_path, build_dir, configs, num_parallel_jobs, targe f"/p:CL_MPCount={num_parallel_jobs}", ] elif args.cmake_generator == "Xcode": - # CMake will generate correct build tool args for Xcode - cmd_args += ["--parallel", str(num_parallel_jobs)] + build_tool_args += [ + "-parallelizeTargets", + "-jobs", + str(num_parallel_jobs), + ] else: build_tool_args += [f"-j{num_parallel_jobs}"] From 7a5860e4909387448cb51351d3af50933238ba10 Mon Sep 17 00:00:00 2001 From: Jake Mathern Date: Tue, 20 Feb 2024 13:41:40 -0800 Subject: [PATCH 07/68] Fix cmake function duplicate lib (#19547) ### Description Fixes cmake function definition in winml.cmake to copy link flags. ### Motivation and Context XFGCheck errors in WindowsAI because this function does not transfer linker flags --- cmake/winml.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/winml.cmake b/cmake/winml.cmake index 268ee3960e75a..57cecd3e66adb 100644 --- a/cmake/winml.cmake +++ b/cmake/winml.cmake @@ -827,6 +827,7 @@ if (winml_is_inbox) get_target_property(compile_options ${target} COMPILE_OPTIONS) get_target_property(include_directories ${target} INCLUDE_DIRECTORIES) get_target_property(link_libraries ${target} LINK_LIBRARIES) + get_target_property(link_flags ${target} LINK_FLAGS) get_target_property(link_options ${target} LINK_OPTIONS) add_library(${new_target} SHARED ${sources}) @@ -835,6 +836,7 @@ if (winml_is_inbox) target_compile_options(${new_target} PRIVATE ${compile_options}) target_include_directories(${new_target} PRIVATE ${include_directories}) target_link_libraries(${new_target} PRIVATE ${link_libraries}) + set_property(TARGET ${new_target} PROPERTY LINK_FLAGS "${link_flags}") target_link_options(${new_target} PRIVATE ${link_options}) endfunction() From 97ff17c2cbb6ee6f27c052e9c4302c70a41af485 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 20 Feb 2024 17:02:11 -0800 Subject: [PATCH 08/68] update script of run CI for external PRs to add "Big Models" (#19576) ### Description update script of run CI for external PRs to add "Big Models" --- tools/python/run_CIs_for_external_pr.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/python/run_CIs_for_external_pr.py b/tools/python/run_CIs_for_external_pr.py index 7a77839c4a4e7..df4e70b1e51fe 100644 --- a/tools/python/run_CIs_for_external_pr.py +++ b/tools/python/run_CIs_for_external_pr.py @@ -93,6 +93,8 @@ def main(): # checks "onnxruntime-python-checks-ci-pipeline", "onnxruntime-binary-size-checks-ci-pipeline", + # big models + "Big Models", # not currently required, but running ensures we're hitting all mobile platforms "Android CI Pipeline", "iOS CI Pipeline", From 3fe2c137ee5923ee369062453d528fe0e33bf4bc Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 20 Feb 2024 17:23:01 -0800 Subject: [PATCH 09/68] [js] small fix to workaround formatter (#19400) ### Description Rename shader variable names to snake_case naming and also to avoid formatter behaving inconsistently in win/linux. --- js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts b/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts index 3f73d9cb7c5bc..d5f97213e49ce 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts @@ -85,28 +85,28 @@ const createLayerNormProgramInfo = ${shaderHelper.mainStart()} ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.norm_count')} let offset = global_idx * uniforms.norm_size_vectorized; - var meanVector = ${fillVector('f32', components)}; - var meanSquareVector = ${fillVector('f32', components)}; + var mean_vector = ${fillVector('f32', components)}; + var mean_square_vector = ${fillVector('f32', components)}; for (var h: u32 = 0u; h < uniforms.norm_size_vectorized; h++) { let value = ${castToF32(dataType, components, 'x[h + offset]')}; - meanVector += value; - meanSquareVector += value * value; + mean_vector += value; + mean_square_vector += value * value; } - let mean = ${sumVector('meanVector', components)} / uniforms.norm_size; - let invStdDev = - inverseSqrt(${sumVector('meanSquareVector', components)} / uniforms.norm_size - mean * mean + uniforms.epsilon); + let mean = ${sumVector('mean_vector', components)} / uniforms.norm_size; + let inv_std_dev = inverseSqrt(${ + sumVector('mean_square_vector', components)} / uniforms.norm_size - mean * mean + uniforms.epsilon); for (var j: u32 = 0; j < uniforms.norm_size_vectorized; j++) { let f32input = ${castToF32(dataType, components, 'x[j + offset]')}; let f32scale = ${castToF32(dataType, components, 'scale[j]')}; - output[j + offset] = ${variables[0].type.value}((f32input - mean) * invStdDev * f32scale + output[j + offset] = ${variables[0].type.value}((f32input - mean) * inv_std_dev * f32scale ${bias ? `+ ${castToF32(dataType, components, 'bias[j]')}` : ''} ); } ${hasMeanDataOutput ? 'mean_data_output[global_idx] = mean' : ''}; - ${hasInvStdOutput ? 'inv_std_output[global_idx] = invStdDev' : ''}; + ${hasInvStdOutput ? 'inv_std_output[global_idx] = inv_std_dev' : ''}; }`; }; const outputs = [{dims: outputShape, dataType: inputs[0].dataType}]; From 70567a4b3a8bc74fb0f1a9ed9ea5a5be6b99b378 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 20 Feb 2024 17:33:21 -0800 Subject: [PATCH 10/68] [js/web] use ApiTensor insteadof onnxjs Tensor in TensorResultValidator (#19358) ### Description use ApiTensor insteadof onnxjs Tensor in TensorResultValidator. Make test runner less depend on onnxjs classes. --- js/web/test/test-runner.ts | 26 +++++++------------ .../unittests/backends/webgl/test-conv-new.ts | 4 ++- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts index b01d474788f25..ecc7d4b4a09a5 100644 --- a/js/web/test/test-runner.ts +++ b/js/web/test/test-runner.ts @@ -39,10 +39,6 @@ const ONNXRUNTIME_THRESHOLD_RELATIVE_ERROR = 1.00001; */ const now = (typeof performance !== 'undefined' && performance.now) ? () => performance.now() : Date.now; -function toInternalTensor(tensor: ort.Tensor): Tensor { - return new Tensor( - tensor.dims, tensor.type as Tensor.DataType, undefined, undefined, tensor.data as Tensor.NumberType); -} function fromInternalTensor(tensor: Tensor): ort.Tensor { return new ort.Tensor(tensor.type, tensor.data as ort.Tensor.DataType, tensor.dims); } @@ -330,6 +326,10 @@ export class TensorResultValidator { } checkTensorResult(actual: Tensor[], expected: Tensor[]): void { + this.checkApiTensorResult(actual.map(fromInternalTensor), expected.map(fromInternalTensor)); + } + + checkApiTensorResult(actual: ort.Tensor[], expected: ort.Tensor[]): void { // check output size expect(actual.length, 'size of output tensors').to.equal(expected.length); @@ -347,10 +347,6 @@ export class TensorResultValidator { } } - checkApiTensorResult(actual: ort.Tensor[], expected: ort.Tensor[]): void { - this.checkTensorResult(actual.map(toInternalTensor), expected.map(toInternalTensor)); - } - checkNamedTensorResult(actual: Record, expected: Test.NamedTensor[]): void { // check output size expect(Object.getOwnPropertyNames(actual).length, 'size of output tensors').to.equal(expected.length); @@ -364,7 +360,7 @@ export class TensorResultValidator { } // This function check whether 2 tensors should be considered as 'match' or not - areEqual(actual: Tensor, expected: Tensor): boolean { + areEqual(actual: ort.Tensor, expected: ort.Tensor): boolean { if (!actual || !expected) { return false; } @@ -392,13 +388,13 @@ export class TensorResultValidator { switch (actualType) { case 'string': - return this.strictEqual(actual.stringData, expected.stringData); + return this.strictEqual(actual.data, expected.data); case 'float32': case 'float64': return this.floatEqual( - actual.numberData as number[] | Float32Array | Float64Array, - expected.numberData as number[] | Float32Array | Float64Array); + actual.data as number[] | Float32Array | Float64Array, + expected.data as number[] | Float32Array | Float64Array); case 'uint8': case 'int8': @@ -409,10 +405,8 @@ export class TensorResultValidator { case 'int64': case 'bool': return TensorResultValidator.integerEqual( - actual.numberData as number[] | Uint8Array | Int8Array | Uint16Array | Int16Array | Uint32Array | - Int32Array, - expected.numberData as number[] | Uint8Array | Int8Array | Uint16Array | Int16Array | Uint32Array | - Int32Array); + actual.data as number[] | Uint8Array | Int8Array | Uint16Array | Int16Array | Uint32Array | Int32Array, + expected.data as number[] | Uint8Array | Int8Array | Uint16Array | Int16Array | Uint32Array | Int32Array); default: throw new Error('type not implemented or not supported'); diff --git a/js/web/test/unittests/backends/webgl/test-conv-new.ts b/js/web/test/unittests/backends/webgl/test-conv-new.ts index 8c186b9b36451..014fc57f21558 100644 --- a/js/web/test/unittests/backends/webgl/test-conv-new.ts +++ b/js/web/test/unittests/backends/webgl/test-conv-new.ts @@ -893,7 +893,9 @@ describe('New Conv tests', () => { const expected = cpuConv( inputTensor, kernelTensor, biasTensor, testData.autoPad, testData.dilations, testData.pads, testData.strides); - if (!validator.areEqual(actual, expected)) { + try { + validator.checkTensorResult([actual], [expected]); + } catch { console.log(actual.dims, `[${actual.numberData.slice(0, 20).join(',')},...]`); console.log(expected.dims, `[${expected.numberData.slice(0, 20).join(',')},...]`); throw new Error('Expected and Actual did not match'); From 6e04e36e3faf2d8115c0962c85b86a6a8b48ac5b Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 20 Feb 2024 17:33:37 -0800 Subject: [PATCH 11/68] [js/common] upgrade tsc in common from 4.9.5 to 5.2.2 (#19317) ### Description upgrade tsc in common from 4.9.5 to 5.2.2 --- js/common/package-lock.json | 106 +++++++++++++++++------------------ js/common/package.json | 4 +- js/common/test/tsconfig.json | 2 +- 3 files changed, 56 insertions(+), 56 deletions(-) diff --git a/js/common/package-lock.json b/js/common/package-lock.json index a5ada877b916a..3988ac80707e0 100644 --- a/js/common/package-lock.json +++ b/js/common/package-lock.json @@ -9,13 +9,13 @@ "version": "1.18.0", "license": "MIT", "devDependencies": { - "typedoc": "^0.23.22" + "typedoc": "^0.25.7" } }, "node_modules/ansi-sequence-parser": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/ansi-sequence-parser/-/ansi-sequence-parser-1.1.0.tgz", - "integrity": "sha512-lEm8mt52to2fT8GhciPCGeCXACSz2UwIN4X2e2LJSnZ5uAbn2/dsYdOmUXq0AtWS5cpAupysIneExOgH0Vd2TQ==", + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/ansi-sequence-parser/-/ansi-sequence-parser-1.1.1.tgz", + "integrity": "sha512-vJXt3yiaUL4UU546s3rPXlsry/RnM730G1+HkpKE012AN0sx1eOrxSu95oKDIonskeLTijMgqWZ3uDEe3NFvyg==", "dev": true }, "node_modules/balanced-match": { @@ -34,9 +34,9 @@ } }, "node_modules/jsonc-parser": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/jsonc-parser/-/jsonc-parser-3.2.0.tgz", - "integrity": "sha512-gfFQZrcTc8CnKXp6Y4/CBT3fTc0OVuDofpre4aEeEpSBPV5X5v4+Vmx+8snU7RLPrNHPKSgLxGo9YuQzz20o+w==", + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/jsonc-parser/-/jsonc-parser-3.2.1.tgz", + "integrity": "sha512-AilxAyFOAcK5wA1+LeaySVBrHsGQvUFCDWXKpZjzaL0PqW+xfBOttn8GNtWKFWqneyMZj41MWF9Kl6iPWLwgOA==", "dev": true }, "node_modules/lunr": { @@ -46,9 +46,9 @@ "dev": true }, "node_modules/marked": { - "version": "4.2.12", - "resolved": "https://registry.npmjs.org/marked/-/marked-4.2.12.tgz", - "integrity": "sha512-yr8hSKa3Fv4D3jdZmtMMPghgVt6TWbk86WQaWhDloQjRSQhMMYCAro7jP7VDJrjjdV8pxVxMssXS8B8Y5DZ5aw==", + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/marked/-/marked-4.3.0.tgz", + "integrity": "sha512-PRsaiG84bK+AMvxziE/lCFss8juXjNaWzVbN5tXAm4XjeaS9NAHhop+PjQxz2A9h8Q4M/xGmzP8vqNwy6JeK0A==", "dev": true, "bin": { "marked": "bin/marked.js" @@ -58,24 +58,24 @@ } }, "node_modules/minimatch": { - "version": "7.4.2", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-7.4.2.tgz", - "integrity": "sha512-xy4q7wou3vUoC9k1xGTXc+awNdGaGVHtFUaey8tiX4H1QRc04DZ/rmDFwNm2EBsuYEhAZ6SgMmYf3InGY6OauA==", + "version": "9.0.3", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.3.tgz", + "integrity": "sha512-RHiac9mvaRw0x3AYRgDC1CxAP7HTcNrrECeA8YYJeWnpo+2Q5CegtZjaotWTWxDG3UeGA1coE05iH1mPjT/2mg==", "dev": true, "dependencies": { "brace-expansion": "^2.0.1" }, "engines": { - "node": ">=10" + "node": ">=16 || 14 >=14.17" }, "funding": { "url": "https://github.com/sponsors/isaacs" } }, "node_modules/shiki": { - "version": "0.14.1", - "resolved": "https://registry.npmjs.org/shiki/-/shiki-0.14.1.tgz", - "integrity": "sha512-+Jz4nBkCBe0mEDqo1eKRcCdjRtrCjozmcbTUjbPTX7OOJfEbTZzlUWlZtGe3Gb5oV1/jnojhG//YZc3rs9zSEw==", + "version": "0.14.7", + "resolved": "https://registry.npmjs.org/shiki/-/shiki-0.14.7.tgz", + "integrity": "sha512-dNPAPrxSc87ua2sKJ3H5dQ/6ZaY8RNnaAqK+t0eG7p0Soi2ydiqbGOTaZCqaYvA/uZYfS1LJnemt3Q+mSfcPCg==", "dev": true, "dependencies": { "ansi-sequence-parser": "^1.1.0", @@ -85,30 +85,30 @@ } }, "node_modules/typedoc": { - "version": "0.23.26", - "resolved": "https://registry.npmjs.org/typedoc/-/typedoc-0.23.26.tgz", - "integrity": "sha512-5m4KwR5tOLnk0OtMaRn9IdbeRM32uPemN9kur7YK9wFqx8U0CYrvO9aVq6ysdZSV1c824BTm+BuQl2Ze/k1HtA==", + "version": "0.25.7", + "resolved": "https://registry.npmjs.org/typedoc/-/typedoc-0.25.7.tgz", + "integrity": "sha512-m6A6JjQRg39p2ZVRIN3NKXgrN8vzlHhOS+r9ymUYtcUP/TIQPvWSq7YgE5ZjASfv5Vd5BW5xrir6Gm2XNNcOow==", "dev": true, "dependencies": { "lunr": "^2.3.9", - "marked": "^4.2.12", - "minimatch": "^7.1.3", - "shiki": "^0.14.1" + "marked": "^4.3.0", + "minimatch": "^9.0.3", + "shiki": "^0.14.7" }, "bin": { "typedoc": "bin/typedoc" }, "engines": { - "node": ">= 14.14" + "node": ">= 16" }, "peerDependencies": { - "typescript": "4.6.x || 4.7.x || 4.8.x || 4.9.x" + "typescript": "4.6.x || 4.7.x || 4.8.x || 4.9.x || 5.0.x || 5.1.x || 5.2.x || 5.3.x" } }, "node_modules/typescript": { - "version": "4.9.5", - "resolved": "https://registry.npmjs.org/typescript/-/typescript-4.9.5.tgz", - "integrity": "sha512-1FXk9E2Hm+QzZQ7z+McJiHL4NW1F2EzMu9Nq9i3zAaGqibafqYwCVU6WyWAuyQRRzOlxou8xZSyXLEN8oKj24g==", + "version": "5.2.2", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.2.2.tgz", + "integrity": "sha512-mI4WrpHsbCIcwT9cF4FZvr80QUeKvsUsUvKDoR+X/7XHQH98xYD8YHZg7ANtz2GtZt/CBq2QJ0thkGJMHfqc1w==", "dev": true, "peer": true, "bin": { @@ -116,7 +116,7 @@ "tsserver": "bin/tsserver" }, "engines": { - "node": ">=4.2.0" + "node": ">=14.17" } }, "node_modules/vscode-oniguruma": { @@ -134,9 +134,9 @@ }, "dependencies": { "ansi-sequence-parser": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/ansi-sequence-parser/-/ansi-sequence-parser-1.1.0.tgz", - "integrity": "sha512-lEm8mt52to2fT8GhciPCGeCXACSz2UwIN4X2e2LJSnZ5uAbn2/dsYdOmUXq0AtWS5cpAupysIneExOgH0Vd2TQ==", + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/ansi-sequence-parser/-/ansi-sequence-parser-1.1.1.tgz", + "integrity": "sha512-vJXt3yiaUL4UU546s3rPXlsry/RnM730G1+HkpKE012AN0sx1eOrxSu95oKDIonskeLTijMgqWZ3uDEe3NFvyg==", "dev": true }, "balanced-match": { @@ -155,9 +155,9 @@ } }, "jsonc-parser": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/jsonc-parser/-/jsonc-parser-3.2.0.tgz", - "integrity": "sha512-gfFQZrcTc8CnKXp6Y4/CBT3fTc0OVuDofpre4aEeEpSBPV5X5v4+Vmx+8snU7RLPrNHPKSgLxGo9YuQzz20o+w==", + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/jsonc-parser/-/jsonc-parser-3.2.1.tgz", + "integrity": "sha512-AilxAyFOAcK5wA1+LeaySVBrHsGQvUFCDWXKpZjzaL0PqW+xfBOttn8GNtWKFWqneyMZj41MWF9Kl6iPWLwgOA==", "dev": true }, "lunr": { @@ -167,24 +167,24 @@ "dev": true }, "marked": { - "version": "4.2.12", - "resolved": "https://registry.npmjs.org/marked/-/marked-4.2.12.tgz", - "integrity": "sha512-yr8hSKa3Fv4D3jdZmtMMPghgVt6TWbk86WQaWhDloQjRSQhMMYCAro7jP7VDJrjjdV8pxVxMssXS8B8Y5DZ5aw==", + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/marked/-/marked-4.3.0.tgz", + "integrity": "sha512-PRsaiG84bK+AMvxziE/lCFss8juXjNaWzVbN5tXAm4XjeaS9NAHhop+PjQxz2A9h8Q4M/xGmzP8vqNwy6JeK0A==", "dev": true }, "minimatch": { - "version": "7.4.2", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-7.4.2.tgz", - "integrity": "sha512-xy4q7wou3vUoC9k1xGTXc+awNdGaGVHtFUaey8tiX4H1QRc04DZ/rmDFwNm2EBsuYEhAZ6SgMmYf3InGY6OauA==", + "version": "9.0.3", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.3.tgz", + "integrity": "sha512-RHiac9mvaRw0x3AYRgDC1CxAP7HTcNrrECeA8YYJeWnpo+2Q5CegtZjaotWTWxDG3UeGA1coE05iH1mPjT/2mg==", "dev": true, "requires": { "brace-expansion": "^2.0.1" } }, "shiki": { - "version": "0.14.1", - "resolved": "https://registry.npmjs.org/shiki/-/shiki-0.14.1.tgz", - "integrity": "sha512-+Jz4nBkCBe0mEDqo1eKRcCdjRtrCjozmcbTUjbPTX7OOJfEbTZzlUWlZtGe3Gb5oV1/jnojhG//YZc3rs9zSEw==", + "version": "0.14.7", + "resolved": "https://registry.npmjs.org/shiki/-/shiki-0.14.7.tgz", + "integrity": "sha512-dNPAPrxSc87ua2sKJ3H5dQ/6ZaY8RNnaAqK+t0eG7p0Soi2ydiqbGOTaZCqaYvA/uZYfS1LJnemt3Q+mSfcPCg==", "dev": true, "requires": { "ansi-sequence-parser": "^1.1.0", @@ -194,21 +194,21 @@ } }, "typedoc": { - "version": "0.23.26", - "resolved": "https://registry.npmjs.org/typedoc/-/typedoc-0.23.26.tgz", - "integrity": "sha512-5m4KwR5tOLnk0OtMaRn9IdbeRM32uPemN9kur7YK9wFqx8U0CYrvO9aVq6ysdZSV1c824BTm+BuQl2Ze/k1HtA==", + "version": "0.25.7", + "resolved": "https://registry.npmjs.org/typedoc/-/typedoc-0.25.7.tgz", + "integrity": "sha512-m6A6JjQRg39p2ZVRIN3NKXgrN8vzlHhOS+r9ymUYtcUP/TIQPvWSq7YgE5ZjASfv5Vd5BW5xrir6Gm2XNNcOow==", "dev": true, "requires": { "lunr": "^2.3.9", - "marked": "^4.2.12", - "minimatch": "^7.1.3", - "shiki": "^0.14.1" + "marked": "^4.3.0", + "minimatch": "^9.0.3", + "shiki": "^0.14.7" } }, "typescript": { - "version": "4.9.5", - "resolved": "https://registry.npmjs.org/typescript/-/typescript-4.9.5.tgz", - "integrity": "sha512-1FXk9E2Hm+QzZQ7z+McJiHL4NW1F2EzMu9Nq9i3zAaGqibafqYwCVU6WyWAuyQRRzOlxou8xZSyXLEN8oKj24g==", + "version": "5.2.2", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.2.2.tgz", + "integrity": "sha512-mI4WrpHsbCIcwT9cF4FZvr80QUeKvsUsUvKDoR+X/7XHQH98xYD8YHZg7ANtz2GtZt/CBq2QJ0thkGJMHfqc1w==", "dev": true, "peer": true }, diff --git a/js/common/package.json b/js/common/package.json index 64ab2736adbe3..cd2612aab4984 100644 --- a/js/common/package.json +++ b/js/common/package.json @@ -9,7 +9,7 @@ }, "author": "fs-eire", "scripts": { - "build:cjs": "tsc --module commonjs --outDir ./dist/cjs", + "build:cjs": "tsc --module commonjs --moduleResolution node10 --outDir ./dist/cjs", "build:esm": "tsc", "build:bundles": "webpack", "build": "node ./build.js", @@ -18,7 +18,7 @@ "test": "mocha ./test/**/*.js --timeout 30000" }, "devDependencies": { - "typedoc": "^0.23.22" + "typedoc": "^0.25.7" }, "main": "dist/cjs/index.js", "exports": { diff --git a/js/common/test/tsconfig.json b/js/common/test/tsconfig.json index 2e4927ac3b325..e9068ad837a81 100644 --- a/js/common/test/tsconfig.json +++ b/js/common/test/tsconfig.json @@ -2,7 +2,7 @@ "extends": "../../tsconfig.tools.json", "exclude": ["type-tests/**/*.ts"], "compilerOptions": { - "module": "ES2022", + "module": "Node16", "sourceMap": true } } From 45e20bf7810689ecf385957c34434c6d2456e32b Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Wed, 21 Feb 2024 12:38:37 +1000 Subject: [PATCH 12/68] Use build.py to build in py-win-gpu.yml so parallelization parameters are set (#19578) ### Description build.py sets a few parallelization parameters when building. Using msbuild directly lacks those. https://github.com/microsoft/onnxruntime/blob/7a5860e4909387448cb51351d3af50933238ba10/tools/ci_build/build.py#L1665-L1669 Changed to use build.py. If there's a concern with that we _could_ set the parameters in the yaml, but that will be uglier due to duplicating logic in multiple places. ### Motivation and Context --- .../azure-pipelines/templates/py-win-gpu.yml | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml index 18368e59cad52..4315eae503ebd 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml @@ -120,17 +120,17 @@ jobs: $(TelemetryOption) ${{ parameters.BUILD_PY_PARAMETERS }} ${{ parameters.EP_BUILD_FLAGS }} workingDirectory: '$(Build.BinariesDirectory)' - - task: VSBuild@1 + # building with build.py so the parallelization parameters are added to the msbuild command + - task: PythonScript@0 displayName: 'Build' inputs: - solution: '$(Build.BinariesDirectory)\RelWithDebInfo\onnxruntime.sln' - platform: x64 - configuration: RelWithDebInfo - msbuildArchitecture: $(buildArch) - maximumCpuCount: true - logProjectEvents: true - workingFolder: '$(Build.BinariesDirectory)\RelWithDebInfo' - createLogFile: true + scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' + arguments: > + --config RelWithDebInfo + --build_dir $(Build.BinariesDirectory) + --parallel --build + $(TelemetryOption) ${{ parameters.BUILD_PY_PARAMETERS }} ${{ parameters.EP_BUILD_FLAGS }} + workingDirectory: '$(Build.BinariesDirectory)' # Esrp signing - template: win-esrp-dll.yml @@ -188,7 +188,7 @@ jobs: condition: and (succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/main')) inputs: GdnPublishTsaOnboard: false - GdnPublishTsaConfigFile: '$(Build.sourcesDirectory)\.gdn\.gdntsa' + GdnPublishTsaConfigFile: '$(Build.sourcesDirectory)\.gdn\.gdntsa' - template: component-governance-component-detection-steps.yml parameters: From 0c4421cb7867434e1e08b4274f16f6c2f14cb4ce Mon Sep 17 00:00:00 2001 From: Markus Tavenrath Date: Wed, 21 Feb 2024 03:39:43 +0100 Subject: [PATCH 13/68] Fix compile warnings (as errors) for functions which miss returning required return value (#19079) Added dummy return values to functions which specify a return value, but do not return an value value. ### Motivation and Context Fix compiler errors with 'warnings as errors' enabled. From 8fadc6c913bc30edff2e89756da515b9bd75d256 Mon Sep 17 00:00:00 2001 From: zhijiang <43435212+zhijxu-MS@users.noreply.github.com> Date: Wed, 21 Feb 2024 10:41:42 +0800 Subject: [PATCH 14/68] Zhijxu/cleanup cached tensors when oom (#19306) in pytorch, when oom happens at bp, user could decrease the batch size and rerun it without restarting the process. while in ORT, the intermediate tensors are kept even OOM, so decrease batch size still fail. this is torch run, we can see after oom failure, torch will release tensor before next step ![image](https://github.com/microsoft/onnxruntime/assets/43435212/92b8a2e3-454b-448a-a223-17cb91d463c2) this is from ort, we can see ort not release its tensors after OOM failure. ![image](https://github.com/microsoft/onnxruntime/assets/43435212/bb6a3882-8e14-4f37-8079-e7f70fc2546b) ort with the PR, we can see memory is released, **the 4GB memory is not own by ort, and will be released by torch at the end**. ![image](https://github.com/microsoft/onnxruntime/assets/43435212/7f39d711-4e36-47d5-aecf-3805433a6d01) --- onnxruntime/core/framework/execution_frame.cc | 21 +++++++++++++++ onnxruntime/core/framework/execution_frame.h | 2 ++ .../training/ortmodule/_training_manager.py | 26 ++++++++++--------- 3 files changed, 37 insertions(+), 12 deletions(-) diff --git a/onnxruntime/core/framework/execution_frame.cc b/onnxruntime/core/framework/execution_frame.cc index 8c08152986cf6..32a5f749af084 100644 --- a/onnxruntime/core/framework/execution_frame.cc +++ b/onnxruntime/core/framework/execution_frame.cc @@ -204,6 +204,14 @@ AllocatorPtr IExecutionFrame::GetAllocator(const OrtDevice& info) const { Status IExecutionFrame::ReleaseMLValue(int ort_value_idx) { return ReleaseMLValueImpl(ort_value_idx); } +#ifdef ENABLE_TRAINING +void IExecutionFrame::ReleaseAllMLValues() { + for (size_t ort_value_idx = 0; ort_value_idx < all_values_.size(); ort_value_idx++) { + all_values_[ort_value_idx] = OrtValue(); + } +} +#endif + Status IExecutionFrame::ReleaseMLValueImpl(int ort_value_idx) { if (ort_value_idx == NodeIndexInfo::kInvalidEntry || static_cast(ort_value_idx) >= all_values_size_) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "invalid index ", ort_value_idx); @@ -831,7 +839,20 @@ AllocatorPtr ExecutionFrame::GetAllocatorImpl(const OrtDevice& info) const { // This method is not thread safe! // Return S_OK and nullptr if index map to a value that is an unused optional input/output Status ExecutionFrame::CreateNodeOutputMLValueImpl(OrtValue& ort_value, int ort_value_idx, const TensorShape* shape) { +#ifdef ENABLE_TRAINING + try { + auto status = AllocateAsPerAllocationPlan(ort_value, ort_value_idx, shape); + return status; + } catch (const std::exception& e) { + LOGS(session_state_.Logger(), WARNING) + << "Exception caught when allocating memory for ort_value with index: " << ort_value_idx + << "so clean up all OrtValues"; + ReleaseAllMLValues(); + return Status(ONNXRUNTIME, FAIL, e.what()); + } +#else return AllocateAsPerAllocationPlan(ort_value, ort_value_idx, shape); +#endif } void ExecutionFrame::VerifyOutputSizes(int output_index, const Node& node, const TensorShape& output_shape) { diff --git a/onnxruntime/core/framework/execution_frame.h b/onnxruntime/core/framework/execution_frame.h index 1576c16684faa..18d210ffd48f7 100644 --- a/onnxruntime/core/framework/execution_frame.h +++ b/onnxruntime/core/framework/execution_frame.h @@ -67,6 +67,8 @@ class IExecutionFrame { const std::unordered_map& initializers); Status GetOutputs(gsl::span fetch_mlvalue_idxs, std::vector& fetches); + // if OOM happens, then release all values, so session can run next batch. + void ReleaseAllMLValues(); #endif // TO DO: make it thread safe diff --git a/orttraining/orttraining/python/training/ortmodule/_training_manager.py b/orttraining/orttraining/python/training/ortmodule/_training_manager.py index cc533e549db92..73c32a2f51e41 100644 --- a/orttraining/orttraining/python/training/ortmodule/_training_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_training_manager.py @@ -196,18 +196,20 @@ def backward(ctx, *grad_outputs): # Run and get results backward_outputs = C.OrtValueVector() - self._execution_agent.run_backward(backward_inputs, backward_outputs, ctx.run_info.state) - # Destroy the state immediately (as opposed to be at the mercy of garbage collector) so it does not - # affect peak memory usage in a subsequent graph run. - del ctx.run_info.state - - # Fast version: all backward_outputs are converted first. - # This version only works if backward_outputs is an OrtValueVector. - transferred_backward_outputs = _utils._ortvalues_to_torch_tensor(backward_outputs, self._device) - - self._runtime_inspector.memory_ob.inspect_memory(Phase.POST_BACKWARD) - - return tuple(transferred_backward_outputs[idx] if idx != -1 else None for idx in self._gradient_map) + try: + self._execution_agent.run_backward(backward_inputs, backward_outputs, ctx.run_info.state) + # Destroy the state immediately (as opposed to be at the mercy of garbage collector) so it does not + # affect peak memory usage in a subsequent graph run. + + # Fast version: all backward_outputs are converted first. + # This version only works if backward_outputs is an OrtValueVector. + transferred_backward_outputs = _utils._ortvalues_to_torch_tensor(backward_outputs, self._device) + + self._runtime_inspector.memory_ob.inspect_memory(Phase.POST_BACKWARD) + res = tuple(transferred_backward_outputs[idx] if idx != -1 else None for idx in self._gradient_map) + return res + finally: + del ctx.run_info.state return _ORTModuleFunction From 6226c5f62f3d16b9702d5c40993ee9bf1cbd119c Mon Sep 17 00:00:00 2001 From: PeixuanZuo <94887879+PeixuanZuo@users.noreply.github.com> Date: Wed, 21 Feb 2024 11:08:48 +0800 Subject: [PATCH 15/68] [ROCm] Add SkipGroupNorm for ROCm EP (#19303) Add SkipGroupNorm for ROCm EP. --------- Co-authored-by: Peixuan Zuo --- cmake/onnxruntime_rocm_hipify.cmake | 5 - .../contrib_ops/rocm/diffusion/group_norm.cc | 152 ------------- .../rocm/diffusion/group_norm_ck.cuh | 35 +-- .../diffusion/group_norm_ck_impl/impl.cuh | 10 +- .../diffusion/group_norm_ck_impl/impl_fp16.cu | 8 +- .../diffusion/group_norm_ck_impl/impl_fp32.cu | 8 +- .../rocm/diffusion/group_norm_common.h | 125 +++------- .../rocm/diffusion/group_norm_impl.cu | 47 ++-- .../rocm/diffusion/group_norm_impl.h | 47 ---- .../rocm/diffusion/group_norm_impl_kernel.cuh | 213 ------------------ .../rocm/diffusion/group_norm_triton.cuh | 29 +-- .../rocm/diffusion/group_norm_triton.py | 16 +- .../rocm/diffusion/group_norm_tunable_op.h | 153 +++++++------ .../contrib_ops/rocm/rocm_contrib_kernels.cc | 2 + .../kernel_explorer/kernels/groupnorm_test.py | 136 ++++++++--- .../kernels/rocm/group_norm.cu | 112 +++++---- .../contrib_ops/skip_group_norm_op_test.cc | 14 +- tools/ci_build/amd_hipify.py | 2 + 18 files changed, 382 insertions(+), 732 deletions(-) delete mode 100644 onnxruntime/contrib_ops/rocm/diffusion/group_norm.cc delete mode 100644 onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl.h delete mode 100644 onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl_kernel.cuh diff --git a/cmake/onnxruntime_rocm_hipify.cmake b/cmake/onnxruntime_rocm_hipify.cmake index d485abe6bb1a6..85a9bf50460d3 100644 --- a/cmake/onnxruntime_rocm_hipify.cmake +++ b/cmake/onnxruntime_rocm_hipify.cmake @@ -44,12 +44,7 @@ set(contrib_ops_excluded_files "bert/packed_multihead_attention.cc" "bert/packed_multihead_attention_impl.h" "bert/packed_multihead_attention_impl.cu" - "diffusion/group_norm.cc" "diffusion/group_norm_impl.cu" - "diffusion/group_norm_impl.h" - "diffusion/group_norm_impl_kernel.cuh" - "diffusion/group_norm_common_base.h" - "diffusion/group_norm_common_base.cc" "diffusion/nhwc_conv.cc" "math/gemm_float8.cc" "math/gemm_float8.cu" diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm.cc b/onnxruntime/contrib_ops/rocm/diffusion/group_norm.cc deleted file mode 100644 index e82e15a304f4c..0000000000000 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm.cc +++ /dev/null @@ -1,152 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/providers/rocm/rocm_common.h" -#include "contrib_ops/rocm/diffusion/group_norm.h" -#include "contrib_ops/rocm/diffusion/group_norm_impl.h" - -namespace onnxruntime { -namespace contrib { -namespace rocm { - -#define GROUP_NORM_TYPES float, MLFloat16 - -ONNX_OPERATOR_KERNEL_EX( - GroupNorm, kMSDomain, 1, kRocmExecutionProvider, - (*KernelDefBuilder::Create()).TypeConstraint("T", BuildKernelDefConstraints()), GroupNorm); - -using namespace ONNX_NAMESPACE; - -namespace { -template -struct DispatchGroupNorm { - Status operator()(RocmTuningContext* tuning_ctx, - Stream* stream, - Tensor* output, - const Tensor* input, - const Tensor* gamma, - const Tensor* beta, - void* workspace, - float epsilon, - int batch_size, - int num_channels, - int height, - int width, - int num_groups, - bool use_swish_activation) { - typedef typename ToHipType::MappedType HipT; - return LaunchGroupNormKernel( - tuning_ctx, - stream, - reinterpret_cast(output->MutableData()), - reinterpret_cast(input->Data()), - gamma->Data(), - beta->Data(), - workspace, - epsilon, - batch_size, - num_channels, - height, - width, - num_groups, - use_swish_activation); - } -}; - -} // namespace - -GroupNorm::GroupNorm(const OpKernelInfo& op_info) : RocmKernel(op_info) { - epsilon_ = op_info.GetAttrOrDefault("epsilon", 1e-5f); - ORT_ENFORCE(epsilon_ >= 0); - - int64_t num_groups; - ORT_ENFORCE(op_info.GetAttr("groups", &num_groups).IsOK()); - ORT_ENFORCE(num_groups >= 0); - num_groups_ = static_cast(num_groups); - - int64_t activation; - ORT_ENFORCE(op_info.GetAttr("activation", &activation).IsOK()); - ORT_ENFORCE(activation == 0 || activation == 1); // 0 is None, 1 is Swish - use_swish_activation_ = (activation == 1); - - channels_last_ = (op_info.GetAttrOrDefault("channels_last", static_cast(1)) != 0); -} - -Status GroupNorm::PrePack(const Tensor& /*tensor*/, int /*input_idx*/, AllocatorPtr /*alloc*/, - bool& is_packed, PrePackedWeights* /*prepacked_weights*/) { - is_packed = false; - return Status::OK(); -} - -Status GroupNorm::ComputeInternal(OpKernelContext* context) const { - const Tensor* input = context->Input(0); - const Tensor* gamma = context->Input(1); - const Tensor* beta = context->Input(2); - Tensor* output = context->Output(0, input->Shape()); - - if (!channels_last_) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "only the channels_last layout is supported"); - } - - const auto& input_dims = input->Shape().GetDims(); - if (input_dims.size() != 4) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "input is expected to have 4 dimensions, got ", input_dims.size()); - } - - const auto& gamma_dims = gamma->Shape().GetDims(); - if (gamma_dims.size() != 1) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "gamma is expected to have 1 dimension, got ", gamma_dims.size()); - } - if (gamma_dims[0] != input_dims[3]) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Number of channels in gamma and input does not match"); - } - - const auto& beta_dims = beta->Shape().GetDims(); - if (beta_dims.size() != 1) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "beta is expected to have 1 dimension, got ", beta_dims.size()); - } - if (beta_dims[0] != input_dims[3]) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Number of channels in beta and input does not match"); - } - - // Input and output format is NHWC - int batch_size = static_cast(input_dims[0]); - int num_channels = static_cast(input_dims[3]); - int height = static_cast(input_dims[1]); - int width = static_cast(input_dims[2]); - - if (num_channels % num_groups_ != 0) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "number of channels should be divisible by num_groups"); - } - - if (context->GetUseDeterministicCompute()) { - static std::once_flag log_warning; - std::call_once(log_warning, []() { - LOGS_DEFAULT(WARNING) << "GroupNorm has no deterministic GPU kernel, its outputs may still be nondeterministic."; - }); - } - - auto workspace = GetScratchBuffer(GetGroupNormWorkspaceSizeInBytes(), context->GetComputeStream()); - - utils::MLTypeCallDispatcher dispatcher(input->GetElementType()); - return dispatcher.InvokeRet(GetTuningContext(), context->GetComputeStream(), - output, input, gamma, beta, workspace.get(), - epsilon_, - batch_size, - num_channels, - height, - width, - num_groups_, - use_swish_activation_); -} - -} // namespace rocm -} // namespace contrib -} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck.cuh b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck.cuh index fb7091592c16e..d0a0d09fcbae3 100644 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck.cuh +++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck.cuh @@ -26,13 +26,18 @@ namespace rocm { using onnxruntime::rocm::CKDataTypeAdaptor; -using Swish = ck::tensor_operation::element_wise::Swish; +// The SiLU function is a special case of Swish function, +// The Swish function is parametrized by b, which is set to 1.0 for SiLU. They are defined as: +// SiLU(x) = x * sigmoid(x) +// Swish(x) = x * sigmoid(bx) +// The default value of b is 1.0 in ck::tensor_operation::element_wise::Swish function. We treat them as the same function here. +using Silu = ck::tensor_operation::element_wise::Swish; using Pass = ck::tensor_operation::element_wise::PassThrough; constexpr int Rank = 5; constexpr int NumReduceDim = 3; -template +template auto GetCKGroupNormNHWCTypeStringAndOps() { using XDataType = typename CKDataTypeAdaptor::type; using YDataType = typename CKDataTypeAdaptor::type; @@ -40,26 +45,30 @@ auto GetCKGroupNormNHWCTypeStringAndOps() { using GammaDataType = float; using BetaDataType = float; - using Activation = std::conditional_t; + using Activation = std::conditional_t; - std::vector>>> ret; + std::vector>>> ret; for (auto&& impl : internal::GetDeviceGroupNormInstances()) { - std::string swish_suffix = WithSwish ? "_Swish" : "_Pass"; - auto type_string = onnxruntime::MakeString(impl->GetTypeString()) + swish_suffix; + std::string silu_suffix = WithSilu ? "_Silu" : "_Pass"; + auto type_string = onnxruntime::MakeString(impl->GetTypeString()) + silu_suffix; auto invoker = impl->MakeInvokerPointer(); - auto ck_group_norm_op = [impl = std::move(impl), invoker = std::move(invoker)](const GroupNormNHWCParams* params) -> Status { - if constexpr (WithSwish) { + auto ck_group_norm_op = [impl = std::move(impl), invoker = std::move(invoker)]( + const GroupNormNHWCTunableParams* params) -> Status { + TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF((params->skip != nullptr || params->bias != nullptr), + "Input skip or bias is not supported by composable kernel."); + if constexpr (WithSilu) { TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( - !params->withSwish, "Swish version only support groupnorm with swish"); + !params->use_silu, "Silu version only support groupnorm with silu"); } else { TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( - params->withSwish, "Pass version only support groupnorm without swish"); + params->use_silu, "Pass version only support groupnorm without silu"); } - std::vector in_lengths{params->n, params->h, params->w, params->groups, params->cPerGroup}; - std::vector in_out_strides{params->h * params->w * params->c, params->w * params->c, params->c, params->cPerGroup, 1}; - std::vector gamma_beta_strides{0, 0, 0, params->cPerGroup, 1}; + std::vector in_lengths{params->n, params->h, params->w, params->groups, params->channels_per_group}; + std::vector in_out_strides{params->h * params->w * params->c, params->w * params->c, + params->c, params->channels_per_group, 1}; + std::vector gamma_beta_strides{0, 0, 0, params->channels_per_group, 1}; std::vector reduce_dims{1, 2, 4}; auto activation = Activation{}; diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck_impl/impl.cuh b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck_impl/impl.cuh index 19b081881dcec..4cb371fdcf960 100644 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck_impl/impl.cuh +++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck_impl/impl.cuh @@ -18,7 +18,7 @@ namespace internal { using F16 = ck::half_t; using F32 = float; -using Swish = ck::tensor_operation::element_wise::Swish; +using Silu = ck::tensor_operation::element_wise::Swish; using Pass = ck::tensor_operation::element_wise::PassThrough; using ck::tensor_operation::device::DeviceNormalizationFwd; // the interface @@ -101,9 +101,9 @@ GetDeviceGroupNormInstances() { template <> std::vector>> + F16, F32, F32, F16, F32, Silu, 5, 3>>> GetDeviceGroupNormInstances< - F16, F32, F32, F16, F32, Swish, 5, 3>(); + F16, F32, F32, F16, F32, Silu, 5, 3>(); template <> std::vector std::vector>> + F32, F32, F32, F32, F32, Silu, 5, 3>>> GetDeviceGroupNormInstances< - F32, F32, F32, F32, F32, Swish, 5, 3>(); + F32, F32, F32, F32, F32, Silu, 5, 3>(); template <> std::vector -std::vector>> -GetDeviceGroupNormInstances() { - std::vector>> instances; +std::vector>> +GetDeviceGroupNormInstances() { + std::vector>> instances; ck::tensor_operation::device::instance::add_device_operation_instances( instances, - device_normalization_f16_instances{}); + device_normalization_f16_instances{}); return instances; } diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck_impl/impl_fp32.cu b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck_impl/impl_fp32.cu index 9b0ccab17b4c1..ceb53ed442abc 100644 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck_impl/impl_fp32.cu +++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck_impl/impl_fp32.cu @@ -11,12 +11,12 @@ namespace rocm { namespace internal { template <> -std::vector>> -GetDeviceGroupNormInstances() { - std::vector>> instances; +std::vector>> +GetDeviceGroupNormInstances() { + std::vector>> instances; ck::tensor_operation::device::instance::add_device_operation_instances( instances, - device_normalization_f32_instances{}); + device_normalization_f32_instances{}); return instances; } diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_common.h b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_common.h index 008ae20b0561f..7cff640db2f34 100644 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_common.h +++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_common.h @@ -8,110 +8,47 @@ #include "core/providers/rocm/cu_inc/common.cuh" #include "core/providers/rocm/rocm_common.h" #include "core/providers/rocm/tunable/rocm_tunable.h" +#include "contrib_ops/rocm/diffusion/group_norm_common_base.h" namespace onnxruntime { namespace contrib { namespace rocm { -using onnxruntime::rocm::CeilDiv; - -int32_t findMaxDivisor(int32_t n, int32_t maxAllowedDivisor) { - int32_t maxDivisor = -1; - for (int32_t i = 1; i <= std::sqrt(n); i++) { - if (n % i == 0) { - int32_t divisor1 = n / i; - int32_t divisor2 = i; - - if (divisor1 > maxDivisor && divisor1 < maxAllowedDivisor) { - maxDivisor = divisor1; - } - if (divisor2 > maxDivisor && divisor2 < maxAllowedDivisor) { - maxDivisor = divisor2; - } - } - } - return maxDivisor; -} - template -struct GroupNormNHWCParams : OpParams { - GroupNormNHWCParams(RocmTuningContext* tuning_ctx, onnxruntime::Stream* stream, T* dst, float* redBuffer, const T* src, const float* gamma, - const float* beta, int32_t n, int32_t h, int32_t w, int32_t c, int32_t groups, float epsilon, bool withSwish) - : OpParams(tuning_ctx, stream), dst(dst), src(src), gamma(gamma), beta(beta), redBuffer(redBuffer), epsilon(epsilon), n(n), h(h), w(w), c(c), groups(groups), withSwish(withSwish) { - int32_t maxBlocksPerHW = 1024; - switch (c) { - case 960: - case 1920: - cPerBlock = 480; - break; - case 512: - case 256: - cPerBlock = 256; - break; - case 128: - cPerBlock = 128; - break; - default: - cPerBlock = 320; - } - - hw = h * w; - const int32_t blocksPerHW = findMaxDivisor(hw, maxBlocksPerHW); - hwPerBlock = CeilDiv(hw, blocksPerHW); - cPerGroup = c / groups; - hwc = hw * c; - invHWC = 1.F / (float)(hw * cPerGroup); - groupsPerBlock = cPerBlock / cPerGroup; - } +struct GroupNormNHWCTunableParams : OpParams, GroupNormNHWCParams { + GroupNormNHWCTunableParams(RocmTuningContext* tuning_ctx, + onnxruntime::Stream* ort_stream, + T* output, + T* add_out, + const T* input, + const T* skip, + const T* bias, + const float* gamma, + const float* beta, + float* workspace, + float epsilon, + int batch_size, + int num_channels, + int height, + int width, + int num_groups, + bool use_silu, + bool broadcast_skip, + int channels_per_block) + : OpParams(tuning_ctx, ort_stream), + GroupNormNHWCParams(output, add_out, input, skip, bias, gamma, beta, workspace, epsilon, batch_size, + num_channels, height, width, num_groups, use_silu, broadcast_skip, channels_per_block) {} std::string Signature() const override { - std::string swish_suffix = withSwish ? "_Swish" : "_Pass"; - std::string sig = std::to_string(n) + "_" + std::to_string(h * w) + "_" + std::to_string(c) + "_" + std::to_string(groups) + swish_suffix; + std::string silu_suffix = this->use_silu ? "_silu" : "_pass"; + std::string skip_suffix = this->skip != nullptr ? "_skip" : "_noskip"; + std::string broadcast_suffix = this->broadcast_skip ? "_broadcast" : "_nobroadcast"; + std::string bias_suffix = this->bias != nullptr ? "_bias" : "_nobias"; + std::string sig = std::to_string(this->n) + "_" + std::to_string(this->h * this->w) + "_" + + std::to_string(this->c) + "_" + std::to_string(this->groups) + silu_suffix + + skip_suffix + broadcast_suffix + bias_suffix; return sig; } - - // The output buffer. Layout NHWC. - T* dst; - // The input buffer. Layout NHWC. - T const* src; - // The gamma scaling factor. - float const* gamma; - // The beta term to add in GN. - float const* beta; - // The temporary buffer to do the global parallel reduction. Size: - // BLOCKS_PER_BATCH x C x 2. - float* redBuffer; - float epsilon; - - // The number of instances in the batch. - int32_t n; - // The height and width of each activation map. - int32_t h; - int32_t w; - // The number of channels. - int32_t c; - // The number of groups. - int32_t groups; - // Do we apply the Swish activation function? - bool withSwish; - - // Precomputed values and parameters to control the execution of the kernels. - - // The number of activations per instance (h * w) and the number of - // activations per block. - int32_t hw; - int32_t hwPerBlock; - // The number of channels per group and blocks per activation in the C - // dimension. - int32_t cPerBlock; - int32_t cPerGroup; - - // The precomputed stride between instances. - int32_t hwc; - // The inverse of hwc in floats (to compute mean/var). - float invHWC; - // The precomputed number of groups per block. - int32_t groupsPerBlock; }; } // namespace rocm diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl.cu b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl.cu index dbd5009e63676..142aaf14e8d2d 100644 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl.cu +++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl.cu @@ -15,9 +15,12 @@ namespace rocm { template Status LaunchGroupNormKernel( RocmTuningContext* tuning_ctx, - Stream* stream, + Stream* ort_stream, T* output, + T* add_out, const T* input, + const T* skip, + const T* bias, const float* gamma, const float* beta, void* workspace, @@ -27,19 +30,26 @@ Status LaunchGroupNormKernel( int height, int width, int num_groups, - bool use_swish_activation) { - if (batch_size > static_cast(kMaxGroupNormBatchSize)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, StatusCode::NOT_IMPLEMENTED, - "only support batch_size <= 32. Got", batch_size); - } + bool use_silu, + bool broadcast_skip, + int channels_per_block) { + GroupNormNHWCTunableParams params(tuning_ctx, ort_stream, output, add_out, input, skip, bias, gamma, beta, + reinterpret_cast(workspace), epsilon, batch_size, num_channels, + height, width, num_groups, use_silu, broadcast_skip, channels_per_block); - if (num_groups != static_cast(kGroupNormNumberOfGroups)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, StatusCode::NOT_IMPLEMENTED, - "only num_groups=32 is supported. Got", num_groups); + if (params.channels_per_block % params.channels_per_group != 0 || + params.channels_per_block > kMaxSize || + (params.channels_per_group % CHANNELS_PER_THREAD != 0)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, + "GroupNorm in ROCM does not support the input: n=", batch_size, + " h=", height, + " w=", width, + " c=", num_channels, + " groups=", num_groups); } - GroupNormNHWCParams params(tuning_ctx, stream, output, reinterpret_cast(workspace), input, gamma, beta, - batch_size, height, width, num_channels, num_groups, epsilon, use_swish_activation); + HIP_RETURN_IF_ERROR(hipMemsetAsync( + params.group_sum_buffer, 0, GetGroupNormWorkspaceSizeInBytes(batch_size, num_groups), params.StreamHandle())); if (tuning_ctx->IsTunableOpEnabled()) { static GroupNormNHWCTunableOp op; @@ -50,14 +60,17 @@ Status LaunchGroupNormKernel( } template Status LaunchGroupNormKernel(RocmTuningContext* tuning_ctx, Stream* stream, half* output, - const half* input, const float* gamma, const float* beta, void* workspace, - float epsilon, int batch_size, int num_channels, - int height, int width, int num_groups, bool swish); + half* add_out, const half* input, const half* skip, const half* bias, + const float* gamma, const float* beta, void* workspace, float epsilon, + int batch_size, int num_channels, int height, int width, int num_groups, + bool use_silu, bool broadcast_skip, int channels_per_block); template Status LaunchGroupNormKernel(RocmTuningContext* tuning_ctx, Stream* stream, float* output, - const float* input, const float* gamma, const float* beta, void* workspace, - float epsilon, int batch_size, int num_channels, - int height, int width, int num_groups, bool swish); + float* add_out, const float* input, const float* skip, const float* bias, + const float* gamma, const float* beta, void* workspace, float epsilon, + int batch_size, int num_channels, int height, int width, int num_groups, + bool use_silu, bool broadcast_skip, int channels_per_block); + } // namespace rocm } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl.h b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl.h deleted file mode 100644 index a0f7e0aca5def..0000000000000 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl.h +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once - -#include -#include - -#include "core/common/common.h" -#include "core/common/status.h" -#include "core/providers/rocm/tunable/rocm_tunable.h" - -using onnxruntime::rocm::tunable::RocmTuningContext; - -namespace onnxruntime { -namespace contrib { -namespace rocm { - -constexpr size_t kMaxGroupNormBatchSize = 32; -constexpr size_t kGroupNormNumberOfGroups = 32; - -constexpr size_t GetGroupNormWorkspaceSizeInBytes() { - // Two buffers for sum and squared sum - return (sizeof(float) * 2) * kMaxGroupNormBatchSize * kGroupNormNumberOfGroups; -} - -template -Status LaunchGroupNormKernel( - RocmTuningContext* tuning_ctx, - Stream* stream, - T* output, // normalized output tensor - const T* input, // input tensor - const float* gamma, // gamma (also known as weight or scale) - const float* beta, // beta (also known as bias) - void* workspace, // Work space - float epsilon, // epsilon used normalization - int batch_size, // N - int num_channels, // C - int height, // H - int width, // W - int num_groups, // number of groups - bool use_swish_activation // Whether there is Swish activation after group normalization -); - -} // namespace rocm -} // namespace contrib -} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl_kernel.cuh b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl_kernel.cuh deleted file mode 100644 index d6322a12a9363..0000000000000 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_impl_kernel.cuh +++ /dev/null @@ -1,213 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -// The ROCm kernel is modified from TensorRT 8.5. -/* - * SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include "core/providers/rocm/cu_inc/common.cuh" -#include "core/providers/rocm/rocm_common.h" - -namespace onnxruntime { -namespace contrib { -namespace rocm { - -static inline __device__ __host__ float sigmoid(float x) { - return 1.F / (1.F + expf(-x)); -} - -struct GroupSums { - // Is it the 1st element of the group? - int32_t flag; - // The sum. - float sum; - // The sum of squares. - float sumSq; -}; - -struct GroupSumsOp { - inline __device__ GroupSums operator()(GroupSums const& a, GroupSums const& b) { - GroupSums dst; - dst.sum = b.flag ? b.sum : (a.sum + b.sum); - dst.sumSq = b.flag ? b.sumSq : (a.sumSq + b.sumSq); - dst.flag = a.flag + b.flag; - return dst; - } -}; - -template -inline __device__ void UpdateSum(const T* src, int64_t offset, U& sum, U& sumSq) { - using VecT = onnxruntime::rocm::aligned_vector; - const VecT input_v = *reinterpret_cast(src + offset); - -#pragma unroll - for (int i = 0; i < ILP; i++) { - const U val = static_cast(input_v.val[i]); - sum += val; - sumSq += val * val; - } -} - -template -__global__ void groupNormNHWCSumKernel(const T* src, float* redBuffer, int32_t cPerBlock, int32_t hwPerBlock, int32_t hw, - int32_t hwc, int32_t c, int32_t cPerGroup, int32_t groups, int32_t groupsPerBlock) { - // The object in charge of doing the sums for the different blocks. - typedef hipcub::BlockScan BlockScan; - - // Allocate shared memory for BlockScan. - __shared__ typename BlockScan::TempStorage tempStorage; - // Allocate shared memory for the groups. We could reduce the amount of shared - // memory reserved. - __shared__ float2 smem[ThreadsPerBlock]; - - // The instance in the batch. - int32_t ni = blockIdx.z; - // The channel loaded by that thread (ILP channels per thread). - int32_t ci = blockIdx.x * cPerBlock + threadIdx.x * ILP; - - // The first activation loaded by that block. - int32_t hwBegin = blockIdx.y * hwPerBlock; - // The last activation loaded by that block. - int32_t hwEnd = min(hwBegin + hwPerBlock, hw); - - // The sums. - float sum = 0.F; - float sumSq = 0.F; - - // Iterate over the activations to compute the sums. - if (ci < c) { - for (int32_t hwi = hwBegin; hwi < hwEnd; ++hwi) { - // The offset. - int64_t offset = static_cast(ni) * hwc + static_cast(hwi) * c + ci; - UpdateSum(src, offset, sum, sumSq); - } - } - - // The group that thread works on and the channel in the group (modulus). - int32_t gi = threadIdx.x * ILP / cPerGroup; - int32_t cj = threadIdx.x * ILP - cPerGroup * gi; - - // The data for the summations. - GroupSums inp{cj == 0 ? 1 : 0, sum, sumSq}; - - // Do the segmented scan. - GroupSums out; - BlockScan(tempStorage).InclusiveScan(inp, out, GroupSumsOp()); - - // Store the results for the groups in shared memory (to produce coalesced - // stores later). - if (cj == cPerGroup - ILP) { // ILP channels per thread - smem[gi] = make_float2(out.sum, out.sumSq); - } - - // Make sure the data is in shared memory. - __syncthreads(); - - // The global group index. - int32_t gj = blockIdx.x * groupsPerBlock + threadIdx.x; - - // Threads that have nothing left to do, exit. - if (threadIdx.x >= groupsPerBlock || gj >= groups) { - return; - } - - // The first threads (those storing to global memory, load the values). - float2 sums = smem[threadIdx.x]; - - // Store to global memory. - atomicAdd(&redBuffer[(2 * ni + 0) * groups + gj], sums.x); - atomicAdd(&redBuffer[(2 * ni + 1) * groups + gj], sums.y); -} - -template -__device__ void computeGroupNorm(const T* src, T* dst, int64_t offset, U mean, U invStdDev, - const U* gamma_v, const U* beta_v, bool swish) { - using VecT = onnxruntime::rocm::aligned_vector; - const VecT input_v = *reinterpret_cast(src + offset); - VecT output_v; - -#pragma unroll - for (int i = 0; i < ILP; i++) { - U val = static_cast(input_v.val[i]); - val = (val - mean) * invStdDev; - val = gamma_v[i] * val + beta_v[i]; - - if (swish) { - val = val * sigmoid(val); - } - output_v.val[i] = static_cast(val); - } - *(reinterpret_cast(dst + offset)) = output_v; -} - -template -__global__ void groupNormNHWCScaleKernel(T* dst, const T* src, const float* gamma, const float* beta, const float* redBuffer, float epsilon, int32_t c, int32_t cPerBlock, - int32_t cPerGroup, int32_t groups, int32_t hwc, float invHWC, int32_t hw, int32_t hwPerBlock, bool withSwish) { - // The channel loaded by that thread (ILP channels per thread for F16x2). - int32_t ci = blockIdx.x * cPerBlock + threadIdx.x * ILP; - if (ci >= c) { - return; - } - - // The instance in the batch. - int32_t ni = blockIdx.z; - - // The group that thread works on and the channel in the group (modulus). - int32_t gi = ci / cPerGroup; - - // Load the sum and sum of squares for the group. - float sum = 0.F, sumSq = 0.F; - if (gi < groups) { - sum = redBuffer[(2 * ni + 0) * groups + gi]; - sumSq = redBuffer[(2 * ni + 1) * groups + gi]; - } - - using VecF = onnxruntime::rocm::aligned_vector; - - const VecF gamma_v = *reinterpret_cast(gamma + ci); - const VecF beta_v = *reinterpret_cast(beta + ci); - - // Compute the mean. - float mean = sum * invHWC; - // Compute the variance. - float var = sumSq * invHWC - (mean * mean); - // Compute the inverse of the stddev. - float invStdDev = var <= 0.F ? 1.F : rsqrtf(var + epsilon); - - // The first activation loaded by that block. - int32_t hwBegin = blockIdx.y * hwPerBlock; - // The last activation loaded by that block. - int32_t hwEnd = min(hwBegin + hwPerBlock, hw); - - // Iterate over the activations to compute the sums. - for (int32_t hwi = hwBegin; hwi < hwEnd; ++hwi) { - // The src/dst offset. - int64_t offset = (int64_t)ni * hwc + hwi * c + ci; - - // Fetch ILP channels per thread. - computeGroupNorm(src, dst, offset, mean, invStdDev, gamma_v.val, beta_v.val, withSwish); - } -} - -} // namespace rocm -} // namespace contrib -} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.cuh b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.cuh index b7b9441ac997d..b3d3e92209b39 100644 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.cuh +++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.cuh @@ -20,21 +20,21 @@ namespace rocm { namespace { -template +template std::string GetGroupNormTritonGroupName() { std::string ret = "GroupNormTriton_"; - std::string swish_suffix = WithSwish ? "Swish_" : "Pass_"; - ret += swish_suffix; + std::string silu_suffix = WithSilu ? "Silu_" : "Pass_"; + ret += silu_suffix; ret += GetDataTypeName(); return ret; } } // namespace -template +template auto GetTritonGroupNormNHWCTypeStringAndOps() { - std::vector>>> ret; - auto group_name = GetGroupNormTritonGroupName(); + std::vector>>> ret; + auto group_name = GetGroupNormTritonGroupName(); auto* kernel_list = GetOrtTritonKernelByGroup(group_name); if (kernel_list == nullptr) { return ret; @@ -45,16 +45,19 @@ auto GetTritonGroupNormNHWCTypeStringAndOps() { auto* metadata = GetOrtTritonKernelMetadata(i); auto block_size = metadata->constants.at("BLOCK_SIZE"); auto hw_size = metadata->constants.at("HW_SIZE"); - auto impl = [i, block_size, hw_size](const GroupNormNHWCParams* params) -> Status { + auto impl = [i, block_size, hw_size](const GroupNormNHWCTunableParams* params) -> Status { + TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF((params->skip != nullptr || params->bias != nullptr), + "Input skip or bias is not supported by triton kernel."); TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( - params->cPerGroup > block_size || params->cPerGroup * 2 <= block_size, - "Arg block_size (", block_size, ") is not the next power of 2 of cPerGroup (", params->cPerGroup, ")."); + params->channels_per_group > block_size || params->channels_per_group * 2 <= block_size, + "Arg block_size (", block_size, ") is not the next power of 2 of channels_per_group (", + params->channels_per_group, ")."); TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( params->hw % hw_size != 0, "Arg hw_size (", hw_size, ") is not a divisor of hw (", params->hw, ")."); - if constexpr (WithSwish) { - TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(!params->withSwish, "Swish version does not support GN w/o swish."); + if constexpr (WithSilu) { + TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(!params->use_silu, "Silu version does not support GN w/o silu."); } else { - TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(params->withSwish, "Pass version does not support GN w/ swish."); + TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(params->use_silu, "Pass version does not support GN w/ silu."); } // Construct args for launch kernel struct { @@ -73,7 +76,7 @@ auto GetTritonGroupNormNHWCTypeStringAndOps() { (const void*)params->beta, params->hw, params->c, - params->cPerGroup, + params->channels_per_group, params->epsilon}; // Grid dim is (batch_count, groups, 1) diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.py b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.py index 56b3a030b289e..5368cb1cf635b 100644 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.py +++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.py @@ -21,7 +21,7 @@ def group_norm_kernel( eps, BLOCK_SIZE: tl.constexpr, HW_SIZE: tl.constexpr, - ACTIVATION_SWISH: tl.constexpr, + ACTIVATION_SILU: tl.constexpr, ): row_x = tl.program_id(0) row_y = tl.program_id(1) @@ -62,7 +62,7 @@ def group_norm_kernel( x = tl.load(x_ptr + offsets, mask=mask, other=0.0).to(tl.float32) x_hat = (x - group_mean) * rstd y = x_hat * gamma + beta - if ACTIVATION_SWISH: + if ACTIVATION_SILU: y *= tl.sigmoid(y) tl.store(y_ptr + offsets, y, mask=mask) @@ -71,7 +71,7 @@ def group_norm_kernel( # blocks = [16, 32, 64, 128, 256, 512] # hw_sizes = [8, 16, 32, 64, 128, 256, 512] # but this will result in too many functions and slow down the compilation. -with_swish = [True, False] +with_silu = [True, False] dtypes = ["fp32", "fp16"] blocks = [16, 32, 64, 128] hw_sizes = [8, 16, 32, 64, 128, 256] @@ -84,14 +84,14 @@ def group_norm_kernel( def get_function_table(): func_table = [] - for swish, dtype, hw_size, warp, b in product(with_swish, dtypes, hw_sizes, warps, blocks): - swish_suffix = "Swish" if swish else "Pass" - name = name_pattern.format(swish_suffix, dtype, b, hw_size, warp) - group = group_pattern.format(swish_suffix, dtype) + for silu, dtype, hw_size, warp, b in product(with_silu, dtypes, hw_sizes, warps, blocks): + silu_suffix = "Silu" if silu else "Pass" + name = name_pattern.format(silu_suffix, dtype, b, hw_size, warp) + group = group_pattern.format(silu_suffix, dtype) sig = sig_pattern.format(dtype, dtype) kwargs = { "num_warps": warp, - "constants": {"BLOCK_SIZE": b, "HW_SIZE": hw_size, "ACTIVATION_SWISH": int(swish)}, + "constants": {"BLOCK_SIZE": b, "HW_SIZE": hw_size, "ACTIVATION_SILU": int(silu)}, } func_desc = {"name": name, "group": group, "func": group_norm_kernel, "sig": sig, "kwargs": kwargs} func_table.append(func_desc) diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_tunable_op.h b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_tunable_op.h index 25d820f7ed326..e6831f764b418 100644 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_tunable_op.h +++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_tunable_op.h @@ -20,115 +20,117 @@ namespace rocm { using onnxruntime::rocm::GPU_WARP_SIZE; template -void groupNormNHWCSum(const GroupNormNHWCParams* params) { - // Make sure the values are as we expect. - ORT_ENFORCE(params->c % params->cPerBlock == 0 && params->hw % params->hwPerBlock == 0); - // Make sure a group does not span multiple blocks. - ORT_ENFORCE(params->cPerBlock % params->cPerGroup == 0); - +void GroupNormNHWCSum(const GroupNormNHWCTunableParams* params) { dim3 grid; // The number of blocks to compute all the channels. - grid.x = params->c / params->cPerBlock; + grid.x = DivUp(params->c, params->channels_per_block); // The number of blocks to compute all the activations in a given instance. - grid.y = CeilDiv(params->hw, params->hwPerBlock); + grid.y = DivUp(params->hw, params->hw_per_block); // The number of instances. grid.z = params->n; -#define LAUNCH_GROUPNORM_SUM(ThreadsPerBlock, VecSize) \ - groupNormNHWCSumKernel \ - <<StreamHandle()>>>( \ - params->src, params->redBuffer, params->cPerBlock, \ - params->hwPerBlock, params->hw, params->hwc, params->c, \ - params->cPerGroup, params->groups, params->groupsPerBlock); \ +#define LAUNCH_GROUPNORM_SUM(ThreadsPerBlock, VecSize) \ + GroupNormNHWCSumKernel \ + <<StreamHandle()>>>( \ + params->skip_workspace, params->group_sum_buffer, params->src, params->skip, params->bias, \ + params->channels_per_block, params->hw_per_block, params->hw, params->hwc, params->c, \ + params->channels_per_group, params->groups, params->groups_per_block, params->broadcast_skip); \ break; - switch (params->cPerBlock) { - case 320: - LAUNCH_GROUPNORM_SUM(256, 2) - case 480: - LAUNCH_GROUPNORM_SUM(256, 2) + // Threads_per_block is half of values in kSizes since CHANNELS_PER_THREAD = 2. + switch (params->threads_per_block) { case 256: - LAUNCH_GROUPNORM_SUM(128, 2) + LAUNCH_GROUPNORM_SUM(256, CHANNELS_PER_THREAD) + case 192: + LAUNCH_GROUPNORM_SUM(192, CHANNELS_PER_THREAD) + case 160: + LAUNCH_GROUPNORM_SUM(160, CHANNELS_PER_THREAD) case 128: - LAUNCH_GROUPNORM_SUM(64, 2) + LAUNCH_GROUPNORM_SUM(128, CHANNELS_PER_THREAD) + case 64: + LAUNCH_GROUPNORM_SUM(64, CHANNELS_PER_THREAD) default: ORT_NOT_IMPLEMENTED("Not implemented"); } } template -Status GroupNormNHWCSumOp(const GroupNormNHWCParams* params) { +Status GroupNormNHWCSumOp(const GroupNormNHWCTunableParams* params) { dim3 grid; - grid.x = params->c / params->cPerBlock; - grid.y = CeilDiv(params->hw, params->hwPerBlock); + grid.x = DivUp(params->c, params->channels_per_block); + grid.y = DivUp(params->hw, params->hw_per_block); grid.z = params->n; - groupNormNHWCSumKernel + GroupNormNHWCSumKernel <<StreamHandle()>>>( - params->src, params->redBuffer, params->cPerBlock, params->hwPerBlock, - params->hw, params->hwc, params->c, params->cPerGroup, params->groups, params->groupsPerBlock); + params->skip_workspace, params->group_sum_buffer, params->src, params->skip, params->bias, + params->channels_per_block, params->hw_per_block, params->hw, params->hwc, params->c, + params->channels_per_group, params->groups, params->groups_per_block, params->broadcast_skip); return HIP_CALL(hipGetLastError()); } template -void groupNormNHWCScale(const GroupNormNHWCParams* params) { - // Make sure the dimensions are aligned with what we expect. - ORT_ENFORCE(params->c % params->cPerBlock == 0); - // Make sure a group does not span multiple blocks. - ORT_ENFORCE(params->cPerBlock % params->cPerGroup == 0); - +void GroupNormNHWCScale(const GroupNormNHWCTunableParams* params) { dim3 grid; // The number of blocks to compute all the channels. - grid.x = params->c / params->cPerBlock; + grid.x = DivUp(params->c, params->channels_per_block); // The number of blocks to compute all the activations in a given instance. - grid.y = CeilDiv(params->hw, params->hwPerBlock); + grid.y = DivUp(params->hw, params->hw_per_block); // The number of instances. grid.z = params->n; -#define LAUNCH_GROUPNORM_SCALE(ThreadsPerBlock, VecSize) \ - groupNormNHWCScaleKernel \ - <<StreamHandle()>>>( \ - params->dst, params->src, params->gamma, params->beta, \ - params->redBuffer, params->epsilon, params->c, params->cPerBlock, \ - params->cPerGroup, params->groups, params->hwc, params->invHWC, \ - params->hw, params->hwPerBlock, params->withSwish); \ +#define LAUNCH_GROUPNORM_SCALE(ThreadsPerBlock, VecSize) \ + GroupNormNHWCScaleKernel \ + <<StreamHandle()>>>( \ + params->dst, params->src, params->skip, params->gamma, params->beta, params->skip_workspace, \ + params->group_sum_buffer, params->epsilon, params->c, params->channels_per_block, \ + params->channels_per_group, params->groups, params->hwc, params->inv_hw_channels_per_group, \ + params->hw, params->hw_per_block, params->use_silu); \ break; - switch (params->cPerBlock) { - case 320: - LAUNCH_GROUPNORM_SCALE(256, 2) - case 480: - LAUNCH_GROUPNORM_SCALE(256, 2) + // Threads_per_block is half of values in kSizes since CHANNELS_PER_THREAD = 2. + switch (params->threads_per_block) { case 256: - LAUNCH_GROUPNORM_SCALE(128, 2) + LAUNCH_GROUPNORM_SCALE(256, CHANNELS_PER_THREAD) + case 192: + LAUNCH_GROUPNORM_SCALE(192, CHANNELS_PER_THREAD) + case 160: + LAUNCH_GROUPNORM_SCALE(160, CHANNELS_PER_THREAD) case 128: - LAUNCH_GROUPNORM_SCALE(64, 2) + LAUNCH_GROUPNORM_SCALE(128, CHANNELS_PER_THREAD) + case 64: + LAUNCH_GROUPNORM_SCALE(64, CHANNELS_PER_THREAD) default: ORT_NOT_IMPLEMENTED("Not implemented"); } } template -Status GroupNormNHWCScaleOp(const GroupNormNHWCParams* params) { +Status GroupNormNHWCScaleOp(const GroupNormNHWCTunableParams* params) { dim3 grid; - grid.x = params->c / params->cPerBlock; - grid.y = CeilDiv(params->hw, params->hwPerBlock); + grid.x = DivUp(params->c, params->channels_per_block); + grid.y = DivUp(params->hw, params->hw_per_block); grid.z = params->n; - groupNormNHWCScaleKernel + GroupNormNHWCScaleKernel <<StreamHandle()>>>( - params->dst, params->src, params->gamma, params->beta, params->redBuffer, params->epsilon, params->c, params->cPerBlock, - params->cPerGroup, params->groups, params->hwc, params->invHWC, params->hw, params->hwPerBlock, params->withSwish); + params->dst, params->src, params->skip, params->gamma, params->beta, params->skip_workspace, + params->group_sum_buffer, params->epsilon, params->c, params->channels_per_block, params->channels_per_group, + params->groups, params->hwc, params->inv_hw_channels_per_group, params->hw, params->hw_per_block, + params->use_silu); return HIP_CALL(hipGetLastError()); } template class GroupNormNHWCOp { public: - Status operator()(const GroupNormNHWCParams* params) { - HIP_RETURN_IF_ERROR(hipMemsetAsync(params->redBuffer, 0, GetGroupNormWorkspaceSizeInBytes(), params->StreamHandle())); + Status operator()(const GroupNormNHWCTunableParams* params) { + HIP_RETURN_IF_ERROR(hipMemsetAsync(params->group_sum_buffer, + 0, + GetGroupNormWorkspaceSizeInBytes(params->n, params->groups), + params->StreamHandle())); auto status = GroupNormNHWCSumOp(params); ORT_RETURN_IF_ERROR(status); HIP_RETURN_IF_ERROR(hipGetLastError()); @@ -138,29 +140,30 @@ class GroupNormNHWCOp { return Status::OK(); } - Status IsSupported(const GroupNormNHWCParams* params) { + Status IsSupported(const GroupNormNHWCTunableParams* params) { TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( - !(params->c % VecSize == 0 && params->cPerGroup % VecSize == 0), - "The number of channels (", params->c, ") or the number of channels per group (", params->cPerGroup, + !(params->c % VecSize == 0 && params->channels_per_group % VecSize == 0), + "The number of channels (", params->c, ") or the number of channels per group (", params->channels_per_group, ") isn't divisible by the number of vector size: ", VecSize); - TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(!(params->cPerBlock % params->cPerGroup == 0 && - params->c % params->cPerBlock == 0 && params->hw % params->hwPerBlock == 0), - "The value of attributes don't meet the requirements."); - TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(!(params->cPerBlock <= ThreadsPerBlock * VecSize && - params->cPerBlock > (ThreadsPerBlock - GPU_WARP_SIZE) * VecSize), + TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(!(params->channels_per_block <= ThreadsPerBlock * VecSize && + params->channels_per_block > (ThreadsPerBlock - GPU_WARP_SIZE) * VecSize), "Configuration: Threads (", ThreadsPerBlock, "), vector size (", - VecSize, ") is redundant for the number of channels per group: ", params->cPerBlock); + VecSize, ") is redundant for the number of channels per group: ", + params->channels_per_block); return Status::OK(); } }; template -Status GroupNormNHWCStaticSelection(const GroupNormNHWCParams* params) { - HIP_RETURN_IF_ERROR(hipMemsetAsync(params->redBuffer, 0, GetGroupNormWorkspaceSizeInBytes(), params->StreamHandle())); - groupNormNHWCSum(params); +Status GroupNormNHWCStaticSelection(const GroupNormNHWCTunableParams* params) { + HIP_RETURN_IF_ERROR(hipMemsetAsync(params->group_sum_buffer, + 0, + GetGroupNormWorkspaceSizeInBytes(params->n, params->groups), + params->StreamHandle())); + GroupNormNHWCSum(params); HIP_RETURN_IF_ERROR(hipGetLastError()); - groupNormNHWCScale(params); + GroupNormNHWCScale(params); HIP_RETURN_IF_ERROR(hipGetLastError()); return Status::OK(); } @@ -178,30 +181,30 @@ Status GroupNormNHWCStaticSelection(const GroupNormNHWCParams* params) { ADD_OP_FOR_ALL_VEC_SIZE(name, 320) template -class GroupNormNHWCTunableOp : public TunableOp> { +class GroupNormNHWCTunableOp : public TunableOp> { public: GroupNormNHWCTunableOp() { this->RegisterOp(GroupNormNHWCStaticSelection); ADD_OP_FOR_ALL_THREADS_PER_BLOCK_ALL_VEC_SIZE(GroupNormNHWCOp) #ifdef USE_COMPOSABLE_KERNEL - for (auto&& [_, op] : GetCKGroupNormNHWCTypeStringAndOps()) { + for (auto&& [_, op] : GetCKGroupNormNHWCTypeStringAndOps()) { ORT_UNUSED_PARAMETER(_); this->RegisterOp(std::move(op)); } - for (auto&& [_, op] : GetCKGroupNormNHWCTypeStringAndOps()) { + for (auto&& [_, op] : GetCKGroupNormNHWCTypeStringAndOps()) { ORT_UNUSED_PARAMETER(_); this->RegisterOp(std::move(op)); } #endif // USE_COMPOSABLE_KERNEL #ifdef USE_TRITON_KERNEL - for (auto&& [_, op] : GetTritonGroupNormNHWCTypeStringAndOps()) { + for (auto&& [_, op] : GetTritonGroupNormNHWCTypeStringAndOps()) { ORT_UNUSED_PARAMETER(_); this->RegisterOp(std::move(op)); } - for (auto&& [_, op] : GetTritonGroupNormNHWCTypeStringAndOps()) { + for (auto&& [_, op] : GetTritonGroupNormNHWCTypeStringAndOps()) { ORT_UNUSED_PARAMETER(_); this->RegisterOp(std::move(op)); } diff --git a/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc b/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc index 55cd6a1d112f5..382a3951f3a83 100644 --- a/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc @@ -93,6 +93,7 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, Samp class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float, ScaledTanh); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, double, ScaledTanh); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16, ScaledTanh); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, SkipGroupNorm); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, SkipLayerNormalization); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, SkipLayerNormalization); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, SkipSimplifiedLayerNormalization); @@ -246,6 +247,7 @@ Status RegisterRocmContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/groupnorm_test.py b/onnxruntime/python/tools/kernel_explorer/kernels/groupnorm_test.py index e32cb032798fc..8334d20e47c86 100644 --- a/onnxruntime/python/tools/kernel_explorer/kernels/groupnorm_test.py +++ b/onnxruntime/python/tools/kernel_explorer/kernels/groupnorm_test.py @@ -35,7 +35,11 @@ def sigmoid_function(x): return 1.0 / (1.0 + np.exp(-x)) -def group_norm(input_x, gamma, beta, num_groups, epsilon, with_swish): +def group_norm(input_x, skip_x, bias_x, gamma, beta, num_groups, epsilon, with_silu, has_skip): + add_output = None + if has_skip: + input_x = input_x + skip_x + bias_x + add_output = input_x n, h, w, c = input_x.shape input_x = input_x.transpose([0, 3, 1, 2]) assert c % num_groups == 0 @@ -45,46 +49,70 @@ def group_norm(input_x, gamma, beta, num_groups, epsilon, with_swish): x = x.transpose([0, 2, 3, 1]) x = x * gamma + beta - if with_swish: + if with_silu: x = x * sigmoid_function(x) - return x + return x, add_output -def run_group_norm(batch_size: int, height: int, num_channels: int, num_groups: int, dtype: str, swish: bool, func): +def run_group_norm( + batch_size: int, height: int, num_channels: int, num_groups: int, dtype: str, silu: bool, has_skip: bool, func +): np.random.seed(0) width = height input_x = np.random.rand(batch_size, height, width, num_channels).astype(np.float32) gamma = np.random.rand(num_channels).astype(np.float32) beta = np.random.rand(num_channels).astype(np.float32) # the size of workspace is defined in onnxruntime/contrib_ops/cuda/diffusion/group_norm_impl.h L18 - workspace = np.random.rand((np.dtype(np.float32).itemsize * 2) * 32 * 32).astype(np.float32) + workspace = np.random.rand((np.dtype(np.float32).itemsize * 2) * batch_size * num_groups).astype(np.float32) epsilon = 1e-05 output_y = np.random.rand(batch_size, height, width, num_channels).astype(dtype) - use_swish = swish - host_x = input_x.astype(dtype) - input_d = ke.DeviceArray(host_x) + skip_x = ( + np.random.rand(batch_size, height, width, num_channels).astype(np.float32) + if has_skip + else np.empty((0), dtype=dtype) + ) + bias_x = np.random.rand(num_channels).astype(np.float32) if has_skip else np.empty((0), dtype=dtype) + add_output = ( + np.random.rand(batch_size, height, width, num_channels).astype(dtype) + if has_skip + else np.empty((0), dtype=dtype) + ) + use_silu = silu + broadcast_skip = False + channels_per_block = 0 # Compute in params initialization + + input_d = ke.DeviceArray(input_x.astype(dtype)) + skip_d = ke.DeviceArray(skip_x.astype(dtype)) + bias_d = ke.DeviceArray(bias_x.astype(dtype)) gamma_d = ke.DeviceArray(gamma) beta_d = ke.DeviceArray(beta) workspace_d = ke.DeviceArray(workspace) y_d = ke.DeviceArray(output_y) + y_add_d = ke.DeviceArray(add_output) f = getattr(ke, func) my_op = f( y_d, - workspace_d, + y_add_d, input_d, + skip_d, + bias_d, gamma_d, beta_d, + workspace_d, + epsilon, batch_size, + num_channels, height, width, - num_channels, num_groups, - epsilon, - use_swish, + use_silu, + broadcast_skip, + channels_per_block, ) - y_ref = group_norm(input_x, gamma, beta, num_groups, epsilon, use_swish).astype(dtype) + y_ref, y_add_d_ref = group_norm(input_x, skip_x, bias_x, gamma, beta, num_groups, epsilon, use_silu, has_skip) + y_ref = y_ref.astype(dtype) for impl in my_op.ListOps(): if not my_op.SelectOp(impl): @@ -95,6 +123,10 @@ def run_group_norm(batch_size: int, height: int, num_channels: int, num_groups: y_d.UpdateHostNumpyArray() np.testing.assert_allclose(y_ref, output_y, atol=1e-02) + if has_skip: + y_add_d_ref = y_add_d_ref.astype(dtype) + y_add_d.UpdateHostNumpyArray() + np.testing.assert_allclose(y_add_d_ref, add_output, atol=1e-02) dtypes = ["float32", "float16"] @@ -102,19 +134,21 @@ def run_group_norm(batch_size: int, height: int, num_channels: int, num_groups: @pytest.mark.parametrize("sd_sizes", get_sd_sizes()) @pytest.mark.parametrize("dtype", dtypes) -@pytest.mark.parametrize("swish", [True]) -def test_group_norm(sd_sizes, dtype, swish): +@pytest.mark.parametrize("silu", [True]) +@pytest.mark.parametrize("has_skip", [True, False]) +def test_group_norm(sd_sizes, dtype, silu, has_skip): for func in dtype_to_funcs(dtype): - run_group_norm(*sd_sizes, dtype, swish, func) + run_group_norm(*sd_sizes, dtype, silu, has_skip, func) @pytest.mark.parametrize("sd_sizes", get_sd_sizes()) @pytest.mark.parametrize("dtype", dtypes) -@pytest.mark.parametrize("swish", [True]) -def test_group_norm_ck(sd_sizes, dtype, swish): - swish_suffix = "Swish" if swish else "Pass" - ck_f_name = "CKGroupNormNHWC" + swish_suffix + "_" + dtype_to_suffix(dtype) - run_group_norm(*sd_sizes, dtype, swish, ck_f_name) +@pytest.mark.parametrize("silu", [True]) +@pytest.mark.parametrize("has_skip", [False]) +def test_group_norm_ck(sd_sizes, dtype, silu, has_skip): + silu_suffix = "Silu" if silu else "Pass" + ck_f_name = "CKGroupNormNHWC" + silu_suffix + "_" + dtype_to_suffix(dtype) + run_group_norm(*sd_sizes, dtype, silu, has_skip, ck_f_name) @dataclass @@ -136,37 +170,67 @@ def report(self): def profile_group_norm_func( - batch_size: int, height: int, width: int, num_channels: int, num_groups: int, dtype: str, swish: bool, func + batch_size: int, + height: int, + width: int, + num_channels: int, + num_groups: int, + dtype: str, + silu: bool, + has_skip: bool, + func, ): np.random.seed(0) input_x = np.random.rand(batch_size, height, width, num_channels).astype(dtype) gamma = np.random.rand(num_channels).astype(np.float32) beta = np.random.rand(num_channels).astype(np.float32) - workspace = np.random.rand(np.dtype(np.float32).itemsize * 2 * 32 * 32).astype(np.float32) + workspace = np.random.rand(np.dtype(np.float32).itemsize * 2 * batch_size * num_groups).astype(np.float32) epsilon = 0.05 output_y = np.random.rand(batch_size, height, width, num_channels).astype(dtype) - use_swish = swish + + skip_x = ( + np.random.rand(batch_size, height, width, num_channels).astype(dtype) + if has_skip + else np.empty((0), dtype=dtype) + ) + bias_x = np.random.rand(num_channels).astype(dtype) if has_skip else np.empty((0), dtype=dtype) + add_output = ( + np.random.rand(batch_size, height, width, num_channels).astype(dtype) + if has_skip + else np.empty((0), dtype=dtype) + ) + use_silu = silu + broadcast_skip = False + channels_per_block = 0 # Compute in params initialization input_d = ke.DeviceArray(input_x) + skip_d = ke.DeviceArray(skip_x) + bias_d = ke.DeviceArray(bias_x) gamma_d = ke.DeviceArray(gamma) beta_d = ke.DeviceArray(beta) workspace_d = ke.DeviceArray(workspace) y_d = ke.DeviceArray(output_y) + y_add_d = ke.DeviceArray(add_output) f = getattr(ke, func) my_op = f( y_d, - workspace_d, + y_add_d, input_d, + skip_d, + bias_d, gamma_d, beta_d, + workspace_d, + epsilon, batch_size, + num_channels, height, width, - num_channels, num_groups, - epsilon, - use_swish, + use_silu, + broadcast_skip, + channels_per_block, ) for impl in my_op.ListOps(): duration_ms = -1 @@ -181,14 +245,14 @@ def profile_group_norm_func( ) -def profile_with_args(batch_size, height, width, num_channels, num_groups, dtype, swish=True, sort=True): +def profile_with_args(batch_size, height, width, num_channels, num_groups, dtype, silu=True, has_skip=True, sort=True): with ke.benchmark(sort): for func in dtype_to_funcs(dtype): - profile_group_norm_func(batch_size, height, width, num_channels, num_groups, dtype, swish, func) + profile_group_norm_func(batch_size, height, width, num_channels, num_groups, dtype, silu, has_skip, func) # ck function - swish_suffix = "Swish" if swish else "Pass" - ck_f_name = "CKGroupNormNHWC" + swish_suffix + "_" + dtype_to_suffix(dtype) - profile_group_norm_func(batch_size, height, width, num_channels, num_groups, dtype, swish, ck_f_name) + silu_suffix = "Silu" if silu else "Pass" + ck_f_name = "CKGroupNormNHWC" + silu_suffix + "_" + dtype_to_suffix(dtype) + profile_group_norm_func(batch_size, height, width, num_channels, num_groups, dtype, silu, has_skip, ck_f_name) sd_profile_sizes = [ @@ -227,7 +291,8 @@ def profile(): group.add_argument("num_channels", type=int) group.add_argument("num_groups", type=int) group.add_argument("dtype", choices=dtypes) - group.add_argument("--swish", action="store_true") + group.add_argument("--silu", action="store_true") + group.add_argument("--has_skip", action="store_true") group.add_argument("--sort", action="store_true") if len(sys.argv) == 1: @@ -241,6 +306,7 @@ def profile(): args.num_channels, args.num_groups, args.dtype, - args.swish, + args.silu, + args.has_skip, args.sort, ) diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/rocm/group_norm.cu b/onnxruntime/python/tools/kernel_explorer/kernels/rocm/group_norm.cu index 0bd47b2c0387e..6af163ab94b10 100644 --- a/onnxruntime/python/tools/kernel_explorer/kernels/rocm/group_norm.cu +++ b/onnxruntime/python/tools/kernel_explorer/kernels/rocm/group_norm.cu @@ -12,17 +12,21 @@ #include "python/tools/kernel_explorer/kernel_explorer_interface.h" namespace py = pybind11; - +using onnxruntime::contrib::rocm::GetGroupNormWorkspaceSizeInBytes; namespace onnxruntime { template class GroupNormNHWC : public IKernelExplorer { public: - GroupNormNHWC(DeviceArray& output, DeviceArray& workspace, DeviceArray& input, DeviceArray& gamma, DeviceArray& beta, - int batch_size, int height, int width, int num_channels, int num_groups, float epsilon, bool use_swish) - : params_(TuningContext(), Stream(), static_cast(output.ptr()), static_cast(workspace.ptr()), - static_cast(input.ptr()), static_cast(gamma.ptr()), static_cast(beta.ptr()), - batch_size, height, width, num_channels, num_groups, epsilon, use_swish) { + GroupNormNHWC(DeviceArray& output, DeviceArray& add_output, DeviceArray& input, DeviceArray& skip, DeviceArray& bias, + DeviceArray& gamma, DeviceArray& beta, DeviceArray& workspace, float epsilon, + int batch_size, int num_channels, int height, int width, int num_groups, bool use_silu, + bool broadcast_skip, int channels_per_block) + : params_(TuningContext(), Stream(), static_cast(output.ptr()), static_cast(add_output.ptr()), + static_cast(input.ptr()), static_cast(skip.ptr()), static_cast(bias.ptr()), + static_cast(gamma.ptr()), static_cast(beta.ptr()), static_cast(workspace.ptr()), + epsilon, batch_size, num_channels, height, width, num_groups, use_silu, broadcast_skip, + channels_per_block) { type_string_ = "GroupNormNHWC_" + std::to_string(ThreadsPerBlock) + "_" + std::to_string(VecSize); } @@ -40,7 +44,7 @@ class GroupNormNHWC : public IKernelExplorer { } private: - using ParamsT = contrib::rocm::GroupNormNHWCParams; + using ParamsT = contrib::rocm::GroupNormNHWCTunableParams; ParamsT params_{}; contrib::rocm::GroupNormNHWCOp op_{}; std::string type_string_{}; @@ -49,11 +53,15 @@ class GroupNormNHWC : public IKernelExplorer { template class GroupNormNHWCStaticSelection : public IKernelExplorer { public: - GroupNormNHWCStaticSelection(DeviceArray& output, DeviceArray& workspace, DeviceArray& input, DeviceArray& gamma, DeviceArray& beta, - int batch_size, int height, int width, int num_channels, int num_groups, float epsilon, bool use_swish) - : params_(TuningContext(), Stream(), static_cast(output.ptr()), static_cast(workspace.ptr()), - static_cast(input.ptr()), static_cast(gamma.ptr()), static_cast(beta.ptr()), - batch_size, height, width, num_channels, num_groups, epsilon, use_swish) { + GroupNormNHWCStaticSelection(DeviceArray& output, DeviceArray& add_output, DeviceArray& input, DeviceArray& skip, + DeviceArray& bias, DeviceArray& gamma, DeviceArray& beta, DeviceArray& workspace, + float epsilon, int batch_size, int num_channels, int height, int width, int num_groups, + bool use_silu, bool broadcast_skip, int channels_per_block) + : params_(TuningContext(), Stream(), static_cast(output.ptr()), static_cast(add_output.ptr()), + static_cast(input.ptr()), static_cast(skip.ptr()), static_cast(bias.ptr()), + static_cast(gamma.ptr()), static_cast(beta.ptr()), static_cast(workspace.ptr()), + epsilon, batch_size, num_channels, height, width, num_groups, use_silu, broadcast_skip, + channels_per_block) { type_string_ = "GroupNormNHWCStaticSelection"; } @@ -71,7 +79,7 @@ class GroupNormNHWCStaticSelection : public IKernelExplorer { } private: - using ParamsT = contrib::rocm::GroupNormNHWCParams; + using ParamsT = contrib::rocm::GroupNormNHWCTunableParams; ParamsT params_{}; std::string type_string_{}; }; @@ -79,11 +87,15 @@ class GroupNormNHWCStaticSelection : public IKernelExplorer { template class GroupNormNHWCTunable : public IKernelExplorer { public: - GroupNormNHWCTunable(DeviceArray& output, DeviceArray& workspace, DeviceArray& input, DeviceArray& gamma, DeviceArray& beta, - int batch_size, int height, int width, int num_channels, int num_groups, float epsilon, bool use_swish) - : params_(TuningContext(), Stream(), static_cast(output.ptr()), static_cast(workspace.ptr()), - static_cast(input.ptr()), static_cast(gamma.ptr()), static_cast(beta.ptr()), - batch_size, height, width, num_channels, num_groups, epsilon, use_swish) { + GroupNormNHWCTunable(DeviceArray& output, DeviceArray& add_output, DeviceArray& input, DeviceArray& skip, + DeviceArray& bias, DeviceArray& gamma, DeviceArray& beta, DeviceArray& workspace, + float epsilon, int batch_size, int num_channels, int height, int width, int num_groups, + bool use_silu, bool broadcast_skip, int channels_per_block) + : params_(TuningContext(), Stream(), static_cast(output.ptr()), static_cast(add_output.ptr()), + static_cast(input.ptr()), static_cast(skip.ptr()), static_cast(bias.ptr()), + static_cast(gamma.ptr()), static_cast(beta.ptr()), static_cast(workspace.ptr()), + epsilon, batch_size, num_channels, height, width, num_groups, use_silu, broadcast_skip, + channels_per_block) { params_.TuningContext()->EnableTunableOpAndTuning(); } @@ -100,21 +112,25 @@ class GroupNormNHWCTunable : public IKernelExplorer { } private: - using ParamsT = contrib::rocm::GroupNormNHWCParams; + using ParamsT = contrib::rocm::GroupNormNHWCTunableParams; ParamsT params_{}; contrib::rocm::GroupNormNHWCTunableOp op_{}; }; #ifdef USE_COMPOSABLE_KERNEL -template +template class CKGroupNormNHWC : public IKernelExplorer { public: - CKGroupNormNHWC(DeviceArray& output, DeviceArray& workspace, DeviceArray& input, DeviceArray& gamma, DeviceArray& beta, - int batch_size, int height, int width, int num_channels, int num_groups, float epsilon, bool use_swish) - : params_(TuningContext(), Stream(), static_cast(output.ptr()), static_cast(workspace.ptr()), - static_cast(input.ptr()), static_cast(gamma.ptr()), static_cast(beta.ptr()), - batch_size, height, width, num_channels, num_groups, epsilon, use_swish) { - for (auto&& [type_string, op] : contrib::rocm::GetCKGroupNormNHWCTypeStringAndOps()) { + CKGroupNormNHWC(DeviceArray& output, DeviceArray& add_output, DeviceArray& input, DeviceArray& skip, + DeviceArray& bias, DeviceArray& gamma, DeviceArray& beta, DeviceArray& workspace, + float epsilon, int batch_size, int num_channels, int height, int width, int num_groups, + bool use_silu, bool broadcast_skip, int channels_per_block) + : params_(TuningContext(), Stream(), static_cast(output.ptr()), static_cast(add_output.ptr()), + static_cast(input.ptr()), static_cast(skip.ptr()), static_cast(bias.ptr()), + static_cast(gamma.ptr()), static_cast(beta.ptr()), static_cast(workspace.ptr()), + epsilon, batch_size, num_channels, height, width, num_groups, use_silu, broadcast_skip, + channels_per_block) { + for (auto&& [type_string, op] : contrib::rocm::GetCKGroupNormNHWCTypeStringAndOps()) { type_strings_.emplace_back(std::move(type_string)); ops_.emplace_back(std::move(op)); } @@ -141,7 +157,7 @@ class CKGroupNormNHWC : public IKernelExplorer { } private: - using ParamsT = contrib::rocm::GroupNormNHWCParams; + using ParamsT = contrib::rocm::GroupNormNHWCTunableParams; using OpT = rocm::tunable::Op; ParamsT params_{}; std::vector ops_; @@ -151,15 +167,19 @@ class CKGroupNormNHWC : public IKernelExplorer { #endif // USE_COMPOSABLE_KERNEL #ifdef USE_TRITON_KERNEL -template +template class GroupNormNHWCTriton : public IKernelExplorer { public: - GroupNormNHWCTriton(DeviceArray& output, DeviceArray& workspace, DeviceArray& input, DeviceArray& gamma, DeviceArray& beta, - int batch_size, int height, int width, int num_channels, int num_groups, float epsilon, bool use_swish) - : params_(TuningContext(), Stream(), static_cast(output.ptr()), static_cast(workspace.ptr()), - static_cast(input.ptr()), static_cast(gamma.ptr()), static_cast(beta.ptr()), - batch_size, height, width, num_channels, num_groups, epsilon, use_swish) { - for (auto&& [name, op] : contrib::rocm::GetTritonGroupNormNHWCTypeStringAndOps()) { + GroupNormNHWCTriton(DeviceArray& output, DeviceArray& add_output, DeviceArray& input, DeviceArray& skip, + DeviceArray& bias, DeviceArray& gamma, DeviceArray& beta, DeviceArray& workspace, + float epsilon, int batch_size, int num_channels, int height, int width, int num_groups, + bool use_silu, bool broadcast_skip, int channels_per_block) + : params_(TuningContext(), Stream(), static_cast(output.ptr()), static_cast(add_output.ptr()), + static_cast(input.ptr()), static_cast(skip.ptr()), static_cast(bias.ptr()), + static_cast(gamma.ptr()), static_cast(beta.ptr()), static_cast(workspace.ptr()), + epsilon, batch_size, num_channels, height, width, num_groups, use_silu, broadcast_skip, + channels_per_block) { + for (auto&& [name, op] : contrib::rocm::GetTritonGroupNormNHWCTypeStringAndOps()) { name_strings_.emplace_back(name); ops_.emplace_back(std::move(op)); } @@ -186,7 +206,7 @@ class GroupNormNHWCTriton : public IKernelExplorer { } private: - using ParamsT = contrib::rocm::GroupNormNHWCParams; + using ParamsT = contrib::rocm::GroupNormNHWCTunableParams; using OpT = rocm::tunable::Op; ParamsT params_{}; std::vector ops_; @@ -198,7 +218,8 @@ class GroupNormNHWCTriton : public IKernelExplorer { #define REGISTER_OP(name, type, threads_per_block, vec_size) \ py::class_>(m, #name "_" #type "_" #threads_per_block "_" #vec_size) \ .def(py::init()) \ + DeviceArray&, DeviceArray&, DeviceArray&, float, \ + int, int, int, int, int, bool, bool, int>()) \ .def("SetRepeats", &name::SetRepeats) \ .def("Profile", &name::Profile) \ .def("Run", &name::Run) \ @@ -220,7 +241,8 @@ class GroupNormNHWCTriton : public IKernelExplorer { #define REGISTER_COMMON(name, type, ...) \ py::class_>(m, name) \ .def(py::init()) \ + DeviceArray&, DeviceArray&, DeviceArray&, float, \ + int, int, int, int, int, bool, bool, int>()) \ .def("SetRepeats", &type<__VA_ARGS__>::SetRepeats) \ .def("Profile", &type<__VA_ARGS__>::Profile) \ .def("Run", &type<__VA_ARGS__>::Run) \ @@ -230,11 +252,11 @@ class GroupNormNHWCTriton : public IKernelExplorer { #define REGISTER_OP_TYPED(name, type) \ REGISTER_COMMON(#name "_" #type, name, type) -#define REGISTER_CK(type, with_swish, swish_suffix) \ - REGISTER_COMMON("CKGroupNormNHWC" swish_suffix "_" #type, CKGroupNormNHWC, type, with_swish) +#define REGISTER_CK(type, with_silu, silu_suffix) \ + REGISTER_COMMON("CKGroupNormNHWC" silu_suffix "_" #type, CKGroupNormNHWC, type, with_silu) -#define REGISTER_TRITON(type, with_swish, swish_suffix) \ - REGISTER_COMMON("GroupNormNHWCTriton" swish_suffix "_" #type, GroupNormNHWCTriton, type, with_swish) +#define REGISTER_TRITON(type, with_silu, silu_suffix) \ + REGISTER_COMMON("GroupNormNHWCTriton" silu_suffix "_" #type, GroupNormNHWCTriton, type, with_silu) KE_REGISTER(m) { REGISTER_OP_FOR_ALL_THREADS_PER_BLOCK_ALL_VEC_SIZE(GroupNormNHWC, half); @@ -248,16 +270,16 @@ KE_REGISTER(m) { #ifdef USE_COMPOSABLE_KERNEL REGISTER_CK(half, false, "Pass"); - REGISTER_CK(half, true, "Swish"); + REGISTER_CK(half, true, "Silu"); REGISTER_CK(float, false, "Pass"); - REGISTER_CK(float, true, "Swish"); + REGISTER_CK(float, true, "Silu"); #endif // USE_COMPOSABLE_KERNEL #ifdef USE_TRITON_KERNEL REGISTER_TRITON(half, false, "Pass"); - REGISTER_TRITON(half, true, "Swish"); + REGISTER_TRITON(half, true, "Silu"); REGISTER_TRITON(float, false, "Pass"); - REGISTER_TRITON(float, true, "Swish"); + REGISTER_TRITON(float, true, "Silu"); #endif } diff --git a/onnxruntime/test/contrib_ops/skip_group_norm_op_test.cc b/onnxruntime/test/contrib_ops/skip_group_norm_op_test.cc index fefd5722054de..ea8537f243f5d 100644 --- a/onnxruntime/test/contrib_ops/skip_group_norm_op_test.cc +++ b/onnxruntime/test/contrib_ops/skip_group_norm_op_test.cc @@ -114,16 +114,21 @@ TEST(SkipGroupNormTest, SkipGroupNorm_with_bias) { int min_cuda_architecture = 530; bool enable_cuda = HasCudaEnvironment(min_cuda_architecture); + bool enable_rocm = (nullptr != DefaultRocmExecutionProvider().get()); std::array channels_last_values = {-1, 1}; for (const int channels_last : channels_last_values) { - if (enable_cuda) { + if (enable_cuda || enable_rocm) { std::vector> execution_providers; if (enable_cuda && channels_last != 0) { execution_providers.push_back(DefaultCudaExecutionProvider()); } + if (enable_rocm && channels_last != 0) { + execution_providers.push_back(DefaultRocmExecutionProvider()); + } + // Don't run the test if no providers are supported if (execution_providers.empty()) { continue; @@ -230,6 +235,7 @@ TEST(SkipGroupNormTest, SkipGroupNorm_no_bias_broadcast_skip) { int min_cuda_architecture = 530; bool enable_cuda = HasCudaEnvironment(min_cuda_architecture); + bool enable_rocm = (nullptr != DefaultRocmExecutionProvider().get()); std::array has_add_out_values = {true, false}; std::array skip_dims = {2, 4}; @@ -237,12 +243,16 @@ TEST(SkipGroupNormTest, SkipGroupNorm_no_bias_broadcast_skip) { constexpr int channels_last = 1; for (const int skip_dim : skip_dims) { for (const bool has_add_out : has_add_out_values) { - if (enable_cuda) { + if (enable_cuda || enable_rocm) { std::vector> execution_providers; if (enable_cuda && channels_last != 0) { execution_providers.push_back(DefaultCudaExecutionProvider()); } + if (enable_rocm && channels_last != 0) { + execution_providers.push_back(DefaultRocmExecutionProvider()); + } + // Don't run the test if no providers are supported if (execution_providers.empty()) { continue; diff --git a/tools/ci_build/amd_hipify.py b/tools/ci_build/amd_hipify.py index e286236ba6447..f1d3702e3245e 100644 --- a/tools/ci_build/amd_hipify.py +++ b/tools/ci_build/amd_hipify.py @@ -181,6 +181,8 @@ def hipify(hipify_perl_path, src_file_path, dst_file_path): s = s.replace("rocm_device_prop_", "cuda_device_prop_") s = s.replace("rocm_device_arch_", "cuda_device_arch_") + s = s.replace("HipTuningContext", "RocmTuningContext") + # We want hipfft, which needs hipDataType etc, but only do this for files that have "fft" in their names # And we do this last, undoing or fixing hipify mistakes. if "fft" in src_file_path: From 124bde985ae883566c44f5cd84d351612006100c Mon Sep 17 00:00:00 2001 From: Baiju Meswani Date: Tue, 20 Feb 2024 19:20:42 -0800 Subject: [PATCH 16/68] Bring QAT POC back to a functional state (#19290) --- .../test/python/qat_poc_example/README.md | 2 +- .../test/python/qat_poc_example/model.py | 56 +++++++------------ .../test/python/qat_poc_example/qat.py | 2 +- .../test/python/qat_poc_example/train.py | 18 ++---- 4 files changed, 27 insertions(+), 51 deletions(-) diff --git a/orttraining/orttraining/test/python/qat_poc_example/README.md b/orttraining/orttraining/test/python/qat_poc_example/README.md index 6840e98bd9c86..05072b410b730 100644 --- a/orttraining/orttraining/test/python/qat_poc_example/README.md +++ b/orttraining/orttraining/test/python/qat_poc_example/README.md @@ -48,7 +48,7 @@ We use `onnxruntime.training.onnxblock` to perform the above operations to get t > **_NOTE:_** As of this writing, ORT does not have its own `"Observers"`. Instead, we rely on the `onnxruntime.quantization` tool to quantize the model and give us an initial estimate of the quantization parameters using its calibration process. Here the calibration process is used as a substitute for the observers to present the POC. -> **_NOTE:_** Typically, the weights in the statically quantized onnx model is associated with a DQ node only (not the QDQ pair) since weights are quantized. However, QAT requires weights and biases to be non quantized. We ensure that the weights have dedicated QDQ pair by passing in the flag AddQDQPairToWeight=True` +> **_NOTE:_** Typically, the weights in the statically quantized onnx model is associated with a DQ node only (not the QDQ pair) since weights are quantized. However, QAT requires weights and biases to be non quantized. We ensure that the weights have dedicated QDQ pair by passing in the flag `AddQDQPairToWeight=True` > **_NOTE:_** Typically, the bias term in the statically quantized onnx model is associated with a DQ node only (not the QDQ pair) since it is quantized as int32 as opposed to int8. So, we disable quantizing the bias term using the flag QuantizeBias=False` diff --git a/orttraining/orttraining/test/python/qat_poc_example/model.py b/orttraining/orttraining/test/python/qat_poc_example/model.py index 91d7ccd7294f5..601362a59e379 100644 --- a/orttraining/orttraining/test/python/qat_poc_example/model.py +++ b/orttraining/orttraining/test/python/qat_poc_example/model.py @@ -5,7 +5,7 @@ import onnx import torch -import onnxruntime.training.onnxblock as onnxblock +from onnxruntime.training import artifacts class MNIST(torch.nn.Module): @@ -96,42 +96,26 @@ def create_training_artifacts(model_path, artifacts_dir, model_prefix): 4. The checkpoint file """ - class MNISTWithLoss(onnxblock.TrainingModel): - def __init__(self): - super().__init__() - self.loss = onnxblock.loss.CrossEntropyLoss() - - def build(self, output_name): - return self.loss(output_name) - - mnist_with_loss = MNISTWithLoss() - onnx_model, eval_model, optimizer_model = onnx.load(model_path), None, None - - # Build the training and eval graphs - logging.info("Using onnxblock to create the training artifacts.") - with onnxblock.onnx_model(onnx_model) as model_accessor: - _ = mnist_with_loss(onnx_model.graph.output[0].name) - eval_model = model_accessor.eval_model - - # Build the optimizer graph - optimizer = onnxblock.optim.AdamW() - with onnxblock.onnx_model() as accessor: - _ = optimizer(mnist_with_loss.parameters()) - optimizer_model = accessor.model + onnx_model = onnx.load(model_path) + + requires_grad = [ + param.name + for param in onnx_model.graph.initializer + if (not param.name.endswith("_scale") and not param.name.endswith("_zero_point")) + ] + artifacts.generate_artifacts( + onnx_model, + requires_grad=requires_grad, + loss=artifacts.LossType.CrossEntropyLoss, + optimizer=artifacts.OptimType.AdamW, + artifact_directory=artifacts_dir, + prefix=model_prefix, + ) # Create the training artifacts - train_model_path = os.path.join(artifacts_dir, f"{model_prefix}_train.onnx") - logging.info(f"Saving the training model to {train_model_path}.") - onnx.save(onnx_model, train_model_path) - eval_model_path = os.path.join(artifacts_dir, f"{model_prefix}_eval.onnx") - logging.info(f"Saving the eval model to {eval_model_path}.") - onnx.save(eval_model, eval_model_path) - optimizer_model_path = os.path.join(artifacts_dir, f"{model_prefix}_optimizer.onnx") - logging.info(f"Saving the optimizer model to {optimizer_model_path}.") - onnx.save(optimizer_model, optimizer_model_path) - trainable_params, non_trainable_params = mnist_with_loss.parameters() - checkpoint_path = os.path.join(artifacts_dir, f"{model_prefix}_checkpoint.ckpt") - logging.info(f"Saving the checkpoint to {checkpoint_path}.") - onnxblock.save_checkpoint((trainable_params, non_trainable_params), checkpoint_path) + train_model_path = os.path.join(artifacts_dir, f"{model_prefix}training_model.onnx") + eval_model_path = os.path.join(artifacts_dir, f"{model_prefix}eval_model.onnx") + optimizer_model_path = os.path.join(artifacts_dir, f"{model_prefix}optimizer_model.onnx") + checkpoint_path = os.path.join(artifacts_dir, f"{model_prefix}checkpoint") return train_model_path, eval_model_path, optimizer_model_path, checkpoint_path diff --git a/orttraining/orttraining/test/python/qat_poc_example/qat.py b/orttraining/orttraining/test/python/qat_poc_example/qat.py index 51a15475ee911..dcc9e116fda7d 100644 --- a/orttraining/orttraining/test/python/qat_poc_example/qat.py +++ b/orttraining/orttraining/test/python/qat_poc_example/qat.py @@ -46,7 +46,7 @@ ) logging.info("Preparing the training artifacts for QAT.") - training_model_name = "mnist_qat" + training_model_name = "mnist_qat_" artifacts_dir = os.path.join(model_dir, "training_artifacts") utils.makedir(artifacts_dir) training_artifacts = create_training_artifacts( diff --git a/orttraining/orttraining/test/python/qat_poc_example/train.py b/orttraining/orttraining/test/python/qat_poc_example/train.py index 9a429d2adc6f1..a25c071c58a48 100644 --- a/orttraining/orttraining/test/python/qat_poc_example/train.py +++ b/orttraining/orttraining/test/python/qat_poc_example/train.py @@ -26,14 +26,10 @@ def _train_epoch(model, optimizer, train_loader): model.train() cumulative_loss = 0 for data, target in train_loader: - forward_inputs = [ - data.reshape(len(data), 784).numpy(), - target.numpy().astype(np.int32), - ] - train_loss = model(forward_inputs) + train_loss = model(data.reshape(len(data), 784).numpy(), target.numpy().astype(np.int64)) optimizer.step() model.lazy_reset_grad() - cumulative_loss += train_loss[0] + cumulative_loss += train_loss return cumulative_loss / len(train_loader) @@ -43,12 +39,8 @@ def _eval(model, test_loader): model.eval() cumulative_loss = 0 for data, target in test_loader: - forward_inputs = [ - data.reshape(len(data), 784).numpy(), - target.numpy().astype(np.int32), - ] - test_loss = model(forward_inputs) - cumulative_loss += test_loss[0] + test_loss = model(data.reshape(len(data), 784).numpy(), target.numpy().astype(np.int64)) + cumulative_loss += test_loss return cumulative_loss / len(test_loader) @@ -65,7 +57,7 @@ def train_model(qat_train_model, qat_eval_model, qat_optimizer_model, qat_checkp train_loader, test_loader = _get_dataloaders("data", batch_size) # Load the checkpoint state. - state = orttraining.CheckpointState(qat_checkpoint) + state = orttraining.CheckpointState.load_checkpoint(qat_checkpoint) # Create the training module. model = orttraining.Module(qat_train_model, state, qat_eval_model) From 8092a89688f92dee83d1d0111acaa1e1d2dfdb85 Mon Sep 17 00:00:00 2001 From: satyajandhyala Date: Tue, 20 Feb 2024 21:18:54 -0800 Subject: [PATCH 17/68] Changed command line argpasrse to process '--symmetric [True|False]'. (#19577) ### Description Accept the command line option --symmetric and its optional value correctly. If the optional value matches uncased to 'True' then set symmetric to True else set symmetric to False. Asymmetric quantization will generate zero_point input. ``` usage: matmul_4bits_quantizer.py [-h] --input_model INPUT_MODEL --output_model OUTPUT_MODEL [--block_size BLOCK_SIZE] [--symmetric [{True,False}]] [--accuracy_level ACCURACY_LEVEL] [-v] [--nodes_to_exclude NODES_TO_EXCLUDE [NODES_TO_EXCLUDE ...]] ``` ### Motivation and Context --- .../python/tools/quantization/matmul_4bits_quantizer.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py index 3e9f9a6544a71..eb7bbec997d59 100644 --- a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py +++ b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py @@ -349,6 +349,10 @@ def process(self): self.int4_quant_algo() +def ort_convert_str_to_bool(value): + return value.lower() in ("true", "1") + + def parse_args(): parser = argparse.ArgumentParser( description="""Blockwise int4 quantization for MatMul 2D weight matrices. @@ -366,7 +370,10 @@ def parse_args(): "--symmetric", required=False, default=True, - type=bool, + const=True, + nargs="?", + type=ort_convert_str_to_bool, + choices=[True, False], help="Indicate whether to quantize the model symmetrically", ) parser.add_argument( From 58f4921686bf0a5b0442fb6df92d1b1972a118cc Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 21 Feb 2024 00:31:06 -0800 Subject: [PATCH 18/68] [js] changes to allow Float16Array if any polyfill is available (#19305) ### Description This change adds only necessary code to enable ort-web works with any Float16Array polyfill. Unlike #19302, in this PR, ort-web does not include any specific polyfill; instead, it's user's choice for how to use a polyfill. ORT-web uses Float16Array if it's available; otherwise, fallback to use Uint16Array. ```js // case 1: user does not use polyfill: import * as ort from 'onnxruntime-web'; const myF16Data = new Uint16Array(...); // need to use Uint16Array const myF16tensor = new ort.Tensor('float16', myF16Data, dims); ``` ```js // case 2: user use polyfill: import * as ort from 'onnxruntime-web'; import { Float16Array, isFloat16Array, isTypedArray, getFloat16, setFloat16, f16round, } from "@petamoriken/float16"; globalThis.Float16Array = Float16Array; // ort-web will pick the global Float16Array const myF16Data = new Float16Array(...); // Use the polyfilled Float16Array type const myF16tensor = new ort.Tensor('float16', myF16Data, dims); ``` --- js/common/lib/tensor-impl-type-mapping.ts | 34 +++++++++++++++-------- js/common/lib/tensor-impl.ts | 10 ++++--- js/web/lib/wasm/wasm-common.ts | 9 +++++- 3 files changed, 37 insertions(+), 16 deletions(-) diff --git a/js/common/lib/tensor-impl-type-mapping.ts b/js/common/lib/tensor-impl-type-mapping.ts index c4a43ea27fea1..b29cb8cbd6d35 100644 --- a/js/common/lib/tensor-impl-type-mapping.ts +++ b/js/common/lib/tensor-impl-type-mapping.ts @@ -14,7 +14,6 @@ export const NUMERIC_TENSOR_TYPE_TO_TYPEDARRAY_MAP = new Map { - if (!isBigIntChecked) { - isBigIntChecked = true; - const isBigInt64ArrayAvailable = typeof BigInt64Array !== 'undefined' && typeof BigInt64Array.from === 'function'; - const isBigUint64ArrayAvailable = - typeof BigUint64Array !== 'undefined' && typeof BigUint64Array.from === 'function'; +// a dummy type declaration for Float16Array in case any polyfill is available. +declare global { + // eslint-disable-next-line @typescript-eslint/naming-convention, @typescript-eslint/no-explicit-any + const Float16Array: any; +} + +// the following code allows delaying execution of BigInt/Float16Array checking. This allows lazy initialization for +// NUMERIC_TENSOR_TYPE_TO_TYPEDARRAY_MAP and NUMERIC_TENSOR_TYPEDARRAY_TO_TYPE_MAP, which allows BigInt/Float16Array +// polyfill if available. +let isTypedArrayChecked = false; +export const checkTypedArray = () => { + if (!isTypedArrayChecked) { + isTypedArrayChecked = true; + const isBigInt64ArrayAvailable = typeof BigInt64Array !== 'undefined' && BigInt64Array.from; + const isBigUint64ArrayAvailable = typeof BigUint64Array !== 'undefined' && BigUint64Array.from; + const isFloat16ArrayAvailable = typeof Float16Array !== 'undefined' && Float16Array.from; if (isBigInt64ArrayAvailable) { NUMERIC_TENSOR_TYPE_TO_TYPEDARRAY_MAP.set('int64', BigInt64Array); @@ -53,5 +58,12 @@ export const checkBigInt = () => { NUMERIC_TENSOR_TYPE_TO_TYPEDARRAY_MAP.set('uint64', BigUint64Array); NUMERIC_TENSOR_TYPEDARRAY_TO_TYPE_MAP.set(BigUint64Array, 'uint64'); } + if (isFloat16ArrayAvailable) { + NUMERIC_TENSOR_TYPE_TO_TYPEDARRAY_MAP.set('float16', Float16Array); + NUMERIC_TENSOR_TYPEDARRAY_TO_TYPE_MAP.set(Float16Array, 'float16'); + } else { + // if Float16Array is not available, use 'Uint16Array' to store the data. + NUMERIC_TENSOR_TYPE_TO_TYPEDARRAY_MAP.set('float16', Uint16Array); + } } }; diff --git a/js/common/lib/tensor-impl.ts b/js/common/lib/tensor-impl.ts index de18126a9d0ae..56682ef98e117 100644 --- a/js/common/lib/tensor-impl.ts +++ b/js/common/lib/tensor-impl.ts @@ -5,7 +5,7 @@ import {tensorToDataURL, tensorToImageData} from './tensor-conversion-impl.js'; import {TensorToDataUrlOptions, TensorToImageDataOptions} from './tensor-conversion.js'; import {tensorFromGpuBuffer, tensorFromImage, tensorFromPinnedBuffer, tensorFromTexture} from './tensor-factory-impl.js'; import {CpuPinnedConstructorParameters, GpuBufferConstructorParameters, TensorFromGpuBufferOptions, TensorFromImageBitmapOptions, TensorFromImageDataOptions, TensorFromImageElementOptions, TensorFromTextureOptions, TensorFromUrlOptions, TextureConstructorParameters} from './tensor-factory.js'; -import {checkBigInt, NUMERIC_TENSOR_TYPE_TO_TYPEDARRAY_MAP, NUMERIC_TENSOR_TYPEDARRAY_TO_TYPE_MAP, SupportedTypedArray, SupportedTypedArrayConstructors} from './tensor-impl-type-mapping.js'; +import {checkTypedArray, NUMERIC_TENSOR_TYPE_TO_TYPEDARRAY_MAP, NUMERIC_TENSOR_TYPEDARRAY_TO_TYPE_MAP, SupportedTypedArray, SupportedTypedArrayConstructors} from './tensor-impl-type-mapping.js'; import {calculateSize, tensorReshape} from './tensor-utils-impl.js'; import {Tensor as TensorInterface} from './tensor.js'; @@ -67,8 +67,8 @@ export class Tensor implements TensorInterface { arg0: TensorType|TensorDataType|readonly string[]|readonly boolean[]|CpuPinnedConstructorParameters| TextureConstructorParameters|GpuBufferConstructorParameters, arg1?: TensorDataType|readonly number[]|readonly string[]|readonly boolean[], arg2?: readonly number[]) { - // perform one-time check for BigInt support - checkBigInt(); + // perform one-time check for BigInt/Float16Array support + checkTypedArray(); let type: TensorType; let dims: readonly number[]; @@ -142,7 +142,9 @@ export class Tensor implements TensorInterface { throw new TypeError(`Unsupported tensor type: ${arg0}.`); } if (Array.isArray(arg1)) { - if (arg0 === 'float16') { + if (arg0 === 'float16' && typedArrayConstructor === Uint16Array) { + // When no Float16Array polyfill is used, we cannot create 'float16' tensor from number array. + // // Throw error here because when user try to use number array as data, // e.g. new Tensor('float16', [1, 2, 3, 4], dims)), it will actually call // Uint16Array.from(arg1) which generates wrong data. diff --git a/js/web/lib/wasm/wasm-common.ts b/js/web/lib/wasm/wasm-common.ts index 93910af1f1bf0..54eaf5e0c43cc 100644 --- a/js/web/lib/wasm/wasm-common.ts +++ b/js/web/lib/wasm/wasm-common.ts @@ -3,6 +3,12 @@ import {Tensor} from 'onnxruntime-common'; +// a dummy type declaration for Float16Array in case any polyfill is available. +declare global { + // eslint-disable-next-line @typescript-eslint/naming-convention, @typescript-eslint/no-explicit-any + const Float16Array: any; +} + // This file includes common definitions. They do NOT have dependency on the WebAssembly instance. /** @@ -117,7 +123,8 @@ export const tensorTypeToTypedArrayConstructor = (type: Tensor.Type): Float32Arr Uint8ArrayConstructor|Float64ArrayConstructor|Uint32ArrayConstructor|BigUint64ArrayConstructor => { switch (type) { case 'float16': - return Uint16Array; + // allow Float16Array polyfill. + return typeof Float16Array !== 'undefined' && Float16Array.from ? Float16Array : Uint16Array; case 'float32': return Float32Array; case 'uint8': From 57d6819212464f49b30db047528be0f409dadc67 Mon Sep 17 00:00:00 2001 From: Xu Xing Date: Thu, 22 Feb 2024 00:08:47 +0800 Subject: [PATCH 19/68] [js/web] Fix fused-conv is not included in npm test (#19581) BUG: https://github.com/microsoft/onnxruntime/issues/18855 ### Description ### Motivation and Context --- js/web/test/suite-test-list.jsonc | 1 + 1 file changed, 1 insertion(+) diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index 1c61518ddcdd2..b43b1ac37e37d 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -1354,6 +1354,7 @@ "expand.jsonc", "fast-gelu.jsonc", "floor.jsonc", + "fused-conv.jsonc", "gather-elements.jsonc", "gemm.jsonc", "global-average-pool.jsonc", From e5ce81ae847d0b347a3dfe95abfc9e407e2f0469 Mon Sep 17 00:00:00 2001 From: Adam Pocock Date: Wed, 21 Feb 2024 15:24:41 -0500 Subject: [PATCH 20/68] [java] Adding ML program flag for CoreML (#19551) ### Description Adds the new CoreML enum flags to enable ML Program support in Java. ### Motivation and Context Adds support for #19347 to the Java API. --- .../ai/onnxruntime/providers/CoreMLFlags.java | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/java/src/main/java/ai/onnxruntime/providers/CoreMLFlags.java b/java/src/main/java/ai/onnxruntime/providers/CoreMLFlags.java index eb124decf75f3..cec3fadf446ca 100644 --- a/java/src/main/java/ai/onnxruntime/providers/CoreMLFlags.java +++ b/java/src/main/java/ai/onnxruntime/providers/CoreMLFlags.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, 2023, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2021, 2024, Oracle and/or its affiliates. All rights reserved. * Licensed under the MIT License. */ package ai.onnxruntime.providers; @@ -14,7 +14,18 @@ public enum CoreMLFlags implements OrtFlags { /** Enables CoreML on subgraphs. */ ENABLE_ON_SUBGRAPH(2), // COREML_FLAG_ENABLE_ON_SUBGRAPH(0x002) /** Only enable usage of CoreML if the device has an Apple Neural Engine. */ - ONLY_ENABLE_DEVICE_WITH_ANE(4); // COREML_FLAG_ONLY_ENABLE_DEVICE_WITH_ANE(0x004), + ONLY_ENABLE_DEVICE_WITH_ANE(4), // COREML_FLAG_ONLY_ENABLE_DEVICE_WITH_ANE(0x004) + /** + * Only allow CoreML EP to take nodes with inputs with static shapes. By default it will also + * allow inputs with dynamic shapes. However, the performance may be negatively impacted if inputs + * have dynamic shapes. + */ + ONLY_ALLOW_STATIC_INPUT_SHAPES(8), // COREML_FLAG_ONLY_ALLOW_STATIC_INPUT_SHAPES(0x008) + /** + * Create an MLProgram. By default it will create a NeuralNetwork model. Requires Core ML 5 or + * later. + */ + CREATE_MLPROGRAM(16); // COREML_FLAG_CREATE_MLPROGRAM(0x010) /** The native value of the enum. */ public final int value; From 3afb38cfb7d4263f262dea33bcfa16d35c67fede Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Wed, 21 Feb 2024 12:46:16 -0800 Subject: [PATCH 21/68] [CUDA] Add use_tf32 cuda provider option (for FP32 Conv) (#19426) Follow up of https://github.com/microsoft/onnxruntime/pull/19357 to apply the use_tf32 option on fp32 cuDNN convolution. When use_tf32 = 0, we will disable TF32 in cuDNN convolution for FP32 inputs. https://docs.nvidia.com/deeplearning/cudnn/api/cudnn-graph-library.html#cudnnmathtype-t **CUDNN_FMA_MATH** - Restricted to only kernels that use FMA instructions. - On pre-NVIDIA A100 GPU devices, CUDNN_DEFAULT_MATH and CUDNN_FMA_MATH have the same behavior: Tensor Core kernels will not be selected. - With NVIDIA Ampere architecture and CUDA toolkit 11, CUDNN_DEFAULT_MATH permits TF32 Tensor Core operation and CUDNN_FMA_MATH does not. - The TF32 behavior for CUDNN_DEFAULT_MATH and the other Tensor Core math types can be explicitly disabled by the environment variable NVIDIA_TF32_OVERRIDE=0. --- onnxruntime/core/providers/cuda/nn/conv.cc | 17 ++++++++++++++--- onnxruntime/core/providers/cuda/nn/conv.h | 3 ++- .../core/providers/cuda/nn/conv_transpose.cc | 10 ++++++++-- .../training_ops/cuda/nn/conv_grad.cc | 3 ++- .../training_ops/cuda/nn/conv_shared.cc | 6 ++++-- .../training_ops/cuda/nn/conv_shared.h | 2 +- .../training_ops/cuda/nn/conv_transpose_grad.cc | 6 ++++-- 7 files changed, 35 insertions(+), 12 deletions(-) diff --git a/onnxruntime/core/providers/cuda/nn/conv.cc b/onnxruntime/core/providers/cuda/nn/conv.cc index 82f3503919237..a417be5a86c32 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.cc +++ b/onnxruntime/core/providers/cuda/nn/conv.cc @@ -326,7 +326,8 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) ORT_RETURN_IF_ERROR(s_.conv_desc.Set(kernel_shape.size(), pads, strides, dilations, gsl::narrow_cast(conv_attrs_.group), - CUDNN_CROSS_CORRELATION, CudnnTensor::GetDataType())); + CUDNN_CROSS_CORRELATION, CudnnTensor::GetDataType(), + UseTF32())); if (context->InputCount() >= 3) { const Tensor* B = context->Input(2); @@ -351,8 +352,13 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) if (!s_.cached_benchmark_results.contains(x_dims_cudnn)) { // set math type to tensor core before algorithm search - if constexpr (std::is_same::value) + if constexpr (std::is_same::value) { CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(s_.conv_desc, CUDNN_TENSOR_OP_MATH)); + } else if constexpr (std::is_same::value) { + if (!UseTF32()) { + CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(s_.conv_desc, CUDNN_FMA_MATH)); + } + } cudnnConvolutionFwdAlgoPerf_t perf; int algo_count = 1; @@ -399,6 +405,8 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) CUDNN_RETURN_IF_ERROR(GetWorkspaceSize(GetCudnnHandle(context), s_, perf.algo, &perf.memory)); if (std::is_same::value) { perf.mathType = CUDNN_TENSOR_OP_MATH; + } else if (std::is_same::value && !UseTF32()) { + perf.mathType = CUDNN_FMA_MATH; } else { perf.mathType = CUDNN_DEFAULT_MATH; } @@ -480,7 +488,8 @@ Status CudnnConvolutionDescriptor::Set( const gsl::span& dilations, int groups, cudnnConvolutionMode_t mode, - cudnnDataType_t data_type) { + cudnnDataType_t data_type, + bool use_tf32) { if (!desc_) CUDNN_RETURN_IF_ERROR(cudnnCreateConvolutionDescriptor(&desc_)); @@ -513,6 +522,8 @@ Status CudnnConvolutionDescriptor::Set( CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(desc_, CUDNN_DEFAULT_MATH)); if (data_type == CUDNN_DATA_HALF) { CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(desc_, CUDNN_TENSOR_OP_MATH)); + } else if (data_type == CUDNN_DATA_FLOAT && !use_tf32) { + CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(desc_, CUDNN_FMA_MATH)); } return Status::OK(); diff --git a/onnxruntime/core/providers/cuda/nn/conv.h b/onnxruntime/core/providers/cuda/nn/conv.h index bcaa4d855b81e..181fbc99fd8e9 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.h +++ b/onnxruntime/core/providers/cuda/nn/conv.h @@ -29,7 +29,8 @@ class CudnnConvolutionDescriptor final { const gsl::span& dilations, int groups, cudnnConvolutionMode_t mode, - cudnnDataType_t data_type); + cudnnDataType_t data_type, + bool use_tf32); operator cudnnConvolutionDescriptor_t() const { return desc_; } diff --git a/onnxruntime/core/providers/cuda/nn/conv_transpose.cc b/onnxruntime/core/providers/cuda/nn/conv_transpose.cc index 55dceaa2698e8..939b9959af818 100644 --- a/onnxruntime/core/providers/cuda/nn/conv_transpose.cc +++ b/onnxruntime/core/providers/cuda/nn/conv_transpose.cc @@ -167,7 +167,8 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dy cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION; ORT_RETURN_IF_ERROR(s_.conv_desc.Set(p.kernel_shape.size(), p.pads, p.strides, p.dilations, gsl::narrow_cast(conv_transpose_attrs_.group), mode, - CudnnTensor::GetDataType())); + CudnnTensor::GetDataType(), + UseTF32())); if (has_bias) { const auto& b_shape = p.B->Shape(); @@ -187,8 +188,13 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dy GetScratchBuffer(AlgoSearchWorkspaceSize, context->GetComputeStream()); // set math type to tensor core before algorithm search - if constexpr (std::is_same::value) + if constexpr (std::is_same::value) { CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(s_.conv_desc, CUDNN_TENSOR_OP_MATH)); + } else if constexpr (std::is_same::value) { + if (!UseTF32()) { + CUDNN_RETURN_IF_ERROR(cudnnSetConvolutionMathType(s_.conv_desc, CUDNN_FMA_MATH)); + } + } cudnnConvolutionBwdDataAlgoPerf_t perf; int algo_count = 1; diff --git a/orttraining/orttraining/training_ops/cuda/nn/conv_grad.cc b/orttraining/orttraining/training_ops/cuda/nn/conv_grad.cc index f6c58445c0a5d..fc5d9b65d0f89 100644 --- a/orttraining/orttraining/training_ops/cuda/nn/conv_grad.cc +++ b/orttraining/orttraining/training_ops/cuda/nn/conv_grad.cc @@ -114,7 +114,8 @@ Status ConvGrad::PrepareArgs(const Tensor& x, const Tensor& dY, const Tensor& ORT_RETURN_IF_ERROR(args_.y_tensor.Set(dy_dims, args_.params.data_type)); ORT_RETURN_IF_ERROR(args_.conv_desc.Set(kernel_shape.size(), pads, strides, dilations, gsl::narrow_cast(conv_attrs_.group), CUDNN_CROSS_CORRELATION, - args_.params.data_type)); + args_.params.data_type, + UseTF32())); if (dB) { const TensorShape& db_shape = dB->Shape(); diff --git a/orttraining/orttraining/training_ops/cuda/nn/conv_shared.cc b/orttraining/orttraining/training_ops/cuda/nn/conv_shared.cc index 5dc16c68f6210..d23905496c9bb 100644 --- a/orttraining/orttraining/training_ops/cuda/nn/conv_shared.cc +++ b/orttraining/orttraining/training_ops/cuda/nn/conv_shared.cc @@ -233,11 +233,13 @@ bool ConvParamsEqual::operator()(const ConvParams& a, const ConvParams& b) const } template -Status AlgoIterator::OnlyDefaultAlgorithm(const ConvArgs& args, std::vector& perf_results) { +Status AlgoIterator::OnlyDefaultAlgorithm(const ConvArgs& args, std::vector& perf_results, bool use_tf32) { perf_results.resize(1); perf_results[0].algo = AlgoSearch::DEFAULT_ALGO; if (args.params.data_type == CUDNN_DATA_HALF) { perf_results[0].mathType = CUDNN_TENSOR_OP_MATH; + } else if (args.params.data_type == CUDNN_DATA_FLOAT && !use_tf32) { + perf_results[0].mathType = CUDNN_FMA_MATH; } else { perf_results[0].mathType = CUDNN_DEFAULT_MATH; } @@ -256,7 +258,7 @@ Status AlgoIterator::TryAll(const CUDAExecutionProvider* provider, const std::vector perf_results; ORT_RETURN_IF_ERROR(args_.params.algo_mode == OrtCudnnConvAlgoSearchDefault - ? OnlyDefaultAlgorithm(args_, perf_results) + ? OnlyDefaultAlgorithm(args_, perf_results, provider->UseTF32()) : AlgoSearch::FindAlgorithms(args_, provider, allocator, perf_results)); for (auto& algo_perf : perf_results) { if (f(algo_perf) == Status::OK()) { diff --git a/orttraining/orttraining/training_ops/cuda/nn/conv_shared.h b/orttraining/orttraining/training_ops/cuda/nn/conv_shared.h index a2d4bf3bdc006..3fdb4306bfbbb 100644 --- a/orttraining/orttraining/training_ops/cuda/nn/conv_shared.h +++ b/orttraining/orttraining/training_ops/cuda/nn/conv_shared.h @@ -75,7 +75,7 @@ class AlgoIterator { Status TryAll(const CUDAExecutionProvider* provider, const AllocatorPtr& allocator, std::function f); - static Status OnlyDefaultAlgorithm(const ConvArgs& args, std::vector& perf_results); + static Status OnlyDefaultAlgorithm(const ConvArgs& args, std::vector& perf_results, bool use_tf32); private: const ConvArgs& args_; diff --git a/orttraining/orttraining/training_ops/cuda/nn/conv_transpose_grad.cc b/orttraining/orttraining/training_ops/cuda/nn/conv_transpose_grad.cc index 5f7206fc121ec..d3f5a89434a48 100644 --- a/orttraining/orttraining/training_ops/cuda/nn/conv_transpose_grad.cc +++ b/orttraining/orttraining/training_ops/cuda/nn/conv_transpose_grad.cc @@ -182,7 +182,8 @@ Status ConvTransposeGrad::PrepareConvForwardArgs(const Tensor& X, const Tenso ORT_RETURN_IF_ERROR(args.y_tensor.Set(y_dims, args.params.data_type)); ORT_RETURN_IF_ERROR(args.conv_desc.Set(kernel_shape.size(), pads, strides, dilations, gsl::narrow_cast(conv_attrs_.group), CUDNN_CROSS_CORRELATION, - args.params.data_type)); + args.params.data_type, + UseTF32())); } return Status::OK(); @@ -287,7 +288,8 @@ Status ConvTransposeGrad::PrepareConvBackwardFilterArgs(const Tensor& X, cons ORT_RETURN_IF_ERROR(args.y_tensor.Set(y_dims, args.params.data_type)); ORT_RETURN_IF_ERROR(args.conv_desc.Set(kernel_shape.size(), pads, strides, dilations, gsl::narrow_cast(conv_attrs_.group), CUDNN_CROSS_CORRELATION, - args.params.data_type)); + args.params.data_type, + UseTF32())); if (dB) { const auto& b_shape = dB->Shape(); From ebd220b0730f9898aaa0275ef0d8195ce70057d0 Mon Sep 17 00:00:00 2001 From: Matttttt <18152455+martholomew@users.noreply.github.com> Date: Wed, 21 Feb 2024 21:38:18 +0000 Subject: [PATCH 22/68] Misspelling in README.md (#19433) Fixed a misspelling. --- js/web/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/js/web/README.md b/js/web/README.md index c75a40ad6da28..906c78a1b7ec4 100644 --- a/js/web/README.md +++ b/js/web/README.md @@ -12,7 +12,7 @@ The [Open Neural Network Exchange](http://onnx.ai/) (ONNX) is an open standard f With ONNX Runtime Web, web developers can score models directly on browsers with various benefits including reducing server-client communication and protecting user privacy, as well as offering install-free and cross-platform in-browser ML experience. -ONNX Runtime Web can run on both CPU and GPU. On CPU side, [WebAssembly](https://developer.mozilla.org/en-US/docs/WebAssembly) is adopted to execute the model at near-native speed. ONNX Runtime Web complies the native ONNX Runtime CPU engine into WebAssembly backend by using Emscripten, so it supports most functionalities native ONNX Runtime offers, including full ONNX operator coverage, multi-threading, [ONNX Runtime Quantization](https://www.onnxruntime.ai/docs/how-to/quantization.html) as well as [ONNX Runtime Mobile](https://onnxruntime.ai/docs/tutorials/mobile/). For performance acceleration with GPUs, ONNX Runtime Web leverages WebGL, a popular standard for accessing GPU capabilities. We are keeping improving op coverage and optimizing performance in WebGL backend. +ONNX Runtime Web can run on both CPU and GPU. On CPU side, [WebAssembly](https://developer.mozilla.org/en-US/docs/WebAssembly) is adopted to execute the model at near-native speed. ONNX Runtime Web compiles the native ONNX Runtime CPU engine into WebAssembly backend by using Emscripten, so it supports most functionalities native ONNX Runtime offers, including full ONNX operator coverage, multi-threading, [ONNX Runtime Quantization](https://www.onnxruntime.ai/docs/how-to/quantization.html) as well as [ONNX Runtime Mobile](https://onnxruntime.ai/docs/tutorials/mobile/). For performance acceleration with GPUs, ONNX Runtime Web leverages WebGL, a popular standard for accessing GPU capabilities. We are keeping improving op coverage and optimizing performance in WebGL backend. See [Compatibility](#Compatibility) and [Operators Supported](#Operators) for a list of platforms and operators ONNX Runtime Web currently supports. @@ -22,7 +22,7 @@ Refer to [ONNX Runtime JavaScript examples](https://github.com/microsoft/onnxrun ## Documents -### Developement +### Development Refer to the following links for development information: From 38c34323939bac03b9648b2e59dbbe8de0bd7092 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 21 Feb 2024 13:58:53 -0800 Subject: [PATCH 23/68] Bump ip from 1.1.8 to 1.1.9 in /js/react_native (#19582) Bumps [ip](https://github.com/indutny/node-ip) from 1.1.8 to 1.1.9.
Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=ip&package-manager=npm_and_yarn&previous-version=1.1.8&new-version=1.1.9)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) Dependabot will merge this PR once CI passes on it, as requested by @fs-eire. [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself) You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/microsoft/onnxruntime/network/alerts).
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- js/react_native/yarn.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/js/react_native/yarn.lock b/js/react_native/yarn.lock index 4dca90d7415cf..bbb0c4f3d1e22 100644 --- a/js/react_native/yarn.lock +++ b/js/react_native/yarn.lock @@ -3701,9 +3701,9 @@ invariant@^2.2.4: loose-envify "^1.0.0" ip@^1.1.5: - version "1.1.8" - resolved "https://registry.yarnpkg.com/ip/-/ip-1.1.8.tgz#ae05948f6b075435ed3307acce04629da8cdbf48" - integrity sha512-PuExPYUiu6qMBQb4l06ecm6T6ujzhmh+MeJcW9wa89PoAz5pvd4zPgN5WJV104mb6S2T1AwNIAaB70JNrLQWhg== + version "1.1.9" + resolved "https://registry.yarnpkg.com/ip/-/ip-1.1.9.tgz#8dfbcc99a754d07f425310b86a99546b1151e396" + integrity sha512-cyRxvOEpNHNtchU3Ln9KC/auJgup87llfQpQ+t5ghoC/UhL16SWzbueiCsdTnWmqAWl7LadfuwhlqmtOaqMHdQ== is-absolute@^1.0.0: version "1.0.0" From 5197db19802a39e47d19ac829cd08a94bacbdfbb Mon Sep 17 00:00:00 2001 From: Sheil Kumar Date: Wed, 21 Feb 2024 15:45:44 -0800 Subject: [PATCH 24/68] Diable __cpuid call for ARM64EC (#19592) Diable __cpuid call for ARM64EC Co-authored-by: Sheil Kumar --- winml/lib/Api/HardwareCoreEnumerator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/winml/lib/Api/HardwareCoreEnumerator.cpp b/winml/lib/Api/HardwareCoreEnumerator.cpp index b6b44690f4f6c..d04e276347170 100644 --- a/winml/lib/Api/HardwareCoreEnumerator.cpp +++ b/winml/lib/Api/HardwareCoreEnumerator.cpp @@ -84,7 +84,7 @@ uint32_t HardwareCoreEnumerator::DefaultIntraOpNumThreads() { // # of logical cores = # of P cores x 2 (if hyper threading is enabled) + # of E cores + # of Soc Cores. auto cores = GetNumberOPhysicalAndEngineeringCores(); -#if !defined(_M_ARM64) && !defined(__aarch64__) +#if !defined(_M_ARM64EC) && !defined(_M_ARM64) && !defined(__aarch64__) const int kVendorID_Intel[3] = {0x756e6547, 0x6c65746e, 0x49656e69}; // "GenuntelineI" int regs_leaf0[4]; int regs_leaf7[4]; From 3d88487c96bf467c4b83dff179c9e282602e2d64 Mon Sep 17 00:00:00 2001 From: Vincent Wang Date: Thu, 22 Feb 2024 10:35:26 +0800 Subject: [PATCH 25/68] Minor Triton Fix (#19589) Including removing a unnecessary assert, and add support of passing string attribute from ONNX node attribute to python functoin kwargs (mainly for passing debug info from graph to python for now). --- .../orttraining/core/framework/triton/triton_op_executor.cc | 2 ++ orttraining/orttraining/python/training/ort_triton/_utils.py | 3 ++- orttraining/orttraining/training_ops/cpu/triton/triton_op.h | 5 ++++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/orttraining/orttraining/core/framework/triton/triton_op_executor.cc b/orttraining/orttraining/core/framework/triton/triton_op_executor.cc index 092ab89d5d760..f30d6ddee253a 100644 --- a/orttraining/orttraining/core/framework/triton/triton_op_executor.cc +++ b/orttraining/orttraining/core/framework/triton/triton_op_executor.cc @@ -106,6 +106,8 @@ void TritonOpExecutor::ExecuteByFuncName(const std::string& func_name, const Inl PyDict_SetItemString(python_kwargs.get(), kv.first.c_str(), PyLong_FromLongLong(std::stoll(kv.second.first))); } else if (kv.second.second == ONNX_NAMESPACE::TensorProto_DataType_FLOAT) { PyDict_SetItemString(python_kwargs.get(), kv.first.c_str(), PyFloat_FromDouble(std::stod(kv.second.first))); + } else if (kv.second.second == ONNX_NAMESPACE::TensorProto_DataType_STRING) { + PyDict_SetItemString(python_kwargs.get(), kv.first.c_str(), PyUnicode_FromString(kv.second.first.c_str())); } else { ORT_THROW("Unsupported kwargs data type: ", kv.second.second); } diff --git a/orttraining/orttraining/python/training/ort_triton/_utils.py b/orttraining/orttraining/python/training/ort_triton/_utils.py index 95e6703be8783..877eacc0b775f 100644 --- a/orttraining/orttraining/python/training/ort_triton/_utils.py +++ b/orttraining/orttraining/python/training/ort_triton/_utils.py @@ -141,13 +141,14 @@ def get_reduce_info(node: NodeProto, graph: GraphProto, input_rank: int) -> Tupl def next_power_of_2(n: int) -> int: - assert n <= 2**32, "32-bit only" + """Return the smallest power of 2 greater than or equal to n""" n -= 1 n |= n >> 1 n |= n >> 2 n |= n >> 4 n |= n >> 8 n |= n >> 16 + n |= n >> 32 n += 1 return n diff --git a/orttraining/orttraining/training_ops/cpu/triton/triton_op.h b/orttraining/orttraining/training_ops/cpu/triton/triton_op.h index f226db76f7ed7..db8e8558ab884 100644 --- a/orttraining/orttraining/training_ops/cpu/triton/triton_op.h +++ b/orttraining/orttraining/training_ops/cpu/triton/triton_op.h @@ -25,12 +25,15 @@ class TritonOp final : public OpKernel { attr.first == "onnx_string") { continue; } - // Support int64 and float only for now, skip other types. + // Support int64, float and string only for now, skip other types. if (attr.second.type() == ONNX_NAMESPACE::AttributeProto::AttributeType::AttributeProto_AttributeType_INT) { kwargs_.insert({attr.first, {std::to_string(attr.second.i()), ONNX_NAMESPACE::TensorProto_DataType_INT64}}); } else if (attr.second.type() == ONNX_NAMESPACE::AttributeProto::AttributeType::AttributeProto_AttributeType_FLOAT) { kwargs_.insert({attr.first, {std::to_string(attr.second.f()), ONNX_NAMESPACE::TensorProto_DataType_FLOAT}}); + } else if (attr.second.type() == + ONNX_NAMESPACE::AttributeProto::AttributeType::AttributeProto_AttributeType_STRING) { + kwargs_.insert({attr.first, {attr.second.s(), ONNX_NAMESPACE::TensorProto_DataType_STRING}}); } } } From 8354329086ebb190db9ea0cb6a3fa72f53f8f881 Mon Sep 17 00:00:00 2001 From: PeixuanZuo <94887879+PeixuanZuo@users.noreply.github.com> Date: Thu, 22 Feb 2024 13:34:45 +0800 Subject: [PATCH 26/68] [ROCm] SkipGroupNorm triton (#19408) Change GroupNorm triton to support SkipGroupNorm --- .../rocm/diffusion/group_norm_triton.cuh | 23 ++++++++--- .../rocm/diffusion/group_norm_triton.py | 39 +++++++++++++++++-- .../kernel_explorer/kernels/groupnorm_test.py | 12 ++++++ 3 files changed, 64 insertions(+), 10 deletions(-) diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.cuh b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.cuh index b3d3e92209b39..c6ca16bfdfc80 100644 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.cuh +++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.cuh @@ -46,8 +46,6 @@ auto GetTritonGroupNormNHWCTypeStringAndOps() { auto block_size = metadata->constants.at("BLOCK_SIZE"); auto hw_size = metadata->constants.at("HW_SIZE"); auto impl = [i, block_size, hw_size](const GroupNormNHWCTunableParams* params) -> Status { - TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF((params->skip != nullptr || params->bias != nullptr), - "Input skip or bias is not supported by triton kernel."); TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( params->channels_per_group > block_size || params->channels_per_group * 2 <= block_size, "Arg block_size (", block_size, ") is not the next power of 2 of channels_per_group (", @@ -61,23 +59,36 @@ auto GetTritonGroupNormNHWCTypeStringAndOps() { } // Construct args for launch kernel struct { - void* X; - void* Y; + const void* src; + const void* skip; + const void* bias; + void* out; + void* add_out; const void* gamma; const void* beta; int hw; int c; int c_per_group; float eps; + bool has_skip; + bool has_bias; + bool broadcast_skip; } args = { - (void*)params->src, + (const void*)params->src, + (const void*)params->skip, + (const void*)params->bias, (void*)params->dst, + (void*)params->skip_workspace, (const void*)params->gamma, (const void*)params->beta, params->hw, params->c, params->channels_per_group, - params->epsilon}; + params->epsilon, + params->skip != nullptr, + params->bias != nullptr, + params->broadcast_skip, + }; // Grid dim is (batch_count, groups, 1) return LaunchTritonKernel(params->StreamHandle(), i, params->n, params->groups, 1, &args, sizeof(args)); diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.py b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.py index 5368cb1cf635b..5ba96ebc117f0 100644 --- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.py +++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.py @@ -12,13 +12,19 @@ @triton.jit def group_norm_kernel( input_ptr, + skip_ptr, + bias_ptr, output_ptr, + add_out_ptr, gamma_ptr, beta_ptr, img_size, c, c_per_group, eps, + has_skip, + has_bias, + broadcast_skip, BLOCK_SIZE: tl.constexpr, HW_SIZE: tl.constexpr, ACTIVATION_SILU: tl.constexpr, @@ -36,14 +42,35 @@ def group_norm_kernel( offsets = hw[:, None] * c + cols[None, :] mask = (cols < c_per_group)[None, :] + bias = tl.zeros([BLOCK_SIZE], dtype=tl.float32) + if has_skip: + add_out_ptr += row_x * stride + row_y * c_per_group + if broadcast_skip: + broadcast_skip_ptr = skip_ptr + row_x * c + row_y * c_per_group + bias += tl.load(broadcast_skip_ptr + cols, mask=cols < c_per_group, other=0.0).to(tl.float32) + else: + skip_ptr += row_x * stride + row_y * c_per_group + if has_bias: + bias_ptr += row_y * c_per_group + bias += tl.load(bias_ptr + cols, mask=cols < c_per_group, other=0.0).to(tl.float32) + # Calculate mean and variance _sum = tl.zeros([HW_SIZE, BLOCK_SIZE], dtype=tl.float32) _square_sum = tl.zeros([HW_SIZE, BLOCK_SIZE], dtype=tl.float32) for i in range(tl.cdiv(img_size, HW_SIZE)): x_ptr = input_ptr + i * HW_SIZE * c a = tl.load(x_ptr + offsets, mask=mask, other=0.0).to(tl.float32) + if has_skip and not broadcast_skip: + s_ptr = skip_ptr + i * HW_SIZE * c + s = tl.load(s_ptr + offsets, mask=mask, other=0.0).to(tl.float32) + a += s + if has_bias or broadcast_skip: + a += bias _sum += a _square_sum += a * a + if has_skip: + add_y_ptr = add_out_ptr + i * HW_SIZE * c + tl.store(add_y_ptr + offsets, a, mask=mask) # Set axis=None (or leave it unspecified) to reduce all axes. # TODO: In older Triton we have to reduce an axis at a time, but in our case @@ -57,9 +84,13 @@ def group_norm_kernel( gamma = tl.load(gamma_ptr + cols, mask=cols < c_per_group).to(tl.float32) beta = tl.load(beta_ptr + cols, mask=cols < c_per_group).to(tl.float32) for i in range(tl.cdiv(img_size, HW_SIZE)): - x_ptr = input_ptr + i * HW_SIZE * c y_ptr = output_ptr + i * HW_SIZE * c - x = tl.load(x_ptr + offsets, mask=mask, other=0.0).to(tl.float32) + if has_skip: + add_y_ptr = add_out_ptr + i * HW_SIZE * c + x = tl.load(add_y_ptr + offsets, mask=mask, other=0.0).to(tl.float32) + else: + x_ptr = input_ptr + i * HW_SIZE * c + x = tl.load(x_ptr + offsets, mask=mask, other=0.0).to(tl.float32) x_hat = (x - group_mean) * rstd y = x_hat * gamma + beta if ACTIVATION_SILU: @@ -77,7 +108,7 @@ def group_norm_kernel( hw_sizes = [8, 16, 32, 64, 128, 256] warps = [1, 2, 4, 8, 16] name_pattern = "GroupNormTriton_{}_{}_b{}_hw{}_w{}" -sig_pattern = "*{},*{},*fp32,*fp32,i32,i32,i32,fp32" +sig_pattern = "*{},*{},*{},*{},*{},*fp32,*fp32,i32,i32,i32,fp32,i1,i1,i1" group_pattern = "GroupNormTriton_{}_{}" @@ -88,7 +119,7 @@ def get_function_table(): silu_suffix = "Silu" if silu else "Pass" name = name_pattern.format(silu_suffix, dtype, b, hw_size, warp) group = group_pattern.format(silu_suffix, dtype) - sig = sig_pattern.format(dtype, dtype) + sig = sig_pattern.format(dtype, dtype, dtype, dtype, dtype) kwargs = { "num_warps": warp, "constants": {"BLOCK_SIZE": b, "HW_SIZE": hw_size, "ACTIVATION_SILU": int(silu)}, diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/groupnorm_test.py b/onnxruntime/python/tools/kernel_explorer/kernels/groupnorm_test.py index 8334d20e47c86..400a9d8a7a187 100644 --- a/onnxruntime/python/tools/kernel_explorer/kernels/groupnorm_test.py +++ b/onnxruntime/python/tools/kernel_explorer/kernels/groupnorm_test.py @@ -80,6 +80,18 @@ def run_group_norm( ) use_silu = silu broadcast_skip = False + if has_skip: + skip_x_shape = skip_x.shape + b2 = len(skip_x_shape) == 2 and skip_x_shape[0] == batch_size and skip_x_shape[1] == num_channels + b4 = ( + len(skip_x_shape) == 4 + and skip_x_shape[0] == batch_size + and skip_x_shape[1] == 1 + and skip_x_shape[2] == 1 + and skip_x_shape[3] == num_channels + ) + if b2 or b4: + broadcast_skip = True channels_per_block = 0 # Compute in params initialization input_d = ke.DeviceArray(input_x.astype(dtype)) From 05ed89f46980b7e5a5328bc20af8b32ca9f1f715 Mon Sep 17 00:00:00 2001 From: PeixuanZuo <94887879+PeixuanZuo@users.noreply.github.com> Date: Thu, 22 Feb 2024 13:34:55 +0800 Subject: [PATCH 27/68] [ROCm] Add excluded libs for ROCm python package (#19586) The rocm lib version has changed in rocm 6.0 Using libs packaged in whl might cause errors. For example, `libamdhip64.so.6` packaged in whl will cause compute error when training gpt2 model. The root cause still in investigating. --- setup.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/setup.py b/setup.py index 03e1cb75ba581..9a5fc29dd5e02 100644 --- a/setup.py +++ b/setup.py @@ -205,18 +205,23 @@ def run(self): rocm_dependencies = [ "libamd_comgr.so.2", "libamdhip64.so.5", + "libamdhip64.so.6", "libdrm.so.2", "libdrm_amdgpu.so.1", "libelf.so.1", "libhipfft.so.0", "libhiprtc.so.5", + "libhiprtc.so.6", "libhsa-runtime64.so.1", "libMIOpen.so.1", "libnuma.so.1", "librccl.so.1", "librocblas.so.3", + "librocblas.so.4", "librocfft.so.0", + "libroctx64.so.4", "librocm_smi64.so.5", + "librocm_smi64.so.6", "libroctracer64.so.4", "libtinfo.so.6", "libmigraphx_c.so.3", From 6b73ab3e3e72a9f2008e8d0e221b0be77d2993b1 Mon Sep 17 00:00:00 2001 From: cao lei Date: Thu, 22 Feb 2024 10:19:08 -0800 Subject: [PATCH 28/68] Introduce reused_buffer_index_per_stream in allocation planner which will be reset after computing the reuse buffer for each stream (#19515) ### Description Introduce reused_buffer_index_per_stream in allocation planner which will be reset after computing the reuse buffer for each stream. So if a NodeArg is an input of several Ops across different streams and reuses other NodeArg, the reused NodeArg won't be involved when computing the second stream's reuse plan. ### Motivation and Context This is to fix https://github.com/microsoft/onnxruntime/issues/19480, which is a crash for the scenario mentioned above. --------- Co-authored-by: Lei Cao --- .../core/framework/allocation_planner.cc | 44 ++++++------ .../test/framework/allocation_planner_test.cc | 68 ++++++++++++++++++ .../multi_stream_models/issue_19480.onnx | Bin 0 -> 760 bytes 3 files changed, 91 insertions(+), 21 deletions(-) create mode 100644 onnxruntime/test/testdata/multi_stream_models/issue_19480.onnx diff --git a/onnxruntime/core/framework/allocation_planner.cc b/onnxruntime/core/framework/allocation_planner.cc index ea7a6432a7507..158ab8ed610f4 100644 --- a/onnxruntime/core/framework/allocation_planner.cc +++ b/onnxruntime/core/framework/allocation_planner.cc @@ -182,7 +182,6 @@ class PlannerImpl { // upstream_node_0 and upstream_node_1 are the immmediate upstream nodes of downstream_node // upstream_node_2 is the immediate nodes ahead of downstream_node in the same logic stream InlinedHashMap> dependence_graph_; - InlinedHashMap> value_consumer_map_; InlinedHashMap value_node_map_; // OrtValueInfo: Auxiliary information about an OrtValue used only during plan-generation: @@ -295,7 +294,7 @@ class PlannerImpl { } #endif - // Find if there exists some input tensor that we can use in-place for output_arg_num-th input in the node. + // Find if there exists some input tensor that we can use in-place for output_arg_num-th output in the node. bool FindReusableInput(const onnxruntime::Node& node, int output_arg_num, OrtValueIndex* reusable_input, bool* is_strided_tensor) { *is_strided_tensor = false; @@ -530,6 +529,7 @@ class PlannerImpl { // Initialize allocation plan: plan_.allocation_plan.resize(num_ml_values); + for (int i = 0; static_cast(i) < num_ml_values; i++) AllocPlan(i).reused_buffer = i; } bool HasExternalOutputs(const Node& node) const { @@ -1065,7 +1065,8 @@ class PlannerImpl { // build the consumer list for each value int num_ml_values = ort_value_name_idx_map_.MaxIdx() + 1; - value_consumer_map_.reserve(num_ml_values); + InlinedHashMap> value_consumer_map; + value_consumer_map.reserve(num_ml_values); // iterate each stream from back, so the first element is the last consumer in single stream case for (auto& stream : stream_nodes_) { @@ -1078,10 +1079,10 @@ class PlannerImpl { const auto& name = input.Name(); int value_idx; ORT_RETURN_IF_ERROR(ort_value_name_idx_map_.GetIdx(name, value_idx)); - auto origin = Buffer(value_idx); - if (origin != -1 && plan_.allocation_plan[origin].alloc_kind == AllocKind::kAllocate) { + auto origin = AllocPlan(value_idx).reused_buffer; + if (AllocPlan(origin).alloc_kind == AllocKind::kAllocate) { // add current node as consumer for origin buffer - value_consumer_map_[origin].insert(node_index); + value_consumer_map[origin].insert(node_index); } } return Status::OK(); @@ -1138,8 +1139,8 @@ class PlannerImpl { std::cout << p_input_arg->Name() << " reused by " << p_output_arg->Name() << " as input" << std::endl; allocation_plan[output_idx_global].alloc_kind = AllocKind::kReuse; allocation_plan[output_idx_global].reused_buffer = reusable_input; - value_consumer_map_[reusable_input].insert(value_consumer_map_[output_idx_global].begin(), - value_consumer_map_[output_idx_global].end()); + value_consumer_map[reusable_input].insert(value_consumer_map[output_idx_global].begin(), + value_consumer_map[output_idx_global].end()); reused.insert(reusable_input); found_reusable = true; break; @@ -1168,8 +1169,8 @@ class PlannerImpl { allocation_plan[reusable_input].alloc_kind == AllocKind::kAllocate) { allocation_plan[output_idx_global].alloc_kind = AllocKind::kReuse; allocation_plan[output_idx_global].reused_buffer = reusable_input; - value_consumer_map_[reusable_input].insert(value_consumer_map_[output_idx_global].begin(), - value_consumer_map_[output_idx_global].end()); + value_consumer_map[reusable_input].insert(value_consumer_map[output_idx_global].begin(), + value_consumer_map[output_idx_global].end()); reused.insert(reusable_input); continue; } // if @@ -1187,11 +1188,11 @@ class PlannerImpl { OrtValueIndex input_arg_index{}; if (value_map.GetIdx(p_input_arg->Name(), input_arg_index).IsOK() && allocation_plan[input_arg_index].alloc_kind == AllocKind::kAllocate) { - if (value_consumer_map_[input_arg_index].size() == 1 && SameSize(*p_input_arg, *p_output_arg)) { + if (value_consumer_map[input_arg_index].size() == 1 && SameSize(*p_input_arg, *p_output_arg)) { allocation_plan[output_idx_global].alloc_kind = AllocKind::kReuse; allocation_plan[output_idx_global].reused_buffer = input_arg_index; - value_consumer_map_[input_arg_index].insert(value_consumer_map_[output_idx_global].begin(), - value_consumer_map_[output_idx_global].end()); + value_consumer_map[input_arg_index].insert(value_consumer_map[output_idx_global].begin(), + value_consumer_map[output_idx_global].end()); reused.insert(input_arg_index); } } @@ -1266,7 +1267,7 @@ class PlannerImpl { } bool all_covered = true; - for (auto consumer : value_consumer_map_[output_idx_global]) { + for (auto consumer : value_consumer_map[output_idx_global]) { if (deps->find(consumer) == deps->end()) { all_covered = false; break; @@ -1277,9 +1278,9 @@ class PlannerImpl { allocation_plan[downstream_value].reused_buffer = output_idx_global; get_reused = true; // add new consumer for the value to be reused - value_consumer_map_[output_idx_global].insert(value_node_map_[downstream_value]); - value_consumer_map_[output_idx_global].insert(value_consumer_map_[downstream_value].begin(), - value_consumer_map_[downstream_value].end()); + value_consumer_map[output_idx_global].insert(value_node_map_[downstream_value]); + value_consumer_map[output_idx_global].insert(value_consumer_map[downstream_value].begin(), + value_consumer_map[downstream_value].end()); node_iter = size_iter->second.erase(node_iter); if (size_iter->second.empty()) { local_iter->second.erase(size_iter); @@ -1342,8 +1343,9 @@ class PlannerImpl { ort_value_usecount.reserve(ort_value_info_.size()); #endif for (size_t i = 0; i < stream_nodes_.size(); ++i) { - // compute use count first + // compute use count first. TODO(leca): call ComputeReuseCount() only once is enough! ORT_RETURN_IF_ERROR(ComputeReuseCount()); + for (int j = 0; static_cast(j) < ort_value_info_.size(); j++) Buffer(j) = j; #if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE) if (i == 0) { for (auto ort_value_info : ort_value_info_) { @@ -1693,8 +1695,8 @@ class PlannerImpl { const auto& name = input.Name(); int value_idx; ORT_RETURN_IF_ERROR(ort_value_name_idx_map_.GetIdx(name, value_idx)); - auto origin = Buffer(value_idx); - if (origin != -1 && plan_.allocation_plan[origin].alloc_kind == AllocKind::kAllocate) { + auto origin = AllocPlan(value_idx).reused_buffer; + if (AllocPlan(origin).alloc_kind == AllocKind::kAllocate) { // add current node as consumer for origin buffer value_consumers[origin].push_back(node_index); } @@ -1889,7 +1891,7 @@ class PlannerImpl { // 2. the consumer is in the same stream(non-cpu device), but it consumes a CPU tensor from an non-shape op. // for example, a resize cuda kernel consumer a tensor from MemCpyToHost cuda kernel on the same stream. // in this case, the FIFO can't guarantee the cpu tensor is ready when resize kernel is launching - OrtDevice::DeviceType output_arg_device = plan_.allocation_plan[output_arg_idx].location.Type(); + OrtDevice::DeviceType output_arg_device = AllocPlan(output_arg_idx).location.Type(); WaitNotificationFn wait_handle = stream_handle_registry.GetWaitHandle(stream_device, output_arg_device); if ((node_stream_map_[it->Index()] != i || output_arg_device == OrtDevice::CPU) && wait_handle != nullptr) { if (node_to_notification.find(node_index) == node_to_notification.end()) { diff --git a/onnxruntime/test/framework/allocation_planner_test.cc b/onnxruntime/test/framework/allocation_planner_test.cc index d7b1de5c930c5..3e0d94e94e48c 100644 --- a/onnxruntime/test/framework/allocation_planner_test.cc +++ b/onnxruntime/test/framework/allocation_planner_test.cc @@ -1974,6 +1974,74 @@ TEST_F(PlannerTest, TestCpuIf) { ASSERT_TRUE(exe_plan[1]->steps_[6]->ToString().substr(0, WaitOnEPStep.size()) == WaitOnEPStep); } } + +// model looks like: +// |-----------> Gather +// |-----------> Gather +// |-----------> Gather +// |-----------> Gather +// Shape ----------------> Reshape --> Shape ------------------> Reshape +// ^ ^ +// InstanceNormalization ----| InstanceNormalization ------| +// +// Python script to create this model: +// def CreateModelFor19480(): +// #shape->reshape->shape->reshape, 4 gather +// graphNodes = [] +// graphNodes.append(h.make_node('Shape', inputs=['shape_input'], outputs=['9'])) +// graphNodes.append(h.make_node('InstanceNormalization', inputs=['in0_input', 'scale0', 'B0'], outputs=['8'])) +// graphNodes.append(h.make_node('Reshape', inputs=['8', '9'], outputs=['Reshape15_output'])) +// graphNodes.append(h.make_node('Shape', inputs=['Reshape15_output'], outputs=['281'])) +// graphNodes.append(h.make_node('InstanceNormalization', inputs=['in1_input', 'scale1', 'B1'], outputs=['293'])) +// graphNodes.append(h.make_node('Reshape', inputs=['293', '281'], outputs=['output0'])) +// graphNodes.append(h.make_node('Gather', inputs=['281', 'indices1'], outputs=['output1'])) +// graphNodes.append(h.make_node('Gather', inputs=['281', 'indices2'], outputs=['output2'])) +// graphNodes.append(h.make_node('Gather', inputs=['281', 'indices3'], outputs=['output3'])) +// graphNodes.append(h.make_node('Gather', inputs=['281', 'indices4'], outputs=['output4'])) +// g = h.make_graph(graphNodes, 'issue_19480', +// [h.make_tensor_value_info('shape_input', tp.FLOAT, ['batch', 128, None, None]), +// h.make_tensor_value_info('in0_input', tp.FLOAT, ['batch', 32, None]), +// h.make_tensor_value_info('scale0', tp.FLOAT, [32]), +// h.make_tensor_value_info('B0', tp.FLOAT, [32]), +// h.make_tensor_value_info('in1_input', tp.FLOAT, ['batch', 32, None]), +// h.make_tensor_value_info('scale1', tp.FLOAT, [32]), +// h.make_tensor_value_info('B1', tp.FLOAT, [32]), +// h.make_tensor_value_info('indices1', tp.INT32, []), +// h.make_tensor_value_info('indices2', tp.INT32, []), +// h.make_tensor_value_info('indices3', tp.INT32, []), +// h.make_tensor_value_info('indices4', tp.INT32, [])], +// [h.make_tensor_value_info('output0', tp.FLOAT, None), +// h.make_tensor_value_info('output1', tp.INT64, None), +// h.make_tensor_value_info('output2', tp.INT64, None), +// h.make_tensor_value_info('output3', tp.INT64, None), +// h.make_tensor_value_info('output4', tp.INT64, None)]) +// model = h.make_model(g, opset_imports=[h.make_operatorsetid("", 17)], producer_name='producer_name') +// onnx.save(model, 'issue_19480.onnx') +// +TEST(AllocationPlannerTest, ReusedInputCrossDifferentStreams) { + SessionOptions sess_opt; + sess_opt.graph_optimization_level = TransformerLevel::Default; + + InferenceSession sess(sess_opt, GetEnvironment(), ORT_TSTR("./testdata/multi_stream_models/issue_19480.onnx")); + auto status = sess.RegisterExecutionProvider(DefaultCudaExecutionProvider()); + status = sess.Load(); + status = sess.Initialize(); + ASSERT_TRUE(status.IsOK()) << "No crash"; + const SequentialExecutionPlan* plan = sess.GetSessionState().GetExecutionPlan(); + ASSERT_EQ(plan->allocation_plan[14].alloc_kind, AllocKind::kReuse) << "The input of reshape and gather will reuse the output of shape"; + + int gather_count = 0; + for (size_t i = 0; i < plan->execution_plan[1]->steps_.size(); i++) { + if (strstr(typeid(*(plan->execution_plan[1]->steps_[i])).name(), "LaunchKernelStep")) { + const Node* node = sess.GetSessionState().GetGraphViewer().GetNode(plan->execution_plan[1]->steps_[i]->GetNodeIndex()); + if (node->OpType() == "Gather") + gather_count++; + else + FAIL() << "CPU stream should contain only gather ops"; + } + } + ASSERT_EQ(gather_count, 4) << "4 gather ops are all placed in CPU stream"; +} #endif } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/testdata/multi_stream_models/issue_19480.onnx b/onnxruntime/test/testdata/multi_stream_models/issue_19480.onnx new file mode 100644 index 0000000000000000000000000000000000000000..dc7d39206dd49f4ef6daf65b7d58c5b456ecf331 GIT binary patch literal 760 zcmaixKTm@|7>Bw3f%9#m_0_6_F_p#1gab^#v5RqW(2b?J(o1>?g{IKO$uH`6@t{2^ z#m0q%=Y9D7j(aJ^(>&%f;j=_M&c!l&{_evy4DtnEiK$Fin*vE__dm*aU~nQ+XN$p9 zA11aA3GP#64zs z+VGAUzBYVq;6Ud2Mod}g2Tt_Ry!IQoq685v?9X@+FQ7}m2pC{Q_TCzB1Q$v>tF;at zE9X-02LY%OdZ2hTthTjJs;u4R{+GpCSxtg_7ivO}nrK8dbFt05KbWuCO#ReubJg)l W4Oj)N8n}nRI|Tj~OnP7p&wl_GGP8{U literal 0 HcmV?d00001 From 3bdb10d5ca4f258ec444863bcd5e839eeac5c238 Mon Sep 17 00:00:00 2001 From: jingyanwangms <47403504+jingyanwangms@users.noreply.github.com> Date: Thu, 22 Feb 2024 10:56:25 -0800 Subject: [PATCH 29/68] Move import to when needed to avoid circular dependency error (#19579) ### Description Move import to when needed to avoid circular dependency error ### Motivation and Context Fixes dependency error described here: https://github.com/microsoft/DeepSpeed/issues/5140 --------- Co-authored-by: Thiago Crepaldi --- .../python/training/ortmodule/_graph_execution_manager.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py index 779b6bfe50422..fda6e345da235 100755 --- a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py @@ -20,7 +20,6 @@ from onnxruntime.capi import _pybind_state as C from onnxruntime.tools.symbolic_shape_infer import SymbolicShapeInference from onnxruntime.training.utils import ORTModelInputOutputSchemaType, PTable, onnx_dtype_to_pytorch_dtype -from onnxruntime.training.utils.hooks import configure_ort_compatible_zero_stage3 from . import _are_deterministic_algorithms_enabled, _io, _logger, _onnx_models, _utils from ._fallback import ( @@ -143,6 +142,9 @@ def __init__( self._zero_stage3_param_map = {} if self._runtime_options.enable_zero_stage3_support: + # Move import to here to avoid circular dependency error + from onnxruntime.training.utils.hooks import configure_ort_compatible_zero_stage3 # type: ignore[import] + # Cannot toggle feature enabling/disabling after the first time enabled. configure_ort_compatible_zero_stage3(debug=False, stats_output_dir="ort_output", stats_overwrite=True) From fe82fccf1a4d7ea6c24c8448d7264df36605c370 Mon Sep 17 00:00:00 2001 From: Xu Xing Date: Fri, 23 Feb 2024 05:09:28 +0800 Subject: [PATCH 30/68] [js/webgpu] Fix Conv2DTransposeMatMul f16 compilation failure (#19596) This is used in sam-h-decoder-f16. ### Description ### Motivation and Context --- .../ops/3rd-party/conv_backprop_mm_webgpu.ts | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_mm_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_mm_webgpu.ts index b5b6a2a15cd8c..11c8778b72335 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_mm_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_mm_webgpu.ts @@ -23,17 +23,17 @@ import {DataType} from '../../../../wasm-common'; import {LOG_DEBUG} from '../../../log'; import {TensorView} from '../../../tensor-view'; import {ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../../types'; -import {createTensorShapeVariables, inputVariable, outputVariable, ShaderHelper, UniformsArrayType} from '../common'; +import {createTensorShapeVariables, inputVariable, outputVariable, ShaderHelper, tensorTypeToWsglStorageType, UniformsArrayType} from '../common'; import {ConvTransposeAttributes} from '../conv-transpose'; import {appendActivationUniforms, appendActivationUniformsData, getActivationSnippet} from '../fuse-utils'; -import {biasSnippet, typeSnippet} from './activation_util'; +import {biasSnippet} from './activation_util'; import {utilFunctions} from './conv_util'; import {makeMatMulPackedSource, makeMatMulPackedVec4Source} from './matmul_packed_webgpu'; const conv2dTransposeCommonSnippet = - (isChannelsLast: boolean, addBias = false, attributes: ConvTransposeAttributes, innerElementSize = 4): string => { - const type = typeSnippet(innerElementSize, 'f32'); + (isChannelsLast: boolean, addBias = false, attributes: ConvTransposeAttributes, type: string, + innerElementSize = 4): string => { const getWSnippet = (innerElementSize: number) => { switch (innerElementSize) { case 1: @@ -47,7 +47,7 @@ const conv2dTransposeCommonSnippet = let v1 = w[getIndexFromCoords4D(coord1, vec4(uniforms.w_shape))]; let v2 = w[getIndexFromCoords4D(coord2, vec4(uniforms.w_shape))]; let v3 = w[getIndexFromCoords4D(coord3, vec4(uniforms.w_shape))]; - return vec4(v0, v1, v2, v3); + return ${type}(v0, v1, v2, v3); `; default: throw new Error(`innerElementSize ${innerElementSize} is not supported.`); @@ -224,7 +224,7 @@ export const createConv2DTransposeMatMulProgramInfo = const bias = inputVariable('bias', inputs[2].dataType, inputs[2].dims.length, components); inputVariables.push(bias); declareFunctions += ` - fn getBiasByOutputCoords(coords : vec4) -> ${isVec4 ? 'vec4' : 'f32'} { + fn getBiasByOutputCoords(coords : vec4) -> ${bias.type.value} { return bias[coords.${isChannelsLast ? 'w' : 'y'}${isVec4 ? '/ 4' : ''}]; }`; } @@ -236,16 +236,20 @@ export const createConv2DTransposeMatMulProgramInfo = {name: 'pads', type: 'i32', length: pads.length} ]; appendActivationUniforms(attributes, uniforms); + const elemType = tensorTypeToWsglStorageType(inputs[0].dataType, 1); + if (elemType !== 'f16' && elemType !== 'f32') { + throw new Error(`elemType ${elemType} is not supported.`); + } return ` ${utilFunctions('uniforms.result_strides')} ${shaderHelper.registerUniforms(uniforms).declareVariables(...inputVariables, output)}; ${declareFunctions} - ${conv2dTransposeCommonSnippet(isChannelsLast, hasBias, attributes, innerElementSize)} + ${conv2dTransposeCommonSnippet(isChannelsLast, hasBias, attributes, x.type.value, innerElementSize)} ${ isVec4 ? makeMatMulPackedVec4Source( - elementsPerThread, workGroupSize, 'f32', undefined, !isChannelsLast, tileInner) : + elementsPerThread, workGroupSize, elemType, undefined, !isChannelsLast, tileInner) : makeMatMulPackedSource( - elementsPerThread, workGroupSize, 'f32', undefined, !isChannelsLast, tileInner, false, + elementsPerThread, workGroupSize, elemType, undefined, !isChannelsLast, tileInner, false, undefined, sequentialAccessByThreads)}`; }; From 09622418c45b265977a8f1f17581e15719357423 Mon Sep 17 00:00:00 2001 From: Hector Li Date: Thu, 22 Feb 2024 13:15:13 -0800 Subject: [PATCH 31/68] Add special handling if there is only 1 graph inside the cached QNN context binary (#19594) Add special handling if there is only 1 graph inside the cached QNN context binary. No need to make the EPContext node name match the QNN graph name. This is for better backward compatibility in case the QNN context model is generated before the PR for QNN context binary model support multi-partition. --- .../qnn/builder/onnx_ctx_model_helper.cc | 6 +- .../qnn/builder/onnx_ctx_model_helper.h | 3 +- .../qnn/builder/qnn_backend_manager.cc | 15 ++-- .../providers/qnn/qnn_execution_provider.cc | 3 +- .../test/providers/qnn/qnn_ep_context_test.cc | 83 ++++++++++++++++++- 5 files changed, 99 insertions(+), 11 deletions(-) diff --git a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc index c2e71081b898e..2d8ec295d613b 100644 --- a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc +++ b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc @@ -151,12 +151,14 @@ Status GetEpContextFromMainNode(const onnxruntime::Node& main_context_node, Status LoadQnnCtxFromOnnxGraph(const onnxruntime::GraphViewer& graph_viewer, const onnxruntime::PathString& ctx_onnx_model_path, QnnBackendManager* qnn_backend_manager, - std::unordered_map>& qnn_models) { + std::unordered_map>& qnn_models, + const logging::Logger& logger) { Status status = GetEpContextFromMainNode(*graph_viewer.Nodes().begin(), ctx_onnx_model_path, qnn_backend_manager, qnn_models); // This is the protocol with customer that status with INVALID_GRAPH will be generated if failed to load context model if (!status.IsOK()) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_GRAPH, "Failed to load from EpContextModel. ", status.ErrorMessage()); + LOGS(logger, ERROR) << "Failed to load from EpContext model. " << status.ErrorMessage(); + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_GRAPH, "Failed to load from EpContext model. ", status.ErrorMessage()); } return Status::OK(); diff --git a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h index b1360b4e576fa..7d56b45a1dbcd 100644 --- a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h +++ b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h @@ -56,7 +56,8 @@ Status GetEpContextFromMainNode(const onnxruntime::Node& main_context_node, Status LoadQnnCtxFromOnnxGraph(const onnxruntime::GraphViewer& graph_viewer, const onnxruntime::PathString& ctx_onnx_model_path, QnnBackendManager* qnn_backend_manager, - std::unordered_map>& qnn_models); + std::unordered_map>& qnn_models, + const logging::Logger& logger); Status CreateEPContextNodes(Model* model, unsigned char* buffer, diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc index 5f0b87c7cb9d7..ca34a1efa6ca7 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc @@ -573,11 +573,16 @@ Status QnnBackendManager::LoadCachedQnnContextFromBuffer(char* buffer, uint64_t // More work to support multiple partition, how to map the graph name in compile to qnn graph name // Need the lower level framework to understand EPContext op and pass in the partition_name in fused_node during Compile - for (uint32_t i = 0; i < graph_count; ++i) { - std::string graph_name(graphs_info[i].graphInfoV1.graphName); - auto qnn_model_pos = qnn_models.find(graph_name); - ORT_RETURN_IF(qnn_model_pos == qnn_models.end(), graph_name + " does not match any EPContext node names."); - ORT_RETURN_IF_ERROR(qnn_model_pos->second->DeserializeGraphInfoFromBinaryInfo(graphs_info[i])); + if (1 == graph_count) { + auto qnn_model_pose = qnn_models.begin(); + ORT_RETURN_IF_ERROR(qnn_model_pose->second->DeserializeGraphInfoFromBinaryInfo(graphs_info[0])); + } else { + for (uint32_t i = 0; i < graph_count; ++i) { + std::string graph_name(graphs_info[i].graphInfoV1.graphName); + auto qnn_model_pos = qnn_models.find(graph_name); + ORT_RETURN_IF(qnn_model_pos == qnn_models.end(), graph_name + " does not match any EPContext node names."); + ORT_RETURN_IF_ERROR(qnn_model_pos->second->DeserializeGraphInfoFromBinaryInfo(graphs_info[i])); + } } qnn_sys_interface_.systemContextFree(sys_ctx_handle); diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index f5a166d36b15a..9a6540a3efea5 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -670,7 +670,8 @@ Status QNNExecutionProvider::Compile(const std::vector& fused ORT_RETURN_IF_ERROR(qnn::LoadQnnCtxFromOnnxGraph(main_ctx_graph_viewer, context_cache_path, qnn_backend_manager_.get(), - qnn_models)); + qnn_models, + logger)); for (auto fused_node_and_graph : fused_nodes_and_graphs) { const onnxruntime::GraphViewer& graph_viewer(fused_node_and_graph.filtered_graph); diff --git a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc index b1f3b52e77553..eaef6f6315157 100644 --- a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc +++ b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc @@ -463,7 +463,6 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryCache_InvalidGraph) { InferenceSessionWrapper session_object{so, GetEnvironment()}; - std::string provider_type = kCpuExecutionProvider; ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(QnnExecutionProviderWithOptions(provider_options))); ASSERT_STATUS_OK(session_object.Load(qnn_ctx_model_data.data(), static_cast(qnn_ctx_model_data.size()))); // Verify the return status with code INVALID_GRAPH @@ -486,7 +485,6 @@ std::string CreateQnnCtxModelWithNonEmbedMode(std::string external_bin_path) { auto* graph_output = helper.MakeOutput(shape); Node& ep_context_node = helper.AddNode("EPContext", {graph_input}, {graph_output}, kMSDomain); ep_context_node.AddAttribute("embed_mode", static_cast(0)); - // The .. in the path will cause INVALID_GRAPH ep_context_node.AddAttribute("ep_cache_context", external_bin_path); ep_context_node.AddAttribute("partition_name", "QNNExecutionProvider_QNN_1110111000111000111_1_0"); ep_context_node.AddAttribute("source", "QNN"); @@ -651,6 +649,87 @@ TEST_F(QnnHTPBackendTests, QnnContextBinary2InputsTest) { ASSERT_EQ(std::remove(context_binary_file.c_str()), 0); } +// Context binary only contains a single QNN graph, generated context cache model (detached mode) only has 1 EPContext node +// Create another Onnx model which also reference to the bin file, +// but the node name is not same with the QNN graph name inside the bin file. +// This is to support backward compitable for the models generated before the PR that +// make context generation support multi-partition +TEST_F(QnnHTPBackendTests, QnnContextBinaryCache_SingleNodeNameNotMatchGraphNameInCtx) { + ProviderOptions provider_options; +#if defined(_WIN32) + provider_options["backend_path"] = "QnnHtp.dll"; +#else + provider_options["backend_path"] = "libQnnHtp.so"; +#endif + const std::string context_binary_file = "./qnn_context_cache_non_embed.onnx"; + std::filesystem::path context_bin = "qnn_context_cache_non_embed.onnx_QNNExecutionProvider_QNN_8283143575221199085_1_0.bin"; + std::remove(context_binary_file.c_str()); + std::remove(context_bin.string().c_str()); + + std::unordered_map session_option_pairs; + session_option_pairs.emplace(kOrtSessionOptionEpContextEnable, "1"); + session_option_pairs.emplace(kOrtSessionOptionEpContextFilePath, context_binary_file); + session_option_pairs.emplace(kOrtSessionOptionEpContextEmbedMode, "0"); + + const TestInputDef input_def({1, 2, 3}, false, -10.0f, 10.0f); + const std::string op_type = "Atan"; + + // Runs model with DQ-> Atan-> Q and compares the outputs of the CPU and QNN EPs. + // 1st run will generate the Onnx skeleton file + Qnn context cache binary file + TestQDQModelAccuracy(BuildOpTestCase(op_type, {input_def}, {}, {}), + BuildQDQOpTestCase(op_type, {input_def}, {}, {}), + provider_options, + 14, + ExpectedEPNodeAssignment::All, + QDQTolerance(), + logging::Severity::kERROR, + "", // context model file path, not required for this inference + session_option_pairs); + + // Check the Onnx skeleton file is generated + EXPECT_TRUE(std::filesystem::exists(context_binary_file.c_str())); + // Check the Qnn context cache binary file is generated + EXPECT_TRUE(std::filesystem::exists(context_bin)); + + const std::unordered_map domain_to_version = {{"", 11}, {kMSDomain, 1}}; + auto& logging_manager = DefaultLoggingManager(); + onnxruntime::Model model("QNN_ctx_model", false, ModelMetaData(), PathString(), + IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {}, + logging_manager.DefaultLogger()); + Graph& graph = model.MainGraph(); + ModelTestBuilder helper(graph); + std::vector shape = {1, 2, 3}; + NodeArg* graph_input = MakeTestInput(helper, TestInputDef(shape, false, {0.0f, 1.0f, 0.0f, 1.0f, 0.0f, 1.0f})); + auto* graph_output = helper.MakeOutput(shape); + Node& ep_context_node = helper.AddNode("EPContext", {graph_input}, {graph_output}, kMSDomain); + ep_context_node.AddAttribute("embed_mode", static_cast(0)); + ep_context_node.AddAttribute("ep_cache_context", context_bin.string()); + ep_context_node.AddAttribute("partition_name", "QNNExecutionProvider_QNN_1110111000111000111_1_0"); + ep_context_node.AddAttribute("source", "QNNExecutionProvider"); + helper.SetGraphOutputs(); + ASSERT_STATUS_OK(graph.Resolve()); + std::string model_data; + model.ToProto().SerializeToString(&model_data); + + // loads and run from Onnx skeleton file + Qnn context cache binary file + + SessionOptions so; + so.session_logid = "qnn_ctx_model_logger"; + RunOptions run_options; + run_options.run_tag = so.session_logid; + + InferenceSessionWrapper session_object{so, GetEnvironment()}; + + ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(QnnExecutionProviderWithOptions(provider_options))); + ASSERT_STATUS_OK(session_object.Load(model_data.data(), static_cast(model_data.size()))); + // Verify the return status with code INVALID_GRAPH + ASSERT_TRUE(session_object.Initialize().Code() == common::StatusCode::OK); + + // Clean up + ASSERT_EQ(std::remove(context_binary_file.c_str()), 0); + ASSERT_EQ(std::remove(context_bin.string().c_str()), 0); +} + #endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) } // namespace test From 76a2a487a12c7ec579f453a36932429164494ef6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 22 Feb 2024 13:58:17 -0800 Subject: [PATCH 32/68] Bump ip from 1.1.8 to 1.1.9 in /js/react_native/e2e (#19583) Bumps [ip](https://github.com/indutny/node-ip) from 1.1.8 to 1.1.9.
Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=ip&package-manager=npm_and_yarn&previous-version=1.1.8&new-version=1.1.9)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) Dependabot will merge this PR once CI passes on it, as requested by @fs-eire. [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself) You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/microsoft/onnxruntime/network/alerts).
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- js/react_native/e2e/yarn.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/js/react_native/e2e/yarn.lock b/js/react_native/e2e/yarn.lock index 9e20a286c4e27..6f05faf046098 100644 --- a/js/react_native/e2e/yarn.lock +++ b/js/react_native/e2e/yarn.lock @@ -3351,9 +3351,9 @@ invariant@^2.2.4: loose-envify "^1.0.0" ip@^1.1.5: - version "1.1.8" - resolved "https://registry.yarnpkg.com/ip/-/ip-1.1.8.tgz#ae05948f6b075435ed3307acce04629da8cdbf48" - integrity sha512-PuExPYUiu6qMBQb4l06ecm6T6ujzhmh+MeJcW9wa89PoAz5pvd4zPgN5WJV104mb6S2T1AwNIAaB70JNrLQWhg== + version "1.1.9" + resolved "https://registry.yarnpkg.com/ip/-/ip-1.1.9.tgz#8dfbcc99a754d07f425310b86a99546b1151e396" + integrity sha512-cyRxvOEpNHNtchU3Ln9KC/auJgup87llfQpQ+t5ghoC/UhL16SWzbueiCsdTnWmqAWl7LadfuwhlqmtOaqMHdQ== is-accessor-descriptor@^0.1.6: version "0.1.6" From 5e5c36f6df95dfbb25787ea385f733f8c9ef691e Mon Sep 17 00:00:00 2001 From: AtomicVar Date: Fri, 23 Feb 2024 09:03:56 +0800 Subject: [PATCH 33/68] Fix citation author name issue (#19597) Use `name` rather than `given-names` to set author name. ### Motivation and Context The old CITATION.cff uses `given-names` to set author names, which won't be rendered properly with some bibtex style of LaTeX: image The problem is that **the `"ONNX Runtime developers"` is regarded as a human name**. How to fix: by using `name` to set author name, the generated Bibtex entry will use `{}` to enclose the `"ONNX Runtime developers"`. Then it is displayed literally: image --- CITATION.cff | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index 82bcac5a7b750..10b7290022aef 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -3,8 +3,7 @@ title: ONNX Runtime message: "Please use this information to cite ONNX Runtime in research or other publications." authors: - - affiliation: Microsoft Corporation - given-names: ONNX Runtime developers + - name: ONNX Runtime developers date-released: 2018-11-29 url: "https://onnxruntime.ai" repository-code: "https://github.com/microsoft/onnxruntime" From 4ab497603e915ca992b96ef1ec25bfcf8b9a2ad5 Mon Sep 17 00:00:00 2001 From: Hector Li Date: Thu, 22 Feb 2024 17:04:59 -0800 Subject: [PATCH 34/68] Enable user to set QNN HTP performance mode for every session run (#19521) ### Description Currently, the QNN HTP performance mode is set during session creation, there's no way to change it afterwards. There's requirement to set it high performance mode for high priority request and set it back to low performance mode later to save the power when the incoming request is idle for example. Now, still keeps the performance mode at the session level in QNN EP options which is used at the default one. Ort QNN EP will set it once if user set it. And there are setting (qnn.htp_perf_mode and qnn.htp_perf_mode_post_run) in run option to change the performance mode before and after session run. There's recommended scenario that user set the mode to high performance mode before the the inference sun so that user can get the result back ASAP. And set the mode to low performance mode after the inference to save the power. --- .../core/framework/execution_provider.h | 10 +- .../onnxruntime_run_options_config_keys.h | 12 + .../framework/stream_execution_context.cc | 4 +- .../providers/cann/cann_execution_provider.cc | 2 +- .../providers/cann/cann_execution_provider.h | 2 +- .../providers/cuda/cuda_execution_provider.cc | 4 +- .../providers/cuda/cuda_execution_provider.h | 5 +- .../src/ExecutionProvider.h | 4 +- .../providers/js/js_execution_provider.cc | 4 +- .../core/providers/js/js_execution_provider.h | 4 +- .../migraphx/migraphx_execution_provider.cc | 4 +- .../migraphx/migraphx_execution_provider.h | 4 +- .../qnn/builder/qnn_backend_manager.cc | 75 +++--- .../qnn/builder/qnn_backend_manager.h | 19 +- .../providers/qnn/qnn_execution_provider.cc | 198 +++++++++++++++- .../providers/qnn/qnn_execution_provider.h | 73 +++++- .../providers/rocm/rocm_execution_provider.cc | 4 +- .../providers/rocm/rocm_execution_provider.h | 4 +- .../tensorrt/tensorrt_execution_provider.cc | 4 +- .../tensorrt/tensorrt_execution_provider.h | 4 +- onnxruntime/core/session/inference_session.cc | 12 +- .../cuda_execution_provider_test.cc | 13 +- .../test/providers/qnn/qnn_basic_test.cc | 217 ++++++++++++++++-- 23 files changed, 577 insertions(+), 105 deletions(-) diff --git a/include/onnxruntime/core/framework/execution_provider.h b/include/onnxruntime/core/framework/execution_provider.h index 31c988f500779..c1cc69edc17d8 100644 --- a/include/onnxruntime/core/framework/execution_provider.h +++ b/include/onnxruntime/core/framework/execution_provider.h @@ -33,6 +33,8 @@ class Node; #include "core/framework/stream_handles.h" #include "core/framework/tuning_context.h" +struct OrtRunOptions; + namespace onnxruntime { /** @@ -51,6 +53,8 @@ struct NodeComputeInfo { DestroyFunctionStateFunc release_state_func; }; +using RunOptions = OrtRunOptions; + enum class DataLayout { NCHW, NHWC, @@ -184,7 +188,7 @@ class IExecutionProvider { Run may not be finished on device This function should be regarded as the point after which a new Run would start to submit commands from CPU */ - virtual common::Status OnRunStart() { return Status::OK(); } + virtual common::Status OnRunStart(const onnxruntime::RunOptions& /*run_options*/) { return Status::OK(); } /** Called when InferenceSession::Run ended @@ -192,7 +196,9 @@ class IExecutionProvider { may not be finished on device This function should be regarded as the point that all commands of current Run has been submmited by CPU */ - virtual common::Status OnRunEnd(bool /*sync_stream*/) { return Status::OK(); } + virtual common::Status OnRunEnd(bool /*sync_stream*/, const onnxruntime::RunOptions& /*run_options*/) { + return Status::OK(); + } /** Indicate whether the graph capturing mode (e.g., cuda graph) is enabled for diff --git a/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h index 1f5fcd50e185c..b0a17e175fef3 100644 --- a/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h +++ b/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h @@ -30,3 +30,15 @@ static const char* const kOrtRunOptionsConfigEnableMemoryArenaShrinkage = "memor // Per default it will be set to '0' // Taking CUDA EP as an example, it omit triggering cudaStreamSynchronize on the compute stream. static const char* const kOrtRunOptionsConfigDisableSynchronizeExecutionProviders = "disable_synchronize_execution_providers"; + +// Set HTP performance mode for QNN HTP backend before session run. +// options for HTP performance mode: "burst", "balanced", "default", "high_performance", +// "high_power_saver", "low_balanced", "extreme_power_saver", "low_power_saver", "power_saver", +// "sustained_high_performance". Default to "default". +static const char* const kOrtRunOptionsConfigQnnPerfMode = "qnn.htp_perf_mode"; + +// Set HTP performance mode for QNN HTP backend post session run. +static const char* const kOrtRunOptionsConfigQnnPerfModePostRun = "qnn.htp_perf_mode_post_run"; + +// Set RPC control latency for QNN HTP backend +static const char* const kOrtRunOptionsConfigQnnRpcControlLatency = "qnn.rpc_control_latency"; diff --git a/onnxruntime/core/framework/stream_execution_context.cc b/onnxruntime/core/framework/stream_execution_context.cc index 875e7f395bfa8..dd7f4d35b34bd 100644 --- a/onnxruntime/core/framework/stream_execution_context.cc +++ b/onnxruntime/core/framework/stream_execution_context.cc @@ -181,11 +181,13 @@ void RunSince(size_t stream_idx, StreamExecutionContext& ctx, SessionScope& sess } #ifdef USE_CANN + // Leave it to CANN EP to fill the gap if they want to use run_options + static onnxruntime::RunOptions run_options; // For CANN EP, it is necessary to explicitly create a corresponding Context for each thread in the thread pool, // which is different from CUDA Runtime API, but similar to CUDA Driver API. auto& execution_providers = ctx.GetSessionState().GetExecutionProviders(); for (auto& xp : execution_providers) { - auto status = xp->OnRunStart(); + auto status = xp->OnRunStart(run_options); if (!status.IsOK()) { ctx.SetStatus(status); return; diff --git a/onnxruntime/core/providers/cann/cann_execution_provider.cc b/onnxruntime/core/providers/cann/cann_execution_provider.cc index 752b742805a7c..9a242919665bb 100644 --- a/onnxruntime/core/providers/cann/cann_execution_provider.cc +++ b/onnxruntime/core/providers/cann/cann_execution_provider.cc @@ -1045,7 +1045,7 @@ CANNExecutionProvider::~CANNExecutionProvider() { } // All threads share the same context and stream -Status CANNExecutionProvider::OnRunStart() { +Status CANNExecutionProvider::OnRunStart(const onnxruntime::RunOptions& /*run_options*/) { CANN_RETURN_IF_ERROR(aclrtSetDevice(info_.device_id)); return Status::OK(); diff --git a/onnxruntime/core/providers/cann/cann_execution_provider.h b/onnxruntime/core/providers/cann/cann_execution_provider.h index 63ae980869c65..d83bd88d6958f 100644 --- a/onnxruntime/core/providers/cann/cann_execution_provider.h +++ b/onnxruntime/core/providers/cann/cann_execution_provider.h @@ -33,7 +33,7 @@ class CANNExecutionProvider : public IExecutionProvider { explicit CANNExecutionProvider(const CANNExecutionProviderInfo& info); virtual ~CANNExecutionProvider(); - Status OnRunStart() override; + Status OnRunStart(const onnxruntime::RunOptions& run_options) override; template Status Fill(Tensor* y, void* addr, aclrtStream stream) const { diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 48a952e6dd98f..0dd568c5ecc05 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -386,7 +386,7 @@ Status CUDAExecutionProvider::Sync() const { return Status::OK(); } -Status CUDAExecutionProvider::OnRunStart() { +Status CUDAExecutionProvider::OnRunStart(const onnxruntime::RunOptions& /*run_options*/) { // always set CUDA device when session::Run() in case it runs in a worker thread CUDA_RETURN_IF_ERROR(cudaSetDevice(GetDeviceId())); if (IsGraphCaptureEnabled() && GetPerThreadContext().IsGraphCaptureAllowed() && !GetPerThreadContext().IsGraphCaptured()) { @@ -396,7 +396,7 @@ Status CUDAExecutionProvider::OnRunStart() { return Status::OK(); } -Status CUDAExecutionProvider::OnRunEnd(bool sync_stream) { +Status CUDAExecutionProvider::OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& /*run_options*/) { if (IsGraphCaptureEnabled() && !GetPerThreadContext().IsGraphCaptured()) { if (GetPerThreadContext().IsGraphCaptureAllowed()) { GetPerThreadContext().CaptureEnd(); diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.h b/onnxruntime/core/providers/cuda/cuda_execution_provider.h index 55f0b5570e0ee..5f62f313b86a2 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.h +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.h @@ -29,9 +29,9 @@ class CUDAExecutionProvider : public IExecutionProvider { Status Sync() const override; - Status OnRunStart() override; + Status OnRunStart(const onnxruntime::RunOptions& run_options) override; - Status OnRunEnd(bool sync_stream) override; + Status OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& run_options) override; DataLayout GetPreferredLayout() const override; @@ -115,6 +115,7 @@ class CUDAExecutionProvider : public IExecutionProvider { PerThreadContext(OrtDevice::DeviceId device_id, cudaStream_t stream, size_t cuda_mem_limit, ArenaExtendStrategy arena_extend_strategy, CUDAExecutionProviderExternalAllocatorInfo external_alloc_info, OrtArenaCfg* arena_cfg); ~PerThreadContext(); + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(PerThreadContext); cublasHandle_t CublasHandle() const { return cublas_handle_; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h index 5617bc7bdcac6..841d6244a983e 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h @@ -270,7 +270,7 @@ namespace Dml return m_impl->OnSessionInitializationEnd(); } - virtual onnxruntime::Status Sync() const final override + onnxruntime::Status Sync() const final override { // Completely wait until the device has completed all preceding tasks. // The application could have called SynchronizeBoundOutputs(). @@ -278,7 +278,7 @@ namespace Dml return Status::OK(); } - virtual onnxruntime::Status OnRunEnd(bool /*sync_stream*/) final override + onnxruntime::Status OnRunEnd(bool /*sync_stream*/, const onnxruntime::RunOptions& /*run_options*/) final override { // Flush any pending work to the GPU, but don't block for completion, permitting it // to overlap other work. diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index 799d4172f2b64..62c3981682cfc 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -756,7 +756,7 @@ std::unique_ptr JsExecutionProvider::GetDataTransfer JsExecutionProvider::~JsExecutionProvider() { } -Status JsExecutionProvider::OnRunStart() { +Status JsExecutionProvider::OnRunStart(const onnxruntime::RunOptions& /*run_options*/) { if (IsGraphCaptureEnabled() && IsGraphCaptureAllowed() && !IsGraphCaptured()) { LOGS(*GetLogger(), INFO) << "Capturing the webgpu graph for this model"; EM_ASM({ Module.jsepCaptureBegin(); }); @@ -764,7 +764,7 @@ Status JsExecutionProvider::OnRunStart() { return Status::OK(); } -Status JsExecutionProvider::OnRunEnd(bool sync_stream) { +Status JsExecutionProvider::OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& /*run_options*/) { if (IsGraphCaptureEnabled() && !IsGraphCaptured()) { if (IsGraphCaptureAllowed()) { EM_ASM({ Module.jsepCaptureEnd(); }); diff --git a/onnxruntime/core/providers/js/js_execution_provider.h b/onnxruntime/core/providers/js/js_execution_provider.h index 91a3256ec2bd5..b4518c67d1e60 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.h +++ b/onnxruntime/core/providers/js/js_execution_provider.h @@ -59,8 +59,8 @@ class JsExecutionProvider : public IExecutionProvider { std::vector CreatePreferredAllocators() override; - Status OnRunStart() override; - Status OnRunEnd(bool sync_stream) override; + Status OnRunStart(const onnxruntime::RunOptions& run_options) override; + Status OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& run_options) override; bool IsGraphCaptureEnabled() const override; bool IsGraphCaptured() const override; diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc index 40e76a0a67782..50782569ee80a 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc @@ -1383,11 +1383,11 @@ Status MIGraphXExecutionProvider::Sync() const { return Status::OK(); } -Status MIGraphXExecutionProvider::OnRunStart() { +Status MIGraphXExecutionProvider::OnRunStart(const onnxruntime::RunOptions& /*run_options*/) { return Status::OK(); } -Status MIGraphXExecutionProvider::OnRunEnd(bool) { +Status MIGraphXExecutionProvider::OnRunEnd(bool /*sync_stream*/, const onnxruntime::RunOptions& /*run_options*/) { auto status = hipStreamQuery(stream_); if (status != hipSuccess) { diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h index d582338c7e067..c3617f409e72c 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h @@ -56,9 +56,9 @@ class MIGraphXExecutionProvider : public IExecutionProvider { #ifdef MIGRAPHX_STREAM_SYNC Status Sync() const override; - Status OnRunStart() override; + Status OnRunStart(const onnxruntime::RunOptions& run_options) override; - Status OnRunEnd(bool sync_stream) override; + Status OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& run_options) override; #endif std::vector> diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc index ca34a1efa6ca7..e354bf6562722 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc @@ -634,11 +634,6 @@ Status QnnBackendManager::SetupBackend(const logging::Logger& logger, bool load_ LOGS(logger, VERBOSE) << "CreateContext succeed."; } - if (htp_performance_mode_ != HtpPerformanceMode::kHtpDefault) { - ORT_RETURN_IF_ERROR(SetHtpPowerConfig()); - LOGS(logger, VERBOSE) << "SetHtpPowerConfig succeed."; - } - LOGS(logger, VERBOSE) << "QNN SetupBackend succeed"; backend_setup_completed_ = true; @@ -646,7 +641,7 @@ Status QnnBackendManager::SetupBackend(const logging::Logger& logger, bool load_ return Status::OK(); } -Status QnnBackendManager::SetHtpPowerConfig() { +Status QnnBackendManager::CreateHtpPowerCfgId(uint32_t device_id, uint32_t core_id, uint32_t& htp_power_config_id) { QnnDevice_Infrastructure_t qnn_device_infra = nullptr; auto status = qnn_interface_.deviceGetInfrastructure(&qnn_device_infra); ORT_RETURN_IF(QNN_SUCCESS != status, "backendGetPerfInfrastructure failed."); @@ -656,23 +651,37 @@ Status QnnBackendManager::SetHtpPowerConfig() { "HTP infra type = ", htp_infra->infraType, ", which is not perf infra type."); QnnHtpDevice_PerfInfrastructure_t& htp_perf_infra = htp_infra->perfInfra; // Get power client id - status = htp_perf_infra.createPowerConfigId(/*device_id=*/0, /*core_id=*/0, &htp_power_config_client_id_); + status = htp_perf_infra.createPowerConfigId(device_id, core_id, &htp_power_config_id); ORT_RETURN_IF(QNN_SUCCESS != status, "createPowerConfigId failed."); + return Status::OK(); +} + +Status QnnBackendManager::SetHtpPowerConfig(uint32_t htp_power_config_client_id, + HtpPerformanceMode htp_performance_mode) { + QnnDevice_Infrastructure_t qnn_device_infra = nullptr; + auto status = qnn_interface_.deviceGetInfrastructure(&qnn_device_infra); + ORT_RETURN_IF(QNN_SUCCESS != status, "backendGetPerfInfrastructure failed."); + + auto* htp_infra = static_cast(qnn_device_infra); + ORT_RETURN_IF(QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF != htp_infra->infraType, + "HTP infra type = ", htp_infra->infraType, ", which is not perf infra type."); + QnnHtpDevice_PerfInfrastructure_t& htp_perf_infra = htp_infra->perfInfra; + constexpr const int kNumConfigs = 1; std::vector power_configs( kNumConfigs); QnnHtpPerfInfrastructure_PowerConfig_t& dcvs_config = power_configs[0]; dcvs_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; QnnHtpPerfInfrastructure_DcvsV3_t& dcvs_v3 = dcvs_config.dcvsV3Config; - dcvs_v3.contextId = htp_power_config_client_id_; + dcvs_v3.contextId = htp_power_config_client_id; dcvs_v3.setSleepDisable = 0; dcvs_v3.sleepDisable = 0; dcvs_v3.setDcvsEnable = 1; dcvs_v3.dcvsEnable = kDcvsDisable; dcvs_v3.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; // choose performance mode - switch (htp_performance_mode_) { + switch (htp_performance_mode) { case HtpPerformanceMode::kHtpBurst: dcvs_v3.setSleepLatency = 1; // true dcvs_v3.sleepLatency = kSleepMinLatency; @@ -771,25 +780,40 @@ Status QnnBackendManager::SetHtpPowerConfig() { dcvs_v3.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_NOM_PLUS; break; default: - ORT_THROW("Invalid performance profile %d", static_cast(htp_performance_mode_)); + ORT_THROW("Invalid performance profile %d", static_cast(htp_performance_mode)); break; } std::vector perf_power_configs_ptr = ObtainNullTermPtrVector(power_configs); - status = htp_perf_infra.setPowerConfig(htp_power_config_client_id_, perf_power_configs_ptr.data()); + status = htp_perf_infra.setPowerConfig(htp_power_config_client_id, perf_power_configs_ptr.data()); ORT_RETURN_IF(QNN_SUCCESS != status, "setPowerConfig failed for HTP performance mode."); - // Set rpc control latency here, but note that v68 doesn't support rpc polling mode. - if (rpc_control_latency_ != 0) { + return Status::OK(); +} + +Status QnnBackendManager::SetRpcControlLatency(uint32_t htp_power_config_client_id, + uint32_t rpc_control_latency) { + if (rpc_control_latency != 0) { + QnnDevice_Infrastructure_t qnn_device_infra = nullptr; + auto status = qnn_interface_.deviceGetInfrastructure(&qnn_device_infra); + ORT_RETURN_IF(QNN_SUCCESS != status, "backendGetPerfInfrastructure failed."); + + auto* htp_infra = static_cast(qnn_device_infra); + ORT_RETURN_IF(QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF != htp_infra->infraType, + "HTP infra type = ", htp_infra->infraType, ", which is not perf infra type."); + QnnHtpDevice_PerfInfrastructure_t& htp_perf_infra = htp_infra->perfInfra; + + // Set rpc control latency here, but note that v68 doesn't support rpc polling mode. constexpr int kNumRpcPollingPowerConfigs = 2; std::vector rpc_power_configs(kNumRpcPollingPowerConfigs); - QnnHtpPerfInfrastructure_PowerConfig_t& rpc_control_latency = rpc_power_configs[0]; + QnnHtpPerfInfrastructure_PowerConfig_t& rpc_control_latency_cfg = rpc_power_configs[0]; // v68 doesn't support this. QnnHtpPerfInfrastructure_PowerConfig_t& rpc_polling_time = rpc_power_configs[1]; - rpc_control_latency.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY; + rpc_control_latency_cfg.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY; rpc_polling_time.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; - rpc_control_latency.rpcControlLatencyConfig = rpc_control_latency_; - perf_power_configs_ptr = ObtainNullTermPtrVector(rpc_power_configs); - status = htp_perf_infra.setPowerConfig(htp_power_config_client_id_, perf_power_configs_ptr.data()); + rpc_control_latency_cfg.rpcControlLatencyConfig = rpc_control_latency; + std::vector perf_power_configs_ptr = + ObtainNullTermPtrVector(rpc_power_configs); + status = htp_perf_infra.setPowerConfig(htp_power_config_client_id, perf_power_configs_ptr.data()); ORT_RETURN_IF(QNN_SUCCESS != status, "setPowerConfig failed for RPC control latency."); } @@ -810,11 +834,7 @@ void QnnBackendManager::Split(std::vector& split_string, } } -Status QnnBackendManager::DestroyHTPPowerConfigID() { - if (htp_performance_mode_ == HtpPerformanceMode::kHtpDefault) { - return Status::OK(); - } - +Status QnnBackendManager::DestroyHTPPowerConfigID(uint32_t htp_power_config_id) { QnnDevice_Infrastructure_t qnn_device_infra = nullptr; auto status = qnn_interface_.deviceGetInfrastructure(&qnn_device_infra); ORT_RETURN_IF(QNN_SUCCESS != status, "backendGetPerfInfrastructure failed."); @@ -824,7 +844,7 @@ Status QnnBackendManager::DestroyHTPPowerConfigID() { "HTP infra type = ", htp_infra->infraType, ", which is not perf infra type."); QnnHtpDevice_PerfInfrastructure_t& htp_perf_infra = htp_infra->perfInfra; - Qnn_ErrorHandle_t destroy_ret = htp_perf_infra.destroyPowerConfigId(htp_power_config_client_id_); + Qnn_ErrorHandle_t destroy_ret = htp_perf_infra.destroyPowerConfigId(htp_power_config_id); ORT_RETURN_IF(QNN_SUCCESS != destroy_ret, "destroyPowerConfigId failed."); return Status::OK(); } @@ -834,12 +854,7 @@ void QnnBackendManager::ReleaseResources() { return; } - auto result = DestroyHTPPowerConfigID(); - if (Status::OK() != result) { - ORT_THROW("Failed to DestroyHTPPowerConfigID."); - } - - result = ReleaseContext(); + auto result = ReleaseContext(); if (Status::OK() != result) { ORT_THROW("Failed to ReleaseContext."); } diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h index 36375522b5a0a..ff97c4c3a991c 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h @@ -33,8 +33,6 @@ class QnnBackendManager { public: QnnBackendManager(std::string&& backend_path, ProfilingLevel profiling_level, - uint32_t rpc_control_latency, - HtpPerformanceMode htp_performance_mode, ContextPriority context_priority, std::string&& qnn_saver_path, uint32_t device_id, @@ -42,8 +40,6 @@ class QnnBackendManager { uint32_t soc_model) : backend_path_(backend_path), profiling_level_(profiling_level), - rpc_control_latency_(rpc_control_latency), - htp_performance_mode_(htp_performance_mode), context_priority_(context_priority), qnn_saver_path_(qnn_saver_path), device_id_(device_id), @@ -92,7 +88,13 @@ class QnnBackendManager { Status SetupBackend(const logging::Logger& logger, bool load_from_cached_context); - Status SetHtpPowerConfig(); + Status CreateHtpPowerCfgId(uint32_t deviceId, uint32_t coreId, uint32_t& htp_power_config_id); + + Status SetHtpPowerConfig(uint32_t htp_power_config_client_id, + HtpPerformanceMode htp_performance_mode); + + Status SetRpcControlLatency(uint32_t htp_power_config_client_id, + uint32_t rpc_control_latency); const QNN_INTERFACE_VER_TYPE& GetQnnInterface() { return qnn_interface_; } @@ -141,6 +143,8 @@ class QnnBackendManager { const std::string& GetSdkVersion() { return sdk_build_version_; } + Status DestroyHTPPowerConfigID(uint32_t htp_power_config_id); + private: void* LoadLib(const char* file_name, int flags, std::string& error_msg); @@ -150,8 +154,6 @@ class QnnBackendManager { Status UnloadLib(void* handle); - Status DestroyHTPPowerConfigID(); - void* LibFunction(void* handle, const char* symbol, std::string& error_msg); template @@ -232,15 +234,12 @@ class QnnBackendManager { QnnBackendType qnn_backend_type_ = QnnBackendType::CPU; Qnn_ProfileHandle_t profile_backend_handle_ = nullptr; std::vector op_package_paths_; - uint32_t rpc_control_latency_ = 0; - HtpPerformanceMode htp_performance_mode_; ContextPriority context_priority_; std::string sdk_build_version_ = ""; #ifdef _WIN32 std::set mod_handles_; #endif const std::string qnn_saver_path_; - uint32_t htp_power_config_client_id_ = 0; uint32_t device_id_ = 0; QnnHtpDevice_Arch_t htp_arch_ = QNN_HTP_DEVICE_ARCH_NONE; uint32_t soc_model_ = QNN_SOC_MODEL_UNKNOWN; diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index 9a6540a3efea5..3d9cfd92b7922 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -7,6 +7,7 @@ #include "core/framework/compute_capability.h" #include "core/graph/graph_viewer.h" #include "core/session/onnxruntime_session_options_config_keys.h" +#include "core/session/onnxruntime_run_options_config_keys.h" #include "core/session/onnxruntime_cxx_api.h" #include "core/framework/kernel_registry.h" #include "core/platform/env.h" @@ -18,11 +19,36 @@ #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_def.h" #include "core/providers/qnn/builder/onnx_ctx_model_helper.h" +#include "core/framework/run_options.h" namespace onnxruntime { constexpr const char* QNN = "QNN"; +static std::unique_ptr>> s_run_on_unload_; + +void RunOnUnload(std::function function) { + OrtMutex mutex; + std::lock_guard guard(mutex); + if (!s_run_on_unload_) { + s_run_on_unload_ = std::make_unique>>(); + } + s_run_on_unload_->push_back(std::move(function)); +} + +struct OnUnload { + ~OnUnload() { + if (!s_run_on_unload_) + return; + + for (auto& function : *s_run_on_unload_) + function(); + + s_run_on_unload_.reset(); + } + +} g_on_unload; + static void ParseProfilingLevel(std::string profiling_level_string, qnn::ProfilingLevel& profiling_level) { std::transform(profiling_level_string.begin(), @@ -193,18 +219,18 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio } static const std::string RPC_CONTROL_LANTENCY = "rpc_control_latency"; - uint32_t rpc_control_latency = 0; auto latency_pos = provider_options_map.find(RPC_CONTROL_LANTENCY); if (latency_pos != provider_options_map.end()) { - rpc_control_latency = static_cast(std::stoul(latency_pos->second)); - LOGS_DEFAULT(VERBOSE) << "rpc_control_latency: " << rpc_control_latency; + default_rpc_control_latency_ = static_cast(std::stoul(latency_pos->second)); + LOGS_DEFAULT(VERBOSE) << "rpc_control_latency: " << default_rpc_control_latency_; } - qnn::HtpPerformanceMode htp_performance_mode = qnn::HtpPerformanceMode::kHtpDefault; + // default_htp_performance_mode from QNN EP option. + // set it once only for each thread as default so user don't need to set it for every session run static const std::string HTP_PERFORMANCE_MODE = "htp_performance_mode"; auto htp_performance_mode_pos = provider_options_map.find(HTP_PERFORMANCE_MODE); if (htp_performance_mode_pos != provider_options_map.end()) { - ParseHtpPerformanceMode(htp_performance_mode_pos->second, htp_performance_mode); + ParseHtpPerformanceMode(htp_performance_mode_pos->second, default_htp_performance_mode_); } htp_graph_finalization_opt_mode_ = qnn::HtpGraphFinalizationOptimizationMode::kDefault; @@ -241,15 +267,14 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio } static const std::string QNN_DEVICE_ID = "device_id"; - uint32_t device_id = 0; auto dev_id_pos = provider_options_map.find(QNN_DEVICE_ID); if (dev_id_pos != provider_options_map.end()) { int value = std::stoi(dev_id_pos->second); if (value < 0) { LOGS_DEFAULT(WARNING) << "Invalid device ID '" << value - << "', only >= 0 allowed. Set to " << device_id << "."; + << "', only >= 0 allowed. Set to " << device_id_ << "."; } else { - device_id = static_cast(value); + device_id_ = static_cast(value); } } @@ -276,15 +301,23 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio qnn_backend_manager_ = std::make_unique( std::move(backend_path), profiling_level, - rpc_control_latency, - htp_performance_mode, context_priority, std::move(qnn_saver_path), - device_id, + device_id_, htp_arch, soc_model); } +QNNExecutionProvider::~QNNExecutionProvider() { + // clean up thread local context caches + std::lock_guard lock(context_state_.mutex); + for (const auto& cache_weak : context_state_.caches_to_update_on_destruction) { + const auto cache = cache_weak.lock(); + if (!cache) continue; + ORT_IGNORE_RETURN_VALUE(cache->erase(this)); + } +} + bool QNNExecutionProvider::IsNodeSupported(qnn::QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, const logging::Logger& logger) const { const std::string& op_type = node_unit.OpType(); @@ -725,4 +758,147 @@ const InlinedVector QNNExecutionProvider::GetEpContextNodes() const return ep_context_nodes; } + +QNNExecutionProvider::PerThreadContext::PerThreadContext(qnn::QnnBackendManager* qnn_backend_manager, + uint32_t device_id, + uint32_t core_id, + qnn::HtpPerformanceMode default_htp_performance_mode, + uint32_t default_rpc_control_latency) + : qnn_backend_manager_(qnn_backend_manager) { + Status rt = qnn_backend_manager_->CreateHtpPowerCfgId(device_id, core_id, htp_power_config_id_); + is_htp_power_config_id_valid_ = rt.IsOK(); + // default_htp_performance_mode and default_rpc_control_latency are from QNN EP option. + // set it once only for each thread as default so user don't need to set it for every session run + if (is_htp_power_config_id_valid_) { + if (qnn::HtpPerformanceMode::kHtpDefault != default_htp_performance_mode) { + ORT_IGNORE_RETURN_VALUE(qnn_backend_manager_->SetHtpPowerConfig(htp_power_config_id_, + default_htp_performance_mode)); + } + if (default_rpc_control_latency > 0) { + ORT_IGNORE_RETURN_VALUE(qnn_backend_manager_->SetRpcControlLatency(htp_power_config_id_, + default_rpc_control_latency)); + } + } +} + +QNNExecutionProvider::PerThreadContext::~PerThreadContext() { + if (is_htp_power_config_id_valid_) { + ORT_IGNORE_RETURN_VALUE(qnn_backend_manager_->DestroyHTPPowerConfigID(htp_power_config_id_)); + } +} + +QNNExecutionProvider::PerThreadContext& QNNExecutionProvider::GetPerThreadContext() const { + const auto& per_thread_context_cache = PerThreadContextCache(); + + // try to use cached context + auto cached_context_it = per_thread_context_cache->find(this); + if (cached_context_it != per_thread_context_cache->end()) { + auto cached_context = cached_context_it->second.lock(); + ORT_ENFORCE(cached_context); + return *cached_context; + } + + // get context and update cache + std::shared_ptr context; + { + std::lock_guard lock(context_state_.mutex); + + // get or create a context + if (context_state_.retired_context_pool.empty()) { + uint32_t core_id = 0; + context = std::make_shared(qnn_backend_manager_.get(), device_id_, core_id, + default_htp_performance_mode_, default_rpc_control_latency_); + } else { + context = context_state_.retired_context_pool.back(); + context_state_.retired_context_pool.pop_back(); + } + + // insert into active_contexts, should not already be present + const auto active_contexts_insert_result = context_state_.active_contexts.insert(context); + ORT_ENFORCE(active_contexts_insert_result.second); + + // insert into caches_to_update_on_destruction, may already be present + ORT_IGNORE_RETURN_VALUE(context_state_.caches_to_update_on_destruction.insert(per_thread_context_cache)); + } + + per_thread_context_cache->insert(std::make_pair(this, context)); + + return *context; +} + +void QNNExecutionProvider::ReleasePerThreadContext() const { + const auto& per_thread_context_cache = PerThreadContextCache(); + + auto cached_context_it = per_thread_context_cache->find(this); + ORT_ENFORCE(cached_context_it != per_thread_context_cache->end()); + auto cached_context = cached_context_it->second.lock(); + ORT_ENFORCE(cached_context); + + { + std::lock_guard lock(context_state_.mutex); + context_state_.active_contexts.erase(cached_context); + context_state_.retired_context_pool.push_back(cached_context); + } + + per_thread_context_cache->erase(cached_context_it); +} + +Status QNNExecutionProvider::OnRunStart(const onnxruntime::RunOptions& run_options) { + auto backend_type = qnn_backend_manager_->GetQnnBackendType(); + if (qnn::QnnBackendType::HTP != backend_type && qnn::QnnBackendType::DSP != backend_type) { + return Status::OK(); + } + + std::string htp_perf_mode = ""; + qnn::HtpPerformanceMode htp_performance_mode = qnn::HtpPerformanceMode::kHtpDefault; + if (run_options.config_options.TryGetConfigEntry(kOrtRunOptionsConfigQnnPerfMode, htp_perf_mode)) { + // set power mode + ParseHtpPerformanceMode(htp_perf_mode, htp_performance_mode); + } + + std::string rpc_latency = ""; + uint32_t rpc_control_latency = 0; + if (run_options.config_options.TryGetConfigEntry(kOrtRunOptionsConfigQnnRpcControlLatency, rpc_latency)) { + rpc_control_latency = static_cast(std::stoul(rpc_latency)); + LOGS_DEFAULT(VERBOSE) << "rpc_control_latency: " << rpc_control_latency; + } + + if (GetPerThreadContext().IsHtpPowerConfigIdValid()) { + if (qnn::HtpPerformanceMode::kHtpDefault != htp_performance_mode) { + ORT_RETURN_IF_ERROR(qnn_backend_manager_->SetHtpPowerConfig(GetPerThreadContext().GetHtpPowerConfigId(), + htp_performance_mode)); + } + + if (rpc_control_latency > 0) { + ORT_RETURN_IF_ERROR(qnn_backend_manager_->SetRpcControlLatency(GetPerThreadContext().GetHtpPowerConfigId(), + rpc_control_latency)); + } + } + + return Status::OK(); +} + +Status QNNExecutionProvider::OnRunEnd(bool /*sync_stream*/, const onnxruntime::RunOptions& run_options) { + auto backend_type = qnn_backend_manager_->GetQnnBackendType(); + if (qnn::QnnBackendType::HTP != backend_type && qnn::QnnBackendType::DSP != backend_type) { + return Status::OK(); + } + + std::string htp_perf_mode = ""; + qnn::HtpPerformanceMode htp_performance_mode = qnn::HtpPerformanceMode::kHtpDefault; + if (run_options.config_options.TryGetConfigEntry(kOrtRunOptionsConfigQnnPerfModePostRun, htp_perf_mode)) { + // set power mode + ParseHtpPerformanceMode(htp_perf_mode, htp_performance_mode); + } + + if (qnn::HtpPerformanceMode::kHtpDefault != htp_performance_mode) { + if (!GetPerThreadContext().IsHtpPowerConfigIdValid()) { + return Status::OK(); + } + ORT_RETURN_IF_ERROR(qnn_backend_manager_->SetHtpPowerConfig(GetPerThreadContext().GetHtpPowerConfigId(), + htp_performance_mode)); + } + + return Status::OK(); +} } // namespace onnxruntime diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.h b/onnxruntime/core/providers/qnn/qnn_execution_provider.h index 0bcaa39b22f6d..43b5e7bff827e 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.h +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.h @@ -12,14 +12,19 @@ #include "core/providers/qnn/builder/qnn_model.h" #include "core/providers/qnn/builder/qnn_configs_helper.h" #include "HTP/QnnHtpGraph.h" +#include +#include +#include namespace onnxruntime { +void RunOnUnload(std::function function); + // Logical device representation. class QNNExecutionProvider : public IExecutionProvider { public: explicit QNNExecutionProvider(const ProviderOptions& provider_options_map, const SessionOptions* session_options); - virtual ~QNNExecutionProvider() = default; + virtual ~QNNExecutionProvider(); ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(QNNExecutionProvider); // we implement the Compile that takes FusedNodeAndGraph instances @@ -40,6 +45,10 @@ class QNNExecutionProvider : public IExecutionProvider { const InlinedVector GetEpContextNodes() const override; + Status OnRunStart(const onnxruntime::RunOptions& run_options) override; + + Status OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& run_options) override; + private: bool IsNodeSupported(qnn::QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, const logging::Logger& logger) const; @@ -72,6 +81,68 @@ class QNNExecutionProvider : public IExecutionProvider { int32_t vtcm_size_in_mb_ = 0; std::unique_ptr qnn_ep_context_model_; ModelMetadefIdGenerator metadef_id_generator_; + uint32_t device_id_ = 0; + qnn::HtpPerformanceMode default_htp_performance_mode_ = qnn::HtpPerformanceMode::kHtpDefault; + uint32_t default_rpc_control_latency_ = 0; + + class PerThreadContext final { + public: + PerThreadContext(qnn::QnnBackendManager* qnn_backend_manager, + uint32_t device_id, uint32_t core_id, + qnn::HtpPerformanceMode default_htp_performance_mode, + uint32_t default_rpc_control_latency); + ~PerThreadContext(); + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(PerThreadContext); + + bool IsHtpPowerConfigIdValid() { return is_htp_power_config_id_valid_; } + + uint32_t GetHtpPowerConfigId() { return htp_power_config_id_; } + + private: + bool is_htp_power_config_id_valid_ = false; + uint32_t htp_power_config_id_ = 0; + qnn::QnnBackendManager* qnn_backend_manager_; + }; + + using PerThreadContextMap = std::unordered_map>; + + struct ContextCacheHolder { + ContextCacheHolder() { + RunOnUnload([&, weak_p_ = std::weak_ptr(p)] { + if (auto lock = weak_p_.lock()) + p.reset(); + }); + } + + std::shared_ptr p = std::make_shared(); + }; + + static const std::shared_ptr& PerThreadContextCache() { + thread_local const ContextCacheHolder per_thread_context_cache; + return per_thread_context_cache.p; + } + + struct PerThreadContextState { + // contexts that are currently active + std::set, std::owner_less>> active_contexts; + // contexts available for reuse + std::vector> retired_context_pool; + // weak references to thread local caches from which this QNNExecutionProvider instance's entry should be removed + // upon destruction + std::set, std::owner_less>> + caches_to_update_on_destruction; + // synchronizes access to PerThreadContextState members + OrtMutex mutex; + }; + + // The execution provider maintains the PerThreadContexts in this structure. + // Synchronization is required to update the contained structures. + // On the other hand, access to an individual PerThreadContext is assumed to be from a single thread at a time, + // so synchronization is not required for that. + mutable PerThreadContextState context_state_; + + PerThreadContext& GetPerThreadContext() const; + void ReleasePerThreadContext() const; }; } // namespace onnxruntime diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc index ee3578326ac6d..3fd5423681b81 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc @@ -353,7 +353,7 @@ Status ROCMExecutionProvider::Sync() const { return Status::OK(); } -Status ROCMExecutionProvider::OnRunStart() { +Status ROCMExecutionProvider::OnRunStart(const onnxruntime::RunOptions& /*run_options*/) { // always set ROCM device when session::Run() in case it runs in a worker thread HIP_RETURN_IF_ERROR(hipSetDevice(GetDeviceId())); if (IsGraphCaptureEnabled() && GetPerThreadContext().IsGraphCaptureAllowed() && !GetPerThreadContext().IsGraphCaptured()) { @@ -363,7 +363,7 @@ Status ROCMExecutionProvider::OnRunStart() { return Status::OK(); } -Status ROCMExecutionProvider::OnRunEnd(bool sync_stream) { +Status ROCMExecutionProvider::OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& /*run_options*/) { if (IsGraphCaptureEnabled() && !GetPerThreadContext().IsGraphCaptured()) { if (GetPerThreadContext().IsGraphCaptureAllowed()) { GetPerThreadContext().CaptureEnd(); diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.h b/onnxruntime/core/providers/rocm/rocm_execution_provider.h index 37d5f7b42210f..da671d9e863bb 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider.h +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.h @@ -28,9 +28,9 @@ class ROCMExecutionProvider : public IExecutionProvider { Status Sync() const override; - Status OnRunStart() override; + Status OnRunStart(const onnxruntime::RunOptions& run_options) override; - Status OnRunEnd(bool sync_stream) override; + Status OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& run_options) override; const void* GetExecutionHandle() const noexcept override { // The ROCM interface does not return anything interesting. diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index c0bf29e486c88..81346671f2aad 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -1818,11 +1818,11 @@ std::unique_ptr TensorrtExecutionProvider::GetDataTransfer() cons return onnxruntime::CreateGPUDataTransfer(); } -Status TensorrtExecutionProvider::OnRunStart() { +Status TensorrtExecutionProvider::OnRunStart(const onnxruntime::RunOptions& /*run_options*/) { return Status::OK(); } -Status TensorrtExecutionProvider::OnRunEnd(bool sync_stream) { +Status TensorrtExecutionProvider::OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& /*run_options*/) { if (sync_stream && external_stream_) { CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream_)); } diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h index e86f997b6597a..26f6b2dcc3020 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h @@ -233,8 +233,8 @@ class TensorrtExecutionProvider : public IExecutionProvider { common::Status Compile(const std::vector& fused_nodes_and_graphs, std::vector& node_compute_funcs) override; - Status OnRunStart() override; - Status OnRunEnd(bool sync_stream) override; + Status OnRunStart(const onnxruntime::RunOptions& run_options) override; + Status OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& run_options) override; ProviderOptions GetProviderOptions() const override { return TensorrtExecutionProviderInfo::ToProviderOptions(info_); diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index b045f30a59797..efd7db4ea7629 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -2289,8 +2289,8 @@ Status InferenceSession::PartialRun(onnxruntime::RunOptions& run_options, // TODO: only call OnRunStart for all providers in-use for (auto& xp : execution_providers_) { // call OnRunStart and add to exec_providers_to_stop if successful - auto start_func = [&xp, &exec_providers_to_stop]() { - auto status = xp->OnRunStart(); + auto start_func = [&xp, &exec_providers_to_stop, run_options]() { + auto status = xp->OnRunStart(run_options); if (status.IsOK()) exec_providers_to_stop.push_back(xp.get()); @@ -2326,7 +2326,7 @@ Status InferenceSession::PartialRun(onnxruntime::RunOptions& run_options, // info all execution providers InferenceSession:Run ended for (auto* xp : exec_providers_to_stop) { - auto status = xp->OnRunEnd(/*sync_stream*/ false); + auto status = xp->OnRunEnd(/*sync_stream*/ false, run_options); ORT_CHECK_AND_SET_RETVAL(status); } @@ -2448,8 +2448,8 @@ Status InferenceSession::Run(const RunOptions& run_options, // TODO: only call OnRunStart for all providers in-use for (auto& xp : execution_providers_) { // call OnRunStart and add to exec_providers_to_stop if successful - auto start_func = [&xp, &exec_providers_to_stop]() { - auto status = xp->OnRunStart(); + auto start_func = [&xp, &exec_providers_to_stop, &run_options]() { + auto status = xp->OnRunStart(run_options); if (status.IsOK()) exec_providers_to_stop.push_back(xp.get()); @@ -2490,7 +2490,7 @@ Status InferenceSession::Run(const RunOptions& run_options, // info all execution providers InferenceSession:Run ended for (auto* xp : exec_providers_to_stop) { bool synchronize_execution_providers = run_options.config_options.GetConfigOrDefault(kOrtRunOptionsConfigDisableSynchronizeExecutionProviders, "0") == "0"; - auto status = xp->OnRunEnd(synchronize_execution_providers); + auto status = xp->OnRunEnd(synchronize_execution_providers, run_options); ORT_CHECK_AND_SET_RETVAL(status); } diff --git a/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc b/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc index a70e439cdf755..5505d689381c9 100644 --- a/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc @@ -22,6 +22,8 @@ TEST(TestDeferredRelease, WithArena) { CUDAExecutionProvider ep(info); AllocatorPtr gpu_alloctor = ep.CreatePreferredAllocators()[0]; + RunOptions run_opts; + run_opts.run_tag = "log1"; // Allocator for call cudaMallocHost and cudaFreeHost // For details, see CUDAPinnedAllocator in cuda_allocator.cc. AllocatorPtr cpu_pinned_alloc = ep.CreatePreferredAllocators()[1]; @@ -31,7 +33,7 @@ TEST(TestDeferredRelease, WithArena) { // 10 MB const size_t n_bytes = 10 * 1000000; const int64_t n_allocs = 64; - ORT_THROW_IF_ERROR(ep.OnRunStart()); + ORT_THROW_IF_ERROR(ep.OnRunStart(run_opts)); for (size_t i = 0; i < n_allocs; ++i) { // Allocate 10MB CUDA pinned memory. auto pinned_buffer = IAllocator::MakeUniquePtr(cpu_pinned_alloc, n_bytes); @@ -44,7 +46,7 @@ TEST(TestDeferredRelease, WithArena) { cpu_pinned_alloc->GetStats(&stats); ASSERT_EQ(stats.num_allocs, n_allocs); ORT_THROW_IF_ERROR(stream.CleanUpOnRunEnd()); - ORT_THROW_IF_ERROR(ep.OnRunEnd(true)); + ORT_THROW_IF_ERROR(ep.OnRunEnd(true, run_opts)); } TEST(TestDeferredRelease, WithoutArena) { @@ -52,6 +54,9 @@ TEST(TestDeferredRelease, WithoutArena) { CUDAExecutionProviderInfo info; CUDAExecutionProvider ep(info); + RunOptions run_opts; + run_opts.run_tag = "log1"; + OrtDevice pinned_device{OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, DEFAULT_CPU_ALLOCATOR_DEVICE_ID}; // Create allocator without BFCArena AllocatorCreationInfo pinned_memory_info( @@ -70,7 +75,7 @@ TEST(TestDeferredRelease, WithoutArena) { // 10 MB const size_t n_bytes = 10 * 1000000; const int64_t n_allocs = 64; - ORT_THROW_IF_ERROR(ep.OnRunStart()); + ORT_THROW_IF_ERROR(ep.OnRunStart(run_opts)); for (size_t i = 0; i < n_allocs; ++i) { // Allocate 10MB CUDA pinned memory. auto pinned_buffer = IAllocator::MakeUniquePtr(cuda_pinned_alloc, n_bytes); @@ -79,7 +84,7 @@ TEST(TestDeferredRelease, WithoutArena) { } ORT_THROW_IF_ERROR(stream.CleanUpOnRunEnd()); - ORT_THROW_IF_ERROR(ep.OnRunEnd(true)); + ORT_THROW_IF_ERROR(ep.OnRunEnd(true, run_opts)); } } // namespace test diff --git a/onnxruntime/test/providers/qnn/qnn_basic_test.cc b/onnxruntime/test/providers/qnn/qnn_basic_test.cc index 4e1aef2c40b2b..8f07c2ce77e77 100644 --- a/onnxruntime/test/providers/qnn/qnn_basic_test.cc +++ b/onnxruntime/test/providers/qnn/qnn_basic_test.cc @@ -7,6 +7,7 @@ #include "core/session/onnxruntime_cxx_api.h" #include "core/session/onnxruntime_session_options_config_keys.h" +#include "core/session/onnxruntime_run_options_config_keys.h" #include "core/providers/cpu/cpu_provider_factory.h" // For OrtSessionOptionsAppendExecutionProvider_CPU #include "core/session/inference_session.h" @@ -332,19 +333,23 @@ static void CreateModelInMemory(std::unique_ptr& result, static void RunSessionAndVerify(InferenceSession& session, const RunOptions& run_options, const NameMLValMap& feeds, const std::vector& output_names, const std::vector>& output_shapes, - const std::vector>& expected_values) { - std::vector fetches; - auto status = session.Run(run_options, feeds, output_names, &fetches); - ASSERT_TRUE(status.IsOK()); - - for (size_t i = 0; i < fetches.size(); i++) { - auto& tensor = fetches[i].Get(); - TensorShape expected_shape(output_shapes[i]); - ASSERT_EQ(expected_shape, tensor.Shape()); - - gsl::span actual = tensor.DataAsSpan(); - gsl::span expected(expected_values[i].data(), expected_values[i].size()); - ASSERT_EQ(expected, actual); + const std::vector>& expected_values, + int loop_count = 10) { + // Let it run for a while + for (int it = 0; it < loop_count; ++it) { + std::vector fetches; + auto status = session.Run(run_options, feeds, output_names, &fetches); + ASSERT_TRUE(status.IsOK()); + + for (size_t i = 0; i < fetches.size(); i++) { + auto& tensor = fetches[i].Get(); + TensorShape expected_shape(output_shapes[i]); + ASSERT_EQ(expected_shape, tensor.Shape()); + + gsl::span actual = tensor.DataAsSpan(); + gsl::span expected(expected_values[i].data(), expected_values[i].size()); + ASSERT_EQ(expected, actual); + } } } @@ -404,11 +409,11 @@ TEST_F(QnnCPUBackendTests, MultithreadSessionRun) { std::vector threads; constexpr int num_threads = 5; - + constexpr int loop_count = 10; for (int i = 0; i < num_threads; i++) { threads.push_back(std::thread(RunSessionAndVerify, std::ref(session_obj), run_opts, model->builder.feeds_, model->builder.output_names_, - output_shapes, output_values)); + output_shapes, output_values, loop_count)); } for (auto& th : threads) { @@ -484,11 +489,191 @@ TEST_F(QnnHTPBackendTests, MultithreadSessionRun) { std::vector threads; constexpr int num_threads = 5; + constexpr int loop_count = 10; for (int i = 0; i < num_threads; i++) { threads.push_back(std::thread(RunSessionAndVerify, std::ref(session_obj), run_opts, model->builder.feeds_, model->builder.output_names_, - output_shapes, output_values)); + output_shapes, output_values, loop_count)); + } + + for (auto& th : threads) { + th.join(); + } +} + +// Tests running a single session in multiple threads on the HTP backend with run option to set power config +TEST_F(QnnHTPBackendTests, MultithreadHtpPowerCfgSessionRunOption) { + std::unique_ptr model; + std::vector input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + std::vector shape = {1, 3, 2}; + std::vector> output_shapes = {shape}; + std::vector> output_values = {{3.0f, 6.0f, 9.0f, 12.0f, 15.0f, 18.0f}}; + + CreateModelInMemory(model, + QDQBuildAdd3Tensors(TestInputDef(shape, false, input_data), + TestInputDef(shape, false, input_data), + TestInputDef(shape, false, input_data)), + "add3.qdq"); + + SessionOptions session_opts; + session_opts.session_logid = "logger0"; + + InferenceSession session_obj{session_opts, GetEnvironment()}; + onnxruntime::ProviderOptions options; + +#if defined(_WIN32) + options["backend_path"] = "QnnHtp.dll"; +#else + options["backend_path"] = "libQnnHtp.so"; +#endif + + auto qnn_ep = QnnExecutionProviderWithOptions(options, &session_opts); + EXPECT_TRUE(session_obj.RegisterExecutionProvider(std::move(qnn_ep)).IsOK()); + + auto status = session_obj.Load(model->model_data.data(), static_cast(model->model_data.size())); + ASSERT_TRUE(status.IsOK()); + status = session_obj.Initialize(); + ASSERT_TRUE(status.IsOK()); + + std::vector threads; + constexpr int num_threads = 5; + constexpr int loop_count = 10; + + std::vector perf_modes{ + "burst", "balanced", "default", "high_performance", "high_power_saver", + "low_balanced", "extreme_power_saver", "low_power_saver", "power_saver"}; + + size_t post_i = perf_modes.size() - 1; + ASSERT_TRUE(post_i > num_threads); + for (int i = 0; i < num_threads; ++i, --post_i) { + RunOptions run_opts; + run_opts.run_tag = session_opts.session_logid; + auto rt = run_opts.config_options.AddConfigEntry(kOrtRunOptionsConfigQnnPerfMode, perf_modes[i].c_str()); + ASSERT_TRUE(rt.IsOK()); + rt = run_opts.config_options.AddConfigEntry(kOrtRunOptionsConfigQnnPerfModePostRun, perf_modes[post_i].c_str()); + ASSERT_TRUE(rt.IsOK()); + + threads.push_back(std::thread(RunSessionAndVerify, std::ref(session_obj), run_opts, + model->builder.feeds_, model->builder.output_names_, + output_shapes, output_values, loop_count)); + } + + for (auto& th : threads) { + th.join(); + } +} + +// Tests running a single session in multiple threads on the HTP backend with EP option to set default power config +TEST_F(QnnHTPBackendTests, MultithreadDefaultHtpPowerCfgFromEpOption) { + std::unique_ptr model; + std::vector input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + std::vector shape = {1, 3, 2}; + std::vector> output_shapes = {shape}; + std::vector> output_values = {{3.0f, 6.0f, 9.0f, 12.0f, 15.0f, 18.0f}}; + + CreateModelInMemory(model, + QDQBuildAdd3Tensors(TestInputDef(shape, false, input_data), + TestInputDef(shape, false, input_data), + TestInputDef(shape, false, input_data)), + "add3.qdq"); + + SessionOptions session_opts; + session_opts.session_logid = "logger0"; + + RunOptions run_opts; + run_opts.run_tag = session_opts.session_logid; + + InferenceSession session_obj{session_opts, GetEnvironment()}; + onnxruntime::ProviderOptions options; + +#if defined(_WIN32) + options["backend_path"] = "QnnHtp.dll"; +#else + options["backend_path"] = "libQnnHtp.so"; +#endif + options["htp_performance_mode"] = "burst"; + + auto qnn_ep = QnnExecutionProviderWithOptions(options, &session_opts); + EXPECT_TRUE(session_obj.RegisterExecutionProvider(std::move(qnn_ep)).IsOK()); + + auto status = session_obj.Load(model->model_data.data(), static_cast(model->model_data.size())); + ASSERT_TRUE(status.IsOK()); + status = session_obj.Initialize(); + ASSERT_TRUE(status.IsOK()); + + std::vector threads; + constexpr int num_threads = 5; + constexpr int loop_count = 10; + + for (int i = 0; i < num_threads; i++) { + threads.push_back(std::thread(RunSessionAndVerify, std::ref(session_obj), run_opts, + model->builder.feeds_, model->builder.output_names_, + output_shapes, output_values, loop_count)); + } + + for (auto& th : threads) { + th.join(); + } +} + +// Tests running a single session in multiple threads on the HTP backend with +// EP option to set default power config + run option to set power config for each run +TEST_F(QnnHTPBackendTests, MultithreadHtpPowerCfgDefaultAndRunOption) { + std::unique_ptr model; + std::vector input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + std::vector shape = {1, 3, 2}; + std::vector> output_shapes = {shape}; + std::vector> output_values = {{3.0f, 6.0f, 9.0f, 12.0f, 15.0f, 18.0f}}; + + CreateModelInMemory(model, + QDQBuildAdd3Tensors(TestInputDef(shape, false, input_data), + TestInputDef(shape, false, input_data), + TestInputDef(shape, false, input_data)), + "add3.qdq"); + + SessionOptions session_opts; + session_opts.session_logid = "logger0"; + + InferenceSession session_obj{session_opts, GetEnvironment()}; + onnxruntime::ProviderOptions options; + +#if defined(_WIN32) + options["backend_path"] = "QnnHtp.dll"; +#else + options["backend_path"] = "libQnnHtp.so"; +#endif + options["htp_performance_mode"] = "burst"; + + auto qnn_ep = QnnExecutionProviderWithOptions(options, &session_opts); + EXPECT_TRUE(session_obj.RegisterExecutionProvider(std::move(qnn_ep)).IsOK()); + + auto status = session_obj.Load(model->model_data.data(), static_cast(model->model_data.size())); + ASSERT_TRUE(status.IsOK()); + status = session_obj.Initialize(); + ASSERT_TRUE(status.IsOK()); + + std::vector threads; + constexpr int num_threads = 5; + constexpr int loop_count = 10; + + std::vector perf_modes{ + "burst", "balanced", "default", "high_performance", "high_power_saver", + "low_balanced", "extreme_power_saver", "low_power_saver", "power_saver"}; + + size_t post_i = perf_modes.size() - 1; + ASSERT_TRUE(post_i > num_threads); + for (int i = 0; i < num_threads; ++i, --post_i) { + RunOptions run_opts; + run_opts.run_tag = session_opts.session_logid; + auto rt = run_opts.config_options.AddConfigEntry(kOrtRunOptionsConfigQnnPerfMode, perf_modes[i].c_str()); + ASSERT_TRUE(rt.IsOK()); + rt = run_opts.config_options.AddConfigEntry(kOrtRunOptionsConfigQnnPerfModePostRun, perf_modes[post_i].c_str()); + ASSERT_TRUE(rt.IsOK()); + + threads.push_back(std::thread(RunSessionAndVerify, std::ref(session_obj), run_opts, + model->builder.feeds_, model->builder.output_names_, + output_shapes, output_values, loop_count)); } for (auto& th : threads) { From 29b1106033e291947debb49c3fd03feb479c4b1b Mon Sep 17 00:00:00 2001 From: Segev Finer Date: Fri, 23 Feb 2024 04:53:50 +0200 Subject: [PATCH 35/68] [node] Switch to setImmediate to avoid starving the Node.js event loop (#19610) ### Description Switch to setImmediate to avoid starving the Node.js event loop There should really be a true async version though, running computationally intensive things on the event loop will stop everything else from happening while it is running, e.g. a web server from answering requests. This can be done by wrapping `RunAsync` behind a [`napi::Promise`](https://github.com/nodejs/node-addon-api/blob/main/doc/promises.md) to run on the onnxruntime thread pool or [`AsyncWorker`]( https://github.com/nodejs/node-addon-api/blob/main/doc/async_worker.md) for the Node.js/libuv thread pool. ### Motivation and Context Without this, if you run inference in a tight loop, without anything else in between that is async/deferred, `process.nextTick` will lead to starving the event loop and not letting anything else run, `setImmediate` at least lets the event loop spin between calls to `run`. See https://dev.to/ynmanware/setimmediate-settimeout-and-process-nexttick-3mfd Contributed on behalf of [Swimm](https://swimm.io/) --- js/node/lib/backend.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/js/node/lib/backend.ts b/js/node/lib/backend.ts index e8eb0e9babf5a..927953b4f1dd6 100644 --- a/js/node/lib/backend.ts +++ b/js/node/lib/backend.ts @@ -36,7 +36,7 @@ class OnnxruntimeSessionHandler implements InferenceSessionHandler { async run(feeds: SessionHandler.FeedsType, fetches: SessionHandler.FetchesType, options: InferenceSession.RunOptions): Promise { return new Promise((resolve, reject) => { - process.nextTick(() => { + setImmediate(() => { try { resolve(this.#inferenceSession.run(feeds, fetches, options)); } catch (e) { @@ -56,7 +56,7 @@ class OnnxruntimeBackend implements Backend { async createInferenceSessionHandler(pathOrBuffer: string|Uint8Array, options?: InferenceSession.SessionOptions): Promise { return new Promise((resolve, reject) => { - process.nextTick(() => { + setImmediate(() => { try { resolve(new OnnxruntimeSessionHandler(pathOrBuffer, options || {})); } catch (e) { From ae92d593c0e2b06decbea64797f9145bc10f34af Mon Sep 17 00:00:00 2001 From: pengwa Date: Fri, 23 Feb 2024 11:05:16 +0800 Subject: [PATCH 36/68] ONNX Gelu Op in Opset 20 (#19560) ### ONNX Gelu Op in Opset 20 Refactor code to support MSDomain Gelu and ONNX Gelu-opset20 Op 1. Move CPU-GELU implmentation from `onnxruntime/contrib_ops/cpu/activations.h/cc` to `onnxruntime/core/providers/cpu/tensor/gelu.h/cc`, as the implementation for approximate attribute to be 'none'. 2. Dumplicate some logic from `onnxruntime/contrib_ops/cpu/bert/bias_gelu.cc` to `onnxruntime/core/providers/cpu/tensor/gelu.h/cc`, as the implementation for approximate attribute to be 'tanh'. 3. Register ONNX domain Gelu CPU kernel from opset 20 in `onnxruntime/core/providers/cpu/cpu_execution_provider.cc`. 4. Move `onnxruntime/contrib_ops/cuda/bert/fast_gelu_impl.h/cu` to `onnxruntime/core/providers/cuda/tensor/gelu_impl.h` and `onnxruntime/core/providers/cuda/tensor/gelu_approximate_impl.cu` respectively, as the implementation for approximate attribute to be 'tanh'. 5. Implement the logic for approximate attribute to be 'none' in `onnxruntime/core/providers/cuda/tensor/gelu_impl.cu`. 6. Register ONNX domain Gelu CUDA kernel from opset 20 in `onnxruntime/core/providers/cuda/cuda_execution_provider.cc`. 7. ROCM ep related changes. 8. Enrich the tests for ONNX domain Gelu in `onnxruntime/test/providers/cpu/activation/activation_op_test.cc`. --- cmake/onnxruntime_rocm_hipify.cmake | 4 - .../InferenceTest.netcore.cs | 2 +- docs/OperatorKernels.md | 2 + .../core/providers/cuda/cuda_resource.h | 2 +- onnxruntime/contrib_ops/cpu/activations.cc | 10 +- onnxruntime/contrib_ops/cpu/activations.h | 41 ------- .../cuda/activation/activations.cc | 1 - .../contrib_ops/cuda/activation/activations.h | 11 -- .../cuda/activation/activations_impl.cu | 14 --- .../cuda/activation/activations_impl.h | 2 - .../contrib_ops/cuda/bert/fast_gelu.cc | 20 +++- onnxruntime/contrib_ops/cuda/bert/fast_gelu.h | 2 +- .../contrib_ops/rocm/bert/fast_gelu.cc | 59 ---------- onnxruntime/contrib_ops/rocm/bert/fast_gelu.h | 24 ---- .../providers/cpu/cpu_execution_provider.cc | 2 + onnxruntime/core/providers/cpu/tensor/gelu.cc | 108 ++++++++++++++++++ onnxruntime/core/providers/cpu/tensor/gelu.h | 18 +++ .../providers/cuda/cuda_execution_provider.cc | 10 ++ .../core/providers/cuda/tensor/gelu.cc | 89 +++++++++++++++ onnxruntime/core/providers/cuda/tensor/gelu.h | 28 +++++ .../cuda/tensor/gelu_approximate_impl.cu} | 17 ++- .../core/providers/cuda/tensor/gelu_impl.cu | 48 ++++++++ .../providers/cuda/tensor/gelu_impl.h} | 7 +- .../test/contrib_ops/activation_op_test.cc | 13 ++- .../test/onnx/microbenchmark/activation.cc | 3 +- .../cpu/activation/activation_op_test.cc | 48 ++++++-- .../cpu/activation/activation_op_test.h | 7 +- 27 files changed, 395 insertions(+), 197 deletions(-) delete mode 100644 onnxruntime/contrib_ops/rocm/bert/fast_gelu.cc delete mode 100644 onnxruntime/contrib_ops/rocm/bert/fast_gelu.h create mode 100644 onnxruntime/core/providers/cpu/tensor/gelu.cc create mode 100644 onnxruntime/core/providers/cpu/tensor/gelu.h create mode 100644 onnxruntime/core/providers/cuda/tensor/gelu.cc create mode 100644 onnxruntime/core/providers/cuda/tensor/gelu.h rename onnxruntime/{contrib_ops/cuda/bert/fast_gelu_impl.cu => core/providers/cuda/tensor/gelu_approximate_impl.cu} (88%) create mode 100644 onnxruntime/core/providers/cuda/tensor/gelu_impl.cu rename onnxruntime/{contrib_ops/cuda/bert/fast_gelu_impl.h => core/providers/cuda/tensor/gelu_impl.h} (80%) diff --git a/cmake/onnxruntime_rocm_hipify.cmake b/cmake/onnxruntime_rocm_hipify.cmake index 85a9bf50460d3..1bb70e9c2ed27 100644 --- a/cmake/onnxruntime_rocm_hipify.cmake +++ b/cmake/onnxruntime_rocm_hipify.cmake @@ -20,10 +20,6 @@ set(contrib_ops_excluded_files "bert/fastertransformer_decoder_attention/*" "bert/multihead_attention.cc" "bert/multihead_attention.h" - "bert/fast_gelu_impl.cu" - "bert/fast_gelu_impl.h" - "bert/fast_gelu.cc" - "bert/fast_gelu.h" "bert/relative_attn_bias.cc" "bert/relative_attn_bias.h" "bert/relative_attn_bias_impl.cu" diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs index 715aed7e1d64f..7f3d5d6624b07 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs @@ -145,7 +145,7 @@ private void TestCUDAProviderOptions() private void CanRunInferenceOnAModelWithTensorRT() { string modelPath = Path.Combine(Directory.GetCurrentDirectory(), "squeezenet.onnx"); - + int deviceId = 0; string deviceIdStr = System.Environment.GetEnvironmentVariable("ONNXRUNTIME_TEST_GPU_DEVICE_ID"); if (!string.IsNullOrEmpty(deviceIdStr) && int.TryParse(deviceIdStr, out int parsedValue) && parsedValue >= 0) diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index 8ff2135c6b1f6..46149c577a106 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -127,6 +127,7 @@ Do not modify directly.* |GatherND|*in* data:**T**
*in* indices:**tensor(int64)**
*out* output:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**indices** = tensor(int64)| |||12|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**indices** = tensor(int64)| |||11|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**indices** = tensor(int64)| +|Gelu|*in* X:**T**
*out* Y:**T**|20+|**T** = tensor(float)| |Gemm|*in* A:**T**
*in* B:**T**
*in* C:**T**
*out* Y:**T**|13+|**T** = tensor(double), tensor(float)| |||[11, 12]|**T** = tensor(double), tensor(float)| |||[9, 10]|**T** = tensor(double), tensor(float)| @@ -606,6 +607,7 @@ Do not modify directly.* |GatherND|*in* data:**T**
*in* indices:**tensor(int64)**
*out* output:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int64)
**indices** = tensor(int64)| |||12|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int64)
**indices** = tensor(int64)| |||11|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int64)
**indices** = tensor(int64)| +|Gelu|*in* X:**T**
*out* Y:**T**|20+|**T** = tensor(double), tensor(float), tensor(float16)| |Gemm|*in* A:**T**
*in* B:**T**
*in* C:**T**
*out* Y:**T**|13+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)| |||[11, 12]|**T** = tensor(double), tensor(float), tensor(float16)| |||[9, 10]|**T** = tensor(double), tensor(float), tensor(float16)| diff --git a/include/onnxruntime/core/providers/cuda/cuda_resource.h b/include/onnxruntime/core/providers/cuda/cuda_resource.h index 1fef077860be3..00e7dec5727d1 100644 --- a/include/onnxruntime/core/providers/cuda/cuda_resource.h +++ b/include/onnxruntime/core/providers/cuda/cuda_resource.h @@ -19,4 +19,4 @@ enum CudaResource : int { enable_skip_layer_norm_strict_mode_t, prefer_nhwc_t, use_tf32_t, -}; \ No newline at end of file +}; diff --git a/onnxruntime/contrib_ops/cpu/activations.cc b/onnxruntime/contrib_ops/cpu/activations.cc index 556699192d2eb..3e0533dd8b9e5 100644 --- a/onnxruntime/contrib_ops/cpu/activations.cc +++ b/onnxruntime/contrib_ops/cpu/activations.cc @@ -2,7 +2,7 @@ // Licensed under the MIT License. #include "core/providers/cpu/activation/activations.h" -#include "activations.h" +#include "contrib_ops/cpu/activations.h" namespace onnxruntime { namespace contrib { @@ -26,14 +26,6 @@ ONNX_CPU_OPERATOR_VERSIONED_KERNEL( KernelDefBuilder().MayInplace(0, 0).TypeConstraint("T", DataTypeImpl::GetTensorType()), ThresholdedRelu); -ONNX_OPERATOR_KERNEL_EX( - Gelu, - kMSDomain, - 1, - kCpuExecutionProvider, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Gelu); - ONNX_OPERATOR_KERNEL_EX( QuickGelu, kMSDomain, diff --git a/onnxruntime/contrib_ops/cpu/activations.h b/onnxruntime/contrib_ops/cpu/activations.h index aed4c2229215d..7e64235d3fc3d 100644 --- a/onnxruntime/contrib_ops/cpu/activations.h +++ b/onnxruntime/contrib_ops/cpu/activations.h @@ -54,47 +54,6 @@ namespace contrib { DEFINE_ELE_KERNEL(ScaledTanh); DEFINE_ELE_KERNEL(ParametricSoftplus); -template -class Gelu : public OpKernel { - public: - Gelu(const OpKernelInfo& info) : OpKernel(info) { - } - - Status Compute(OpKernelContext* context) const override { - const Tensor* input = context->Input(0); - const T* input_data = input->Data(); - - Tensor* output = context->Output(0, input->Shape()); - T* output_data = output->MutableData(); - - concurrency::ThreadPool* tp = context->GetOperatorThreadPool(); - int64_t elem_count = input->Shape().Size(); - constexpr int64_t length_per_task = 4096; // this number comes from FastGelu. - int64_t task_count = (elem_count + length_per_task - 1) / length_per_task; - concurrency::ThreadPool::TryBatchParallelFor( - tp, static_cast(task_count), - [&](ptrdiff_t task_idx) { - const auto start = task_idx * length_per_task; - const T* p_input = input_data + start; - T* p_output = output_data + start; - int64_t count = std::min(length_per_task, elem_count - start); - - for (int64_t i = 0; i < count; i++) { - T value = p_input[i]; - p_output[i] = value * static_cast(M_SQRT1_2); - } - - MlasComputeErf(p_output, p_output, narrow(count)); - - for (int64_t i = 0; i < count; i++) { - p_output[i] = 0.5f * p_input[i] * (p_output[i] + 1.0f); - } - }, - 0); - return Status::OK(); - } -}; - // Implement a new one instead of inheriting from ElementWiseRangedTransform so that we can call // MlasComputeLogistic instead of using Eigen for better perf. template diff --git a/onnxruntime/contrib_ops/cuda/activation/activations.cc b/onnxruntime/contrib_ops/cuda/activation/activations.cc index 1a86c5dbece5a..6303858b9bd48 100644 --- a/onnxruntime/contrib_ops/cuda/activation/activations.cc +++ b/onnxruntime/contrib_ops/cuda/activation/activations.cc @@ -49,7 +49,6 @@ namespace cuda { UNARY_ACTIVATION_OP_HFD(Affine, 1, kOnnxDomain); UNARY_ACTIVATION_OP_HFD(ParametricSoftplus, 1, kOnnxDomain); UNARY_ACTIVATION_OP_HFD(ScaledTanh, 1, kOnnxDomain); -UNARY_ACTIVATION_OP_HFD(Gelu, 1, kMSDomain); UNARY_ACTIVATION_OP_HFD(QuickGelu, 1, kMSDomain); REGISTER_ACTIVATION_KERNEL(ThresholdedRelu, 1, kOnnxDomain, MLFloat16) diff --git a/onnxruntime/contrib_ops/cuda/activation/activations.h b/onnxruntime/contrib_ops/cuda/activation/activations.h index ab339f276c2bd..fc9a71b0b7fa1 100644 --- a/onnxruntime/contrib_ops/cuda/activation/activations.h +++ b/onnxruntime/contrib_ops/cuda/activation/activations.h @@ -66,17 +66,6 @@ class ScaledTanh final : public UnaryElementwise { float beta_; }; -template -class Gelu final : public UnaryElementwise { - public: - Gelu(const OpKernelInfo& info) : UnaryElementwise(info) {} - - Status ComputeInternal(OpKernelContext* context) const override; - - private: - MAKE_FUNC_CTX_NULL() -}; - template class QuickGelu final : public UnaryElementwise { public: diff --git a/onnxruntime/contrib_ops/cuda/activation/activations_impl.cu b/onnxruntime/contrib_ops/cuda/activation/activations_impl.cu index 0c856815fd437..36f33fbb24c18 100644 --- a/onnxruntime/contrib_ops/cuda/activation/activations_impl.cu +++ b/onnxruntime/contrib_ops/cuda/activation/activations_impl.cu @@ -36,20 +36,6 @@ struct OP_ScaledTanh : public CtxScaledTanh { } }; -template -struct OP_Gelu : public CtxGelu { - __device__ __inline__ T operator()(const T& a) const { - return _Gelu(a); - } -}; - -template <> -struct OP_Gelu : public CtxGelu { - __device__ __inline__ half operator()(const half& a) const { - return static_cast(_Gelu(static_cast(a))); - } -}; - template struct OP_QuickGelu : public CtxQuickGelu { __device__ __inline__ T operator()(const T& a) const { diff --git a/onnxruntime/contrib_ops/cuda/activation/activations_impl.h b/onnxruntime/contrib_ops/cuda/activation/activations_impl.h index 5d18283a395e3..782d4bf59a5ad 100644 --- a/onnxruntime/contrib_ops/cuda/activation/activations_impl.h +++ b/onnxruntime/contrib_ops/cuda/activation/activations_impl.h @@ -11,14 +11,12 @@ namespace cuda { typedef onnxruntime::cuda::CtxAlphaBeta CtxAffine; typedef onnxruntime::cuda::CtxAlphaBeta CtxParametricSoftplus; typedef onnxruntime::cuda::CtxAlphaBeta CtxScaledTanh; -typedef onnxruntime::cuda::CtxNull CtxGelu; typedef onnxruntime::cuda::CtxAlpha CtxQuickGelu; #define UNARY_CONTRIB_ACTIVATION_OPS() \ UNARY_ACTIVATION_OP_NAME(ScaledTanh) \ UNARY_ACTIVATION_OP_NAME(Affine) \ UNARY_ACTIVATION_OP_NAME(ParametricSoftplus) \ - UNARY_ACTIVATION_OP_NAME(Gelu) \ UNARY_ACTIVATION_OP_NAME(QuickGelu) #define UNARY_ACTIVATION_OP_NAME(name) UNARY_ACTIVATION_IMPL_DECLARATION(name); diff --git a/onnxruntime/contrib_ops/cuda/bert/fast_gelu.cc b/onnxruntime/contrib_ops/cuda/bert/fast_gelu.cc index 892f5c181a607..e8974a29476b6 100644 --- a/onnxruntime/contrib_ops/cuda/bert/fast_gelu.cc +++ b/onnxruntime/contrib_ops/cuda/bert/fast_gelu.cc @@ -4,9 +4,14 @@ #include "core/providers/cuda/cuda_common.h" #include "core/providers/cuda/cudnn_common.h" #include "fast_gelu.h" -#include "fast_gelu_impl.h" +#include "core/providers/cuda/tensor/gelu_impl.h" #include "contrib_ops/cpu/bert/bias_gelu_helper.h" -#include "transformer_common.h" +#ifdef USE_ROCM +#include "contrib_ops/rocm/bert/elementwise.h" +#endif +#ifdef USE_CUDA +#include "contrib_ops/cuda/bert/transformer_common.h" +#endif namespace onnxruntime { namespace contrib { @@ -31,8 +36,10 @@ using namespace ONNX_NAMESPACE; template FastGelu::FastGelu(const OpKernelInfo& op_kernel_info) : CudaKernel(op_kernel_info) { +#ifdef USE_CUDA const TransformerOptions* options = TransformerOptions::GetInstance(); use_half2_ = !options->DisableHalf2(); +#endif } template @@ -50,6 +57,14 @@ Status FastGelu::ComputeInternal(OpKernelContext* context) const { int64_t bias_length = (nullptr == bias) ? 0 : bias->Shape().Size(); typedef typename ToCudaType::MappedType CudaT; +#ifdef USE_ROCM + return LaunchElementwiseKernel( + GetTuningContext(), context->GetComputeStream(), + reinterpret_cast(input->Data()), static_cast(input_length), + (nullptr != bias) ? reinterpret_cast(bias->Data()) : nullptr, static_cast(bias_length), + reinterpret_cast(output->MutableData())); +#endif +#ifdef USE_CUDA return LaunchFastGeluKernel(GetDeviceProp(), Stream(context), static_cast(input_length), @@ -58,6 +73,7 @@ Status FastGelu::ComputeInternal(OpKernelContext* context) const { (nullptr != bias) ? reinterpret_cast(bias->Data()) : nullptr, reinterpret_cast(output->MutableData()), use_half2_); +#endif } } // namespace cuda diff --git a/onnxruntime/contrib_ops/cuda/bert/fast_gelu.h b/onnxruntime/contrib_ops/cuda/bert/fast_gelu.h index 3e642a70afef5..d563556593e6e 100644 --- a/onnxruntime/contrib_ops/cuda/bert/fast_gelu.h +++ b/onnxruntime/contrib_ops/cuda/bert/fast_gelu.h @@ -18,7 +18,7 @@ class FastGelu final : public CudaKernel { Status ComputeInternal(OpKernelContext* ctx) const override; private: - bool use_half2_; + bool use_half2_; // Only applicable to CUDA kernel (not ROCM). }; } // namespace cuda diff --git a/onnxruntime/contrib_ops/rocm/bert/fast_gelu.cc b/onnxruntime/contrib_ops/rocm/bert/fast_gelu.cc deleted file mode 100644 index 9cb414e4e8980..0000000000000 --- a/onnxruntime/contrib_ops/rocm/bert/fast_gelu.cc +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "contrib_ops/rocm/bert/fast_gelu.h" - -#include "core/providers/rocm/rocm_common.h" -#include "core/providers/rocm/miopen_common.h" -#include "contrib_ops/cpu/bert/bias_gelu_helper.h" -#include "contrib_ops/rocm/bert/elementwise.h" -#include "contrib_ops/rocm/bert/transformer_common.h" - -namespace onnxruntime { -namespace contrib { -namespace rocm { - -#define REGISTER_KERNEL_TYPED(T) \ - ONNX_OPERATOR_TYPED_KERNEL_EX( \ - FastGelu, \ - kMSDomain, \ - 1, \ - T, \ - kRocmExecutionProvider, \ - (*KernelDefBuilder::Create()) \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - FastGelu); - -REGISTER_KERNEL_TYPED(float) -REGISTER_KERNEL_TYPED(MLFloat16) -REGISTER_KERNEL_TYPED(BFloat16) - -using namespace ONNX_NAMESPACE; - -template -Status FastGelu::ComputeInternal(OpKernelContext* context) const { - ORT_RETURN_IF_ERROR(bias_gelu_helper::CheckInputs(context)); - - const Tensor* input = context->Input(0); - const Tensor* bias = context->Input(1); - Tensor* output = context->Output(0, input->Shape()); - - int64_t input_length = input->Shape().Size(); - if (input_length == 0) { - return Status::OK(); - } - int64_t bias_length = (nullptr == bias) ? 0 : bias->Shape().Size(); - typedef typename ToHipType::MappedType HipT; - - const HipT* input_buffer = reinterpret_cast(input->Data()); - const HipT* bias_buffer = (nullptr != bias) ? reinterpret_cast(bias->Data()) : nullptr; - return LaunchElementwiseKernel( - GetTuningContext(), context->GetComputeStream(), - input_buffer, static_cast(input_length), - bias_buffer, static_cast(bias_length), - reinterpret_cast(output->MutableData())); -} - -} // namespace rocm -} // namespace contrib -} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/rocm/bert/fast_gelu.h b/onnxruntime/contrib_ops/rocm/bert/fast_gelu.h deleted file mode 100644 index 42bfe5a0b0246..0000000000000 --- a/onnxruntime/contrib_ops/rocm/bert/fast_gelu.h +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once - -#include "core/common/common.h" -#include "core/providers/rocm/rocm_kernel.h" - -namespace onnxruntime { -namespace contrib { -namespace rocm { - -using namespace onnxruntime::rocm; - -template -class FastGelu final : public RocmKernel { - public: - FastGelu(const OpKernelInfo& op_kernel_info) : RocmKernel(op_kernel_info) {} - Status ComputeInternal(OpKernelContext* ctx) const override; -}; - -} // namespace rocm -} // namespace contrib -} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc index 813fdc54ecd0d..48e4617b33b4d 100644 --- a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc +++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc @@ -1035,6 +1035,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, float, IsNaN); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, double, IsNaN); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, MLFloat16, IsNaN); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, Gelu); #if !defined(DISABLE_FLOAT8_TYPES) class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, Float8E4M3FN, IsNaN); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, Float8E4M3FNUZ, IsNaN); @@ -2562,6 +2563,7 @@ Status RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, #if !defined(DISABLE_FLOAT8_TYPES) BuildKernelCreateInfo, diff --git a/onnxruntime/core/providers/cpu/tensor/gelu.cc b/onnxruntime/core/providers/cpu/tensor/gelu.cc new file mode 100644 index 0000000000000..d55973eda180f --- /dev/null +++ b/onnxruntime/core/providers/cpu/tensor/gelu.cc @@ -0,0 +1,108 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/common/common.h" +#include "core/common/narrow.h" +#include "core/framework/op_kernel.h" +#include "core/util/math_cpuonly.h" +#include "core/mlas/inc/mlas.h" + +#include "core/platform/threadpool.h" +#include +#include "core/providers/cpu/element_wise_ranged_transform.h" +#include "core/providers/cpu/tensor/gelu.h" + +using onnxruntime::narrow; +using namespace onnxruntime::common; + +namespace onnxruntime { + +// May revisit the implementations to support inplace computation, if needed. + +ONNX_CPU_OPERATOR_KERNEL( + Gelu, + 20, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + Gelu); + +#ifndef DISABLE_CONTRIB_OPS +namespace contrib { +ONNX_OPERATOR_KERNEL_EX( + Gelu, + kMSDomain, + 1, + kCpuExecutionProvider, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + Gelu); +} +#endif + +template +Status Gelu::Compute(OpKernelContext* context) const { + const Tensor* input = context->Input(0); + const T* input_data = input->Data(); + + Tensor* output = context->Output(0, input->Shape()); + T* output_data = output->MutableData(); + + concurrency::ThreadPool* tp = context->GetOperatorThreadPool(); + int64_t elem_count = input->Shape().Size(); + constexpr int64_t length_per_task = 4096; // this number comes from FastGelu. + int64_t task_count = (elem_count + length_per_task - 1) / length_per_task; + + if (approximation_algorithm_ == "tanh") { + // FastGelu allows optional bias. Here we split input data into chunks. Each chunk + // has N elements (except the last chunk), and use thread pool to parallel chunks. + // N = 4096 is selected based on performance test results on input shape 1x128x768. + // FastGelu uses approximation for Gelu. The formula is 0.5 * (1 + Tanh(x * (C * x * x + B))) * x. + static constexpr float B = 0.7978845608028654f; // sqrt(2.0 / M_PI) + static constexpr float C = 0.035677408136300125f; // 0.044715 * sqrt(2.0 / M_PI) + + concurrency::ThreadPool::TryBatchParallelFor( + tp, static_cast(task_count), + [&](ptrdiff_t task_idx) { + const auto start = task_idx * length_per_task; + const T* p_input = input_data + start; + T* p_output = output_data + start; + int64_t count = std::min(length_per_task, elem_count - start); + + for (int64_t i = 0; i < count; i++) { + T value = p_input[i]; + p_output[i] = value * (static_cast(C) * value * value + static_cast(B)); + } + + MlasComputeTanh(p_output, p_output, narrow(count)); + + for (int64_t i = 0; i < count; i++) { + p_output[i] = 0.5f * p_input[i] * (p_output[i] + 1.0f); + } + }, + 0); + return Status::OK(); + } else if (approximation_algorithm_ == "none") { + concurrency::ThreadPool::TryBatchParallelFor( + tp, static_cast(task_count), + [&](ptrdiff_t task_idx) { + const auto start = task_idx * length_per_task; + const T* p_input = input_data + start; + T* p_output = output_data + start; + int64_t count = std::min(length_per_task, elem_count - start); + + for (int64_t i = 0; i < count; i++) { + T value = p_input[i]; + p_output[i] = value * static_cast(M_SQRT1_2); + } + + MlasComputeErf(p_output, p_output, narrow(count)); + + for (int64_t i = 0; i < count; i++) { + p_output[i] = 0.5f * p_input[i] * (p_output[i] + 1.0f); + } + }, + 0); + return Status::OK(); + } + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unsupported approximation_algorithm: ", approximation_algorithm_); +} + +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/tensor/gelu.h b/onnxruntime/core/providers/cpu/tensor/gelu.h new file mode 100644 index 0000000000000..13238028d878a --- /dev/null +++ b/onnxruntime/core/providers/cpu/tensor/gelu.h @@ -0,0 +1,18 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +namespace onnxruntime { + +template +class Gelu final : public OpKernel { + public: + explicit Gelu(const OpKernelInfo& info) : OpKernel(info) { + approximation_algorithm_ = info.GetAttrOrDefault("approximate", "none"); + } + Status Compute(OpKernelContext* ctx) const override; + + private: + std::string approximation_algorithm_; +}; + +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 0dd568c5ecc05..be2530aec49fa 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -1329,6 +1329,11 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, S class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Shape); #endif +// Opset 20 +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, float, Gelu); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, double, Gelu); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, MLFloat16, Gelu); + template <> KernelCreateInfo BuildKernelCreateInfo() { return {}; @@ -2222,6 +2227,11 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + + // Opset 20 + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif }; diff --git a/onnxruntime/core/providers/cuda/tensor/gelu.cc b/onnxruntime/core/providers/cuda/tensor/gelu.cc new file mode 100644 index 0000000000000..67b2fad373a7f --- /dev/null +++ b/onnxruntime/core/providers/cuda/tensor/gelu.cc @@ -0,0 +1,89 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/cuda/cuda_common.h" +#include "core/providers/cuda/cudnn_common.h" +#include "core/providers/cuda/tensor/gelu.h" +#include "core/providers/cuda/tensor/gelu_impl.h" + +namespace onnxruntime { +namespace cuda { + +#define REGISTER_KERNEL_TYPED(T) \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + Gelu, \ + kOnnxDomain, \ + 20, \ + T, \ + kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ + .MayInplace(0, 0), \ + Gelu); + +REGISTER_KERNEL_TYPED(float) +REGISTER_KERNEL_TYPED(MLFloat16) +REGISTER_KERNEL_TYPED(double) + +template +Status Gelu::ComputeInternal(OpKernelContext* context) const { + const Tensor* input = context->Input(0); + const auto& input_dims = input->Shape().GetDims(); + if (input_dims.size() < 1) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input 0 is expected to have 1 or more dimensions, got ", input_dims.size()); + } + + Tensor* output = context->Output(0, input->Shape()); + + int64_t input_length = input->Shape().Size(); + if (input_length == 0) { + return Status::OK(); + } + + typedef typename ToCudaType::MappedType CudaT; + + if (approximation_algorithm_ == "tanh") { + return LaunchFastGeluKernel(GetDeviceProp(), + Stream(context), + static_cast(input_length), + 0 /* no bias */, + reinterpret_cast(input->Data()), + nullptr /* no bias */, + reinterpret_cast(output->MutableData()), + use_half2_); + } else if (approximation_algorithm_ == "none") { + return LaunchGeluKernel(Stream(context), + reinterpret_cast(input->Data()), + reinterpret_cast(output->MutableData()), + static_cast(input_length)); + } + + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unsupported approximation_algorithm: ", approximation_algorithm_); +} + +} // namespace cuda + +#ifndef DISABLE_CONTRIB_OPS +namespace contrib::cuda { +#define REGISTER_CONTRIB_KERNEL_TYPED(T) \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + Gelu, \ + kMSDomain, \ + 1, \ + T, \ + kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ + .MayInplace(0, 0), \ + onnxruntime::cuda::Gelu); + +REGISTER_CONTRIB_KERNEL_TYPED(float) +REGISTER_CONTRIB_KERNEL_TYPED(MLFloat16) +REGISTER_CONTRIB_KERNEL_TYPED(double) + +#undef REGISTER_CONTRIB_KERNEL_TYPED +} // namespace contrib::cuda +#endif + +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/gelu.h b/onnxruntime/core/providers/cuda/tensor/gelu.h new file mode 100644 index 0000000000000..1c8189ab24121 --- /dev/null +++ b/onnxruntime/core/providers/cuda/tensor/gelu.h @@ -0,0 +1,28 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include "core/common/common.h" +#include "core/providers/cuda/cuda_kernel.h" +#include "core/providers/cuda/math/unary_elementwise_ops.h" + +namespace onnxruntime { +namespace cuda { + +template +class Gelu final : public UnaryElementwise { + public: + Gelu(const OpKernelInfo& info) : UnaryElementwise(info) { + approximation_algorithm_ = info.GetAttrOrDefault("approximate", "none"); + } + + Status ComputeInternal(OpKernelContext* ctx) const override; + + private: + const bool use_half2_{true}; + + std::string approximation_algorithm_; +}; + +} // namespace cuda +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/bert/fast_gelu_impl.cu b/onnxruntime/core/providers/cuda/tensor/gelu_approximate_impl.cu similarity index 88% rename from onnxruntime/contrib_ops/cuda/bert/fast_gelu_impl.cu rename to onnxruntime/core/providers/cuda/tensor/gelu_approximate_impl.cu index c9498eb1bcd7b..3292650584de8 100644 --- a/onnxruntime/contrib_ops/cuda/bert/fast_gelu_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/gelu_approximate_impl.cu @@ -24,12 +24,9 @@ limitations under the License. #include "core/providers/cuda/cuda_common.h" #include "core/providers/cuda/cu_inc/common.cuh" #include "core/providers/cuda/shared_inc/cuda_call.h" -#include "contrib_ops/cuda/bert/fast_gelu_impl.h" - -using namespace onnxruntime::cuda; +#include "core/providers/cuda/tensor/gelu_impl.h" namespace onnxruntime { -namespace contrib { namespace cuda { // constants for approximating the normal cdf @@ -75,6 +72,17 @@ Status LaunchFastGeluKernel(const cudaDeviceProp& prop, cudaStream_t stream, int return CUDA_CALL(cudaGetLastError()); } +template <> +Status LaunchFastGeluKernel(const cudaDeviceProp& prop, cudaStream_t stream, int input_length, int bias_length, + const double* input, const double* bias, double* output, bool /*use_half2*/) { + constexpr int blockSize = 256; + const int gridSize = (input_length + blockSize - 1) / blockSize; + FastGeluKernel<<>>(A, B, C, input_length, bias_length, + input, bias, output); + + return CUDA_CALL(cudaGetLastError()); +} + template <> Status LaunchFastGeluKernel(const cudaDeviceProp& prop, cudaStream_t stream, int input_length, int bias_length, const half* input, const half* bias, half* output, bool use_half2) { @@ -114,5 +122,4 @@ Status LaunchFastGeluKernel(const cudaDeviceProp& prop, cudaStream_t stream, int } } // namespace cuda -} // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/gelu_impl.cu b/onnxruntime/core/providers/cuda/tensor/gelu_impl.cu new file mode 100644 index 0000000000000..3f96da38b37bb --- /dev/null +++ b/onnxruntime/core/providers/cuda/tensor/gelu_impl.cu @@ -0,0 +1,48 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include +#include "core/providers/cuda/tensor/gelu_impl.h" +#include "core/providers/cuda/cu_inc/common.cuh" +#include "core/providers/cuda/cu_inc/unary_elementwise_impl.cuh" + +namespace onnxruntime { +namespace cuda { + +template +struct OP_Gelu { + __device__ __inline__ T operator()(const T& a) const { + return _Gelu(a); + } +}; + +template <> +struct OP_Gelu { + __device__ __inline__ half operator()(const half& a) const { + return static_cast(_Gelu(static_cast(a))); + } +}; + +template +Status LaunchGeluKernel( + cudaStream_t stream, + const T* input_data, + T* output_data, + size_t count) { + UnaryElementWiseImpl(stream, input_data, output_data, OP_Gelu(), count); + + return CUDA_CALL(cudaGetLastError()); +} + +#define SPECIALIZED_GELU_IMPL(T) \ + template Status LaunchGeluKernel(cudaStream_t stream, const T* input_data, T* output_data, \ + size_t count); + +SPECIALIZED_GELU_IMPL(float); +SPECIALIZED_GELU_IMPL(half); +SPECIALIZED_GELU_IMPL(double); + +#undef SPECIALIZED_GELU_IMPL + +} // namespace cuda +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/bert/fast_gelu_impl.h b/onnxruntime/core/providers/cuda/tensor/gelu_impl.h similarity index 80% rename from onnxruntime/contrib_ops/cuda/bert/fast_gelu_impl.h rename to onnxruntime/core/providers/cuda/tensor/gelu_impl.h index ba78310f5dfc2..2ea0d3441fda3 100644 --- a/onnxruntime/contrib_ops/cuda/bert/fast_gelu_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/gelu_impl.h @@ -1,17 +1,18 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. - #pragma once + #include "core/common/common.h" namespace onnxruntime { -namespace contrib { namespace cuda { +template +Status LaunchGeluKernel(cudaStream_t stream, const T* input, T* output, size_t count); + template Status LaunchFastGeluKernel(const cudaDeviceProp& prop, cudaStream_t stream, int input_length, int bias_length, const T* input, const T* bias, T* output, bool use_half2); } // namespace cuda -} // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/test/contrib_ops/activation_op_test.cc b/onnxruntime/test/contrib_ops/activation_op_test.cc index b1e54ec605a39..2a56991ec5af4 100644 --- a/onnxruntime/test/contrib_ops/activation_op_test.cc +++ b/onnxruntime/test/contrib_ops/activation_op_test.cc @@ -22,7 +22,8 @@ namespace test { TEST_F(ActivationOpTest, ThresholdedRelu_version_1_to_9) { float alpha = 0.1f; TestActivationOp( - "ThresholdedRelu", input_values, [alpha](float x) { return (x >= alpha) ? x : 0; }, {{"alpha", alpha}}, true, 1); + "ThresholdedRelu", input_values, [alpha](float x) { return (x >= alpha) ? x : 0; }, {{"alpha", alpha}}, {}, + true, 1); } TEST_F(ActivationOpTest, ScaledTanh) { @@ -46,13 +47,13 @@ TEST_F(ActivationOpTest, ParametricSoftplus) { else return alpha * logf(expf(bx) + 1); }, - {{"alpha", alpha}, {"beta", beta}}, false); // Disable TensorRT due to result mismatch + {{"alpha", alpha}, {"beta", beta}}, {}, false); // Disable TensorRT due to result mismatch } TEST_F(ActivationOpTest, Gelu) { TestActivationOp( "Gelu", input_values, [](float x) { return x * 0.5f * (1.0f + std::erf(x * static_cast(M_SQRT1_2))); }, {}, - false, 1, kMSDomain); + {}, false, 1, kMSDomain); } #if defined(USE_DNNL) @@ -115,7 +116,7 @@ TEST_F(ActivationOpTest, QuickGelu) { y = tmp >= 0 ? y : 1 - y; return x * y; }, - {{"alpha", alpha}}, false, 1, kMSDomain); + {{"alpha", alpha}}, {}, false, 1, kMSDomain); } // Silu = x*sigmoid(x), i.e., alpha = 1.0f. @@ -129,7 +130,7 @@ TEST_F(ActivationOpTest, QuickGelu) { y = tmp >= 0 ? y : 1 - y; return x * y; }, - {{"alpha", alpha}}, false, 1, kMSDomain); + {{"alpha", alpha}}, {}, false, 1, kMSDomain); } // Negative alpha. @@ -143,7 +144,7 @@ TEST_F(ActivationOpTest, QuickGelu) { y = tmp >= 0 ? y : 1 - y; return x * y; }, - {{"alpha", alpha}}, false, 1, kMSDomain); + {{"alpha", alpha}}, {}, false, 1, kMSDomain); } } diff --git a/onnxruntime/test/onnx/microbenchmark/activation.cc b/onnxruntime/test/onnx/microbenchmark/activation.cc index cf859facf4765..69ee72996365e 100644 --- a/onnxruntime/test/onnx/microbenchmark/activation.cc +++ b/onnxruntime/test/onnx/microbenchmark/activation.cc @@ -11,6 +11,7 @@ #include "core/framework/node_index_info.h" #include "core/framework/execution_frame.h" #include "contrib_ops/cpu/activations.h" +#include "core/providers/cpu/tensor/gelu.h" #include "core/providers/cpu/activation/activations.h" #include #include @@ -182,7 +183,7 @@ static void RunSingleNode(const std::string& op_name, const std::string& domain, } static void BM_GeluCompute(benchmark::State& state) { - RunSingleNode>("Gelu", kMSDomain, {}, state); + RunSingleNode>("Gelu", kMSDomain, {}, state); } BENCHMARK(BM_GeluCompute) diff --git a/onnxruntime/test/providers/cpu/activation/activation_op_test.cc b/onnxruntime/test/providers/cpu/activation/activation_op_test.cc index ddb0a6620619c..acd513172f95d 100644 --- a/onnxruntime/test/providers/cpu/activation/activation_op_test.cc +++ b/onnxruntime/test/providers/cpu/activation/activation_op_test.cc @@ -116,13 +116,13 @@ TEST_F(ActivationOpTest, Relu) { "Relu", input_values_double, [](double x) { return std::max(x, 0.0); }, - {}, + {}, {}, /*is_tensorrt_supported=*/false); TestActivationOp( "Relu", input_values_int8, [](int8_t x) { return std::max(x, static_cast(0)); }, - {}, + {}, {}, /*is_tensorrt_supported=*/false, /*opset_version= */ 14); #ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED @@ -133,7 +133,7 @@ TEST_F(ActivationOpTest, Relu) { if (x.ToFloat() > 0.0f) return x; return MLFloat16(); }, - {}, + {}, {}, /*is_tensorrt_supported=*/false, /*opset_version= */ 11); #endif // MLAS_F16VEC_INTRINSICS_SUPPORTED @@ -402,7 +402,7 @@ TEST_F(ActivationOpTest, Celu) { // TODO: Investigate why gcc 4 fails to compile without the explicit cast [alpha](float x) { return std::max(0.0f, x) + std::min(0.0f, alpha * (static_cast(exp(x / alpha)) - 1)); }, // Disable on TensorRT as it seems like it doesn't yet support Celu - {{"alpha", alpha}}, false, 12); + {{"alpha", alpha}}, {}, false, 12); } TEST_F(ActivationOpTest, LeakyRelu) { @@ -410,7 +410,7 @@ TEST_F(ActivationOpTest, LeakyRelu) { TestActivationOp("LeakyRelu", input_values, [alpha](float x) { return (x >= 0) ? x : alpha * x; }, - {{"alpha", alpha}}); + {{"alpha", alpha}}, {}); } #ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED @@ -442,7 +442,7 @@ TEST_F(ActivationOpTest, ThresholdedRelu) { "ThresholdedRelu", input_values, [alpha](float x) { return (x >= alpha) ? x : 0; }, - {{"alpha", alpha}}, true, 10); + {{"alpha", alpha}}, {}, true, 10); } TEST_F(ActivationOpTest, Selu) { @@ -452,7 +452,7 @@ TEST_F(ActivationOpTest, Selu) { TestActivationOp("Selu", input_values, [](float x) { return x <= 0 ? gamma * (alpha * exp(x) - alpha) : gamma * x; }, - {{"alpha", alpha}, {"gamma", gamma}}); + {{"alpha", alpha}, {"gamma", gamma}}, {}); } TEST_F(ActivationOpTest, Selu_Attributes) { @@ -462,7 +462,7 @@ TEST_F(ActivationOpTest, Selu_Attributes) { TestActivationOp("Selu", input_values, [](float x) { return x <= 0 ? gamma * (alpha * exp(x) - alpha) : gamma * x; }, - {{"alpha", alpha}, {"gamma", gamma}}); + {{"alpha", alpha}, {"gamma", gamma}}, {}); } TEST_F(ActivationOpTest, Selu_GH10726) { @@ -472,7 +472,7 @@ TEST_F(ActivationOpTest, Selu_GH10726) { TestActivationOp("Selu", {{1.f, -1.f}}, [](float x) { return x <= 0 ? gamma * (alpha * exp(x) - alpha) : gamma * x; }, - {{"alpha", alpha}, {"gamma", gamma}}); + {{"alpha", alpha}, {"gamma", gamma}}, {}); } TEST_F(ActivationOpTest, PRelu) { @@ -625,7 +625,7 @@ TEST_F(ActivationOpNoInfTest, Softsign) { return result; }, - {}, false); // Disable TensorRT because result mismatches + {}, {}, false); // Disable TensorRT because result mismatches } #if defined(ENABLE_TRAINING_OPS) @@ -695,5 +695,33 @@ TEST(LeakyReluGradInferenceTest, Basic) { } #endif +// Remove DNNL from running this test because DNNL Gelu op seems not check domain for kernel implementation. +// It will run the DNNL Gelu op which only be part of standard of Gelu-20 op. +#if !defined(USE_DNNL) && !defined(USE_QNN) +TEST_F(ActivationOpTest, ONNX_Gelu) { + TestActivationOp( + "Gelu", + input_values, + [](float x) { return 0.5 * x * (1 + erf(x * M_SQRT1_2)); }, {}, + {{"approximate", "none"}}, true, 20); + + TestActivationOp( + "Gelu", + input_values, + [](float x) { return 0.5 * x * (1 + erf(x * M_SQRT1_2)); }, + {}, + {/*default value of approximate attribute is none */}, true, 20); + + TestActivationOp( + "Gelu", + input_values, + [](float x) { + return 0.5 * x * (1 + tanh(sqrt(2 / M_PI) * (x + 0.044715 * x * x * x))); + }, + {}, + {{"approximate", "tanh"}}, true, 20); +} +#endif + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/activation/activation_op_test.h b/onnxruntime/test/providers/cpu/activation/activation_op_test.h index b5ec1402584fb..984b8f4437a3b 100644 --- a/onnxruntime/test/providers/cpu/activation/activation_op_test.h +++ b/onnxruntime/test/providers/cpu/activation/activation_op_test.h @@ -17,13 +17,16 @@ namespace test { template inline void TestActivationOp(const char* szOp, const std::vector>& input_vals_vec, std::function expected_func, - const std::unordered_map attribs = {}, + const std::unordered_map float_attribs = {}, + const std::unordered_map string_attribs = {}, bool is_tensorrt_supported = true, int opset_version = 7, const char* domain = kOnnxDomain) { for (const std::vector& input_vals : input_vals_vec) { OpTester test(szOp, opset_version, domain); - for (auto attr : attribs) test.AddAttribute(attr.first, attr.second); + for (auto attr : float_attribs) test.AddAttribute(attr.first, attr.second); + for (auto attr : string_attribs) test.AddAttribute(attr.first, attr.second); + std::vector dims{(int64_t)input_vals.size()}; std::vector expected_vals; From 5e432a3ae69dbbed603420493c52ba48b3726471 Mon Sep 17 00:00:00 2001 From: Markus Tavenrath Date: Fri, 23 Feb 2024 04:47:15 +0100 Subject: [PATCH 37/68] Add support for NHWC GridSample in the CUDA EP and enable grid_sample_test for all EPs (#19562) I've added NHWC GridSample support to the CUDA EP to reduce the number of layout transforms. Also I've enabled the full set of GridSampleTests for all EPs. I've also added the GridSample OpSet 16 to the registered kernels. ### Motivation and Context This is the first PR is a series of enhancements of the CUDA EP improving NHWC support to avoid costly layout transforms between NWHC and NCHW nodes which are layout sensitive. Also testing was quite rudimentary for the CUDA EP while it was great for the CPU path. I've regenerated grid_sample_test.cc enabling tests for other platforms as well. Those tests resurfaced #10607 again which is fixed as well. --- docs/OperatorKernels.md | 1 + .../contrib_ops/cuda/cuda_contrib_kernels.cc | 7 + onnxruntime/contrib_ops/cuda/grid_sample.cc | 35 ++-- onnxruntime/contrib_ops/cuda/grid_sample.h | 2 +- .../contrib_ops/cuda/grid_sample_impl.cu | 101 ++++++---- .../contrib_ops/cuda/grid_sample_impl.h | 2 +- .../layout_transformation.cc | 2 + .../providers/cuda/cuda_execution_provider.cc | 2 + .../providers/cuda/shared_inc/cuda_utils.h | 26 +++ .../providers/cpu/tensor/grid_sample_test.cc | 172 ++++++++---------- .../cpu/tensor/grid_sample_test_gen.py | 2 +- onnxruntime/test/util/default_providers.cc | 16 ++ .../test/util/include/default_providers.h | 3 + 13 files changed, 223 insertions(+), 148 deletions(-) diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index 46149c577a106..b0ed68d595c42 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -619,6 +619,7 @@ Do not modify directly.* |||[7, 8]|**T** = tensor(double), tensor(float), tensor(float16)| |GreaterOrEqual|*in* A:**T**
*in* B:**T**
*out* C:**T1**|16+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)
**T1** = tensor(bool)| |||[12, 15]|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)
**T1** = tensor(bool)| +|GridSample|*in* X:**T1**
*in* grid:**T2**
*out* Y:**T1**|16+|**T1** = tensor(float)
**T2** = tensor(float)| |HardSigmoid|*in* X:**T**
*out* Y:**T**|6+|**T** = tensor(double), tensor(float), tensor(float16)| |Identity|*in* input:**T**
*out* output:**T**

or

*in* input:**V**
*out* output:**V**|19+|**V** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(float8e4m3fn)), seq(tensor(float8e4m3fnuz)), seq(tensor(float8e5m2)), seq(tensor(float8e5m2fnuz)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[14, 18]|**V** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| diff --git a/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc b/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc index be8c0dc86c135..57e951d3a68ff 100644 --- a/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc @@ -203,6 +203,10 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, DistributedSqueeze); #endif +#ifdef ENABLE_CUDA_NHWC_OPS +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 16, float, GridSample); +#endif + template <> KernelCreateInfo BuildKernelCreateInfo() { KernelCreateInfo info; @@ -408,6 +412,9 @@ Status RegisterCudaContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, #endif +#ifdef ENABLE_CUDA_NHWC_OPS + BuildKernelCreateInfo, +#endif }; for (auto& function_table_entry : function_table) { diff --git a/onnxruntime/contrib_ops/cuda/grid_sample.cc b/onnxruntime/contrib_ops/cuda/grid_sample.cc index 4c2999c279e0a..2500de39d3536 100644 --- a/onnxruntime/contrib_ops/cuda/grid_sample.cc +++ b/onnxruntime/contrib_ops/cuda/grid_sample.cc @@ -9,22 +9,23 @@ namespace onnxruntime { namespace contrib { namespace cuda { -#define REGISTER_KERNEL_TYPED(T) \ +#define REGISTER_KERNEL_TYPED(T, VERSION, LAYOUT, DOMAIN) \ ONNX_OPERATOR_TYPED_KERNEL_EX( \ GridSample, \ - kMSDomain, \ - 1, \ + DOMAIN, \ + VERSION, \ T, \ kCudaExecutionProvider, \ (*KernelDefBuilder::Create()) \ .TypeConstraint("T1", DataTypeImpl::GetTensorType()) \ .TypeConstraint("T2", DataTypeImpl::GetTensorType()), \ - GridSample); + onnxruntime::contrib::cuda::GridSample); -REGISTER_KERNEL_TYPED(float) +REGISTER_KERNEL_TYPED(float, 1, LAYOUT_NCHW, kMSDomain) +REGISTER_KERNEL_TYPED(float, 16, LAYOUT_NHWC, kMSInternalNHWCDomain) -template -GridSample::GridSample(const OpKernelInfo& info) : CudaKernel(info) { +template +GridSample::GridSample(const OpKernelInfo& info) : CudaKernel(info) { std::string mode_str = info.GetAttrOrDefault("mode", "bilinear"); std::string padding_mode_str = info.GetAttrOrDefault("padding_mode", "zeros"); align_corners_ = static_cast(info.GetAttrOrDefault("align_corners", 0)); @@ -48,8 +49,8 @@ GridSample::GridSample(const OpKernelInfo& info) : CudaKernel(info) { } } -template -Status GridSample::ComputeInternal(OpKernelContext* context) const { +template +Status GridSample::ComputeInternal(OpKernelContext* context) const { const Tensor* X = context->Input(0); const auto& dims_input = X->Shape().GetDims(); const Tensor* Grid = context->Input(1); @@ -61,11 +62,13 @@ Status GridSample::ComputeInternal(OpKernelContext* context) const { ORT_ENFORCE(dims_grid[0] == dims_input[0], "Grid batch size ", dims_grid[0], " does not match input batch size ", dims_input[0]); ORT_ENFORCE(dims_grid[3] == 2, "Last dimension of grid: ", dims_grid[3], ", expect 2"); + using Ch = Channels; + TensorShapeVector dims_output(4); - dims_output[0] = dims_input[0]; - dims_output[1] = dims_input[1]; - dims_output[2] = dims_grid[1]; - dims_output[3] = dims_grid[2]; + dims_output[Ch::N] = dims_input[Ch::N]; + dims_output[Ch::C] = dims_input[Ch::C]; + dims_output[Ch::H] = dims_grid[1 /* Grid::H */]; + dims_output[Ch::W] = dims_grid[2 /* Grid::W */]; Tensor* Y = context->Output(0, dims_output); // Return early if the output tensor is going to be of size 0 if (Y->Shape().Size() == 0) { @@ -74,7 +77,7 @@ Status GridSample::ComputeInternal(OpKernelContext* context) const { typedef typename ToCudaType::MappedType CudaT; CudaT* Y_data = reinterpret_cast(Y->MutableData()); - GridSampleImpl( + GridSampleImpl( Stream(context), reinterpret_cast(X->Data()), reinterpret_cast(Grid->Data()), @@ -89,4 +92,8 @@ Status GridSample::ComputeInternal(OpKernelContext* context) const { } } // namespace cuda } // namespace contrib + +namespace cuda { +REGISTER_KERNEL_TYPED(float, 16, LAYOUT_NCHW, kOnnxDomain) +} // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/grid_sample.h b/onnxruntime/contrib_ops/cuda/grid_sample.h index 08ca58c7cc458..16581bfe77482 100644 --- a/onnxruntime/contrib_ops/cuda/grid_sample.h +++ b/onnxruntime/contrib_ops/cuda/grid_sample.h @@ -12,7 +12,7 @@ namespace cuda { using namespace onnxruntime::cuda; -template +template class GridSample final : public CudaKernel { public: explicit GridSample(const OpKernelInfo& info); diff --git a/onnxruntime/contrib_ops/cuda/grid_sample_impl.cu b/onnxruntime/contrib_ops/cuda/grid_sample_impl.cu index 8a391eca7e86a..b23da635bc83d 100644 --- a/onnxruntime/contrib_ops/cuda/grid_sample_impl.cu +++ b/onnxruntime/contrib_ops/cuda/grid_sample_impl.cu @@ -50,28 +50,34 @@ __device__ T GsReflect(T x, float x_min, float x_max) { return static_cast(fx); } -template +template __device__ T PixelAtGrid(const T* input_data, int64_t bIdx, int64_t cIdx, int64_t y, int64_t x, - int64_t padding_mode, int64_t N, int64_t C, int64_t H, int64_t W, float border[4]) { + int64_t padding_mode, int64_t N, int64_t C, int64_t H, int64_t W, float border[4]) { T pixel = 0.0f; + + auto PixelOffset = [bIdx, cIdx, C, H, W](int64_t x, int64_t y) -> int64_t { + return Layout == LAYOUT_NCHW + ? (bIdx * C * H * W + cIdx * H * W + y * W + x) + : (bIdx * H * W * C + y * W * C + x * C + cIdx); + }; + if (padding_mode == 0) { // zeros if (x >= 0 && x < W && y >= 0 && y < H) { - pixel = input_data[bIdx * C * H * W + cIdx * H * W + y * W + x]; + pixel = input_data[PixelOffset(x, y)]; } - } else if (padding_mode == 1) { //border + } else if (padding_mode == 1) { // border x = max((int64_t)0, min((int64_t)W - 1, (int64_t)x)); y = max((int64_t)0, min((int64_t)H - 1, (int64_t)y)); - pixel = input_data[bIdx * C * H * W + cIdx * H * W + y * W + x]; + pixel = input_data[PixelOffset(x, y)]; } else { // Reflection - x = (int64_t) GsReflect(x, border[0], border[2]); - y = (int64_t) GsReflect(y, border[1], border[3]); - pixel = input_data[bIdx * C * H * W + cIdx * H * W + y * W + x]; + x = (int64_t)GsReflect(x, border[0], border[2]); + y = (int64_t)GsReflect(y, border[1], border[3]); + pixel = input_data[PixelOffset(x, y)]; } return pixel; } -__device__ void GsGetCubicCoeffs(float x, float coeffs[4]) -{ +__device__ void GsGetCubicCoeffs(float x, float coeffs[4]) { float cubic_alpha = -0.75f; x = abs(x); coeffs[0] = (((cubic_alpha * (x + 1) - 5 * cubic_alpha) * (x + 1) + 8 * cubic_alpha) * (x + 1) - 4 * cubic_alpha); @@ -93,7 +99,7 @@ __device__ T GsBicubicInterpolate(T p[4][4], float x, float y) { return pixel; } -template +template __global__ void _GridSampleKernel( const T* input_data, const T* grid_data, @@ -110,16 +116,32 @@ __global__ void _GridSampleKernel( { CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(idx, N * C * H_out * W_out); // extract batch index, channel index, y index, x index for current thread - int BIdx = idx / (C * H_out * W_out ); - int tmpBCnt = BIdx * (C * H_out * W_out); + int BIdx, yIdx, xIdx, cIdx; + if constexpr (Layout == LAYOUT_NCHW) { + BIdx = idx / (C * H_out * W_out); + int tmpBCnt = BIdx * (C * H_out * W_out); + + cIdx = (idx - tmpBCnt) / (H_out * W_out); + int tmpCCnt = tmpBCnt + cIdx * (H_out * W_out); - int cIdx = (idx - tmpBCnt) / (H_out * W_out); - int tmpCCnt = tmpBCnt + cIdx * (H_out * W_out); + yIdx = (idx - tmpCCnt) / W_out; + int tmpHCnt = tmpCCnt + yIdx * W_out; - int yIdx = (idx - tmpCCnt) / W_out; - int tmpHCnt = tmpCCnt + yIdx * W_out; + xIdx = (idx - tmpHCnt); + } else { + static_assert(Layout == LAYOUT_NHWC, "Unsupported layout"); - int xIdx = (idx - tmpHCnt); + BIdx = idx / (H_out * W_out * C); + int tmpBCnt = BIdx * (H_out * W_out * C); + + yIdx = (idx - tmpBCnt) / (W_out * C); + int tmpHCnt = tmpBCnt + yIdx * (W_out * C); + + xIdx = (idx - tmpHCnt) / C; + int tmpWCnt = tmpHCnt + xIdx * C; + + cIdx = (idx - tmpWCnt); + } int grid_idx = BIdx * H_out * W_out + yIdx * W_out + xIdx; T grid_X = grid_data[grid_idx * 2 + 0]; @@ -147,8 +169,9 @@ __global__ void _GridSampleKernel( if (grid_x_imgSpace < x_min || grid_x_imgSpace > x_max || grid_y_imgSpace < y_min || grid_y_imgSpace > y_max) { // out of bound if (padding_mode == 1) { // border - grid_x_imgSpace = max(0.0f, min(grid_x_imgSpace, W_in - 1.0f)); - grid_y_imgSpace = max(0.0f, min(grid_y_imgSpace, H_in - 1.0f)); + // Clamping must not be done here, see #10607 + // grid_x_imgSpace = max(0.0f, min(grid_x_imgSpace, W_in - 1.0f)); + // grid_y_imgSpace = max(0.0f, min(grid_y_imgSpace, H_in - 1.0f)); } else if (padding_mode == 2) { // reflection grid_x_imgSpace = GsReflect(grid_x_imgSpace, x_min, x_max); grid_y_imgSpace = GsReflect(grid_y_imgSpace, y_min, y_max); @@ -175,10 +198,10 @@ __global__ void _GridSampleKernel( w_lb = w_b * w_l; w_rb = w_b * w_r; - T lt_v = PixelAtGrid(input_data, BIdx, cIdx, y1, x1, padding_mode, N, C, H_in, W_in, border); - T rt_v = PixelAtGrid(input_data, BIdx, cIdx, y1, x2, padding_mode, N, C, H_in, W_in, border); - T lb_v = PixelAtGrid(input_data, BIdx, cIdx, y2, x1, padding_mode, N, C, H_in, W_in, border); - T rb_v = PixelAtGrid(input_data, BIdx, cIdx, y2, x2, padding_mode, N, C, H_in, W_in, border); + T lt_v = PixelAtGrid(input_data, BIdx, cIdx, y1, x1, padding_mode, N, C, H_in, W_in, border); + T rt_v = PixelAtGrid(input_data, BIdx, cIdx, y1, x2, padding_mode, N, C, H_in, W_in, border); + T lb_v = PixelAtGrid(input_data, BIdx, cIdx, y2, x1, padding_mode, N, C, H_in, W_in, border); + T rb_v = PixelAtGrid(input_data, BIdx, cIdx, y2, x2, padding_mode, N, C, H_in, W_in, border); T interpoV = w_lt * lt_v + w_rt * rt_v + w_lb * lb_v + w_rb * rb_v; output_data[outIdx] = interpoV; return; @@ -186,7 +209,8 @@ __global__ void _GridSampleKernel( if (mode == 1) { // nearest int x_n = grid_x_imgSpace; int y_n = grid_y_imgSpace; - output_data[outIdx] = PixelAtGrid(input_data, BIdx, cIdx, y_n, x_n, padding_mode, N, C, H_in, W_in, border); + output_data[outIdx] = + PixelAtGrid(input_data, BIdx, cIdx, y_n, x_n, padding_mode, N, C, H_in, W_in, border); return; } if (mode == 2) { // bicubic @@ -195,7 +219,8 @@ __global__ void _GridSampleKernel( T p[4][4] = {}; // [H][W] for (int64_t h = 0; h < 4; h++) { for (int64_t w = 0; w < 4; w++) { - p[h][w] = PixelAtGrid(input_data, BIdx, cIdx, h + y0, w + x0, padding_mode, N, C, H_in, W_in, border); + p[h][w] = + PixelAtGrid(input_data, BIdx, cIdx, h + y0, w + x0, padding_mode, N, C, H_in, W_in, border); } } T dx = grid_x_imgSpace - x0 - 1; @@ -204,7 +229,7 @@ __global__ void _GridSampleKernel( } } -template +template void GridSampleImpl( cudaStream_t stream, const T* input_data, @@ -216,17 +241,23 @@ void GridSampleImpl( const int64_t H_out, const int64_t W_out, T* output_data) { - int blocksPerGrid = (int)(ceil(static_cast(dims[0] * dims[1] * H_out * W_out) / GridDim::maxThreadsPerBlock)); - _GridSampleKernel<<>>( - input_data, grid_data, mode, padding_mode, align_corners, dims[0], dims[1], dims[2], dims[3], H_out, W_out, output_data); + using Ch = Channels; + + int blocksPerGrid = static_cast( + ceil(static_cast(dims[Ch::N] * dims[Ch::C] * H_out * W_out) / GridDim::maxThreadsPerBlock)); + _GridSampleKernel<<>>( + input_data, grid_data, mode, padding_mode, align_corners, + dims[Ch::N], dims[Ch::C], dims[Ch::H], dims[Ch::W], + H_out, W_out, output_data); } -#define SPECIALIZED_IMPL(T) \ - template void GridSampleImpl(cudaStream_t stream, const T* input_data, const T* grid_data, \ - const int64_t mode, const int64_t padding_mode, const int64_t align_corners, \ - const int64_t[4], const int64_t H_out, const int64_t W_out, T* output_data); +#define SPECIALIZED_IMPL(T, IsNHWC) \ + template void GridSampleImpl(cudaStream_t stream, const T* input_data, const T* grid_data, \ + const int64_t mode, const int64_t padding_mode, const int64_t align_corners, \ + const int64_t[4], const int64_t H_out, const int64_t W_out, T* output_data); -SPECIALIZED_IMPL(float) +SPECIALIZED_IMPL(float, false) // NCHW +SPECIALIZED_IMPL(float, true) // NHWC } // namespace cuda } // namespace contrib diff --git a/onnxruntime/contrib_ops/cuda/grid_sample_impl.h b/onnxruntime/contrib_ops/cuda/grid_sample_impl.h index 6df86ce161908..62cd66a48fa84 100644 --- a/onnxruntime/contrib_ops/cuda/grid_sample_impl.h +++ b/onnxruntime/contrib_ops/cuda/grid_sample_impl.h @@ -8,7 +8,7 @@ namespace onnxruntime { namespace contrib { namespace cuda { -template +template void GridSampleImpl( cudaStream_t stream, const T* input_data, diff --git a/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc b/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc index 4505d4afdf1e0..a8717b99a8750 100644 --- a/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc +++ b/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc @@ -31,6 +31,7 @@ CostCheckResult PostLayoutTransformCostCheck(const api::GraphRef& graph, const a } #if defined(USE_CUDA) && ENABLE_CUDA_NHWC_OPS +// TODO(mtavenrath) generate list from registered kernels using nhwc domain const std::unordered_set& GetCUDALayoutSensitiveOps() { static std::unordered_set cuda_nhwc_ops = []() { return std::unordered_set{ @@ -41,6 +42,7 @@ const std::unordered_set& GetCUDALayoutSensitiveOps() { "MaxPool", "GlobalAveragePool", "AveragePool", + "GridSample", }; }(); return cuda_nhwc_ops; diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index be2530aec49fa..00783bcbc2665 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -1256,6 +1256,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, double, LessOrEqual); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, MLFloat16, LessOrEqual); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, 17, ScatterElements); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, float, GridSample); // Opset 17 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, float, LayerNormalization); @@ -2148,6 +2149,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, // Opset 17 BuildKernelCreateInfo, diff --git a/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h b/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h index fa987866c002f..54c024793ff0b 100644 --- a/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h +++ b/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h @@ -168,5 +168,31 @@ struct NumericLimits { } }; +// TODO Where to put this? good places might be +// core/framework/tensor_shape.h +// core/util/matrix_layout.h + +constexpr bool LAYOUT_NCHW = false; +constexpr bool LAYOUT_NHWC = true; + +template +struct Channels; + +template <> +struct Channels { + static constexpr size_t N = 0; + static constexpr size_t H = 1; + static constexpr size_t W = 2; + static constexpr size_t C = 3; +}; + +template <> +struct Channels { + static constexpr size_t N = 0; + static constexpr size_t C = 1; + static constexpr size_t H = 2; + static constexpr size_t W = 3; +}; + } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc b/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc index 0f097622abff0..5c89d6ea7bd75 100644 --- a/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc @@ -6,6 +6,33 @@ namespace onnxruntime { namespace test { + +std::vector> GetExecutionProviders(int opset_version) { + ORT_UNUSED_PARAMETER(opset_version); + + std::vector> execution_providers; + + execution_providers.emplace_back(DefaultCpuExecutionProvider()); +#ifdef USE_CUDA + if (opset_version < 20) { + execution_providers.emplace_back(DefaultCudaExecutionProvider()); +#ifdef ENABLE_CUDA_NHWC_OPS + execution_providers.push_back(DefaultCudaNHWCExecutionProvider()); +#endif + } + +#endif + return execution_providers; +} + +template +void RunTests(T& test, std::vector>&& execution_providers) { + for (size_t idx = 0; idx < execution_providers.size(); ++idx) { + test.ConfigEp(std::move(execution_providers[idx])).RunWithConfig(); + } + execution_providers.clear(); +} + // DO NOT edit following tests. They are generated by: // onnxruntime/test/providers/cpu/tensor/grid_sample_test_gen.py TEST(GridsampleTest, test_grid_sample_16_4D_nearest_zeros_align_corners) { @@ -25,8 +52,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_nearest_zeros_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_nearest_zeros_no_align_corners) { @@ -46,8 +72,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_nearest_zeros_no_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_nearest_border_align_corners) { @@ -67,8 +92,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_nearest_border_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_nearest_border_no_align_corners) { @@ -88,8 +112,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_nearest_border_no_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_nearest_reflection_align_corners) { @@ -109,8 +132,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_nearest_reflection_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_nearest_reflection_no_align_corners) { @@ -130,8 +152,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_nearest_reflection_no_align_corners) test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_zeros_align_corners) { @@ -151,8 +172,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_zeros_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_zeros_no_align_corners) { @@ -172,8 +192,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_zeros_no_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_border_align_corners) { @@ -193,8 +212,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_border_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_border_no_align_corners) { @@ -214,8 +232,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_border_no_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_reflection_align_corners) { @@ -235,8 +252,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_reflection_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_reflection_no_align_corners) { @@ -256,8 +272,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_reflection_no_align_corners test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_zeros_align_corners) { @@ -277,8 +292,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_zeros_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_zeros_no_align_corners) { @@ -298,8 +312,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_zeros_no_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_border_align_corners) { @@ -319,8 +332,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_border_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_border_no_align_corners) { @@ -340,8 +352,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_border_no_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_reflection_align_corners) { @@ -361,8 +372,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_reflection_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_reflection_no_align_corners) { @@ -382,8 +392,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_reflection_no_align_corners) test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(16)); } TEST(GridsampleTest, test_grid_sample_20_4D_nearest_zeros_align_corners) { @@ -403,8 +412,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_nearest_zeros_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_5D_nearest_zeros_align_corners) { @@ -424,8 +432,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_nearest_zeros_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_nearest_zeros_no_align_corners) { @@ -445,8 +452,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_nearest_zeros_no_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_5D_nearest_zeros_no_align_corners) { @@ -466,8 +472,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_nearest_zeros_no_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_nearest_border_align_corners) { @@ -487,8 +492,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_nearest_border_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_5D_nearest_border_align_corners) { @@ -508,8 +512,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_nearest_border_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_nearest_border_no_align_corners) { @@ -529,8 +532,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_nearest_border_no_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_5D_nearest_border_no_align_corners) { @@ -550,8 +552,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_nearest_border_no_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_nearest_reflection_align_corners) { @@ -571,8 +572,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_nearest_reflection_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_5D_nearest_reflection_align_corners) { @@ -592,8 +592,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_nearest_reflection_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_nearest_reflection_no_align_corners) { @@ -613,8 +612,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_nearest_reflection_no_align_corners) test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_5D_nearest_reflection_no_align_corners) { @@ -634,8 +632,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_nearest_reflection_no_align_corners) test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_zeros_align_corners) { @@ -655,8 +652,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_zeros_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_zeros_align_corners) { @@ -676,8 +672,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_zeros_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_zeros_no_align_corners) { @@ -697,8 +692,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_zeros_no_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_zeros_no_align_corners) { @@ -718,8 +712,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_zeros_no_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_border_align_corners) { @@ -739,8 +732,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_border_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_border_align_corners) { @@ -760,8 +752,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_border_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_border_no_align_corners) { @@ -781,8 +772,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_border_no_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_border_no_align_corners) { @@ -802,8 +792,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_border_no_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_reflection_align_corners) { @@ -823,8 +812,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_reflection_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_reflection_align_corners) { @@ -844,8 +832,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_reflection_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_reflection_no_align_corners) { @@ -865,8 +852,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_reflection_no_align_corners test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_reflection_no_align_corners) { @@ -886,8 +872,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_reflection_no_align_corners test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_zeros_align_corners) { @@ -907,8 +892,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_zeros_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_zeros_no_align_corners) { @@ -928,8 +912,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_zeros_no_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_border_align_corners) { @@ -949,8 +932,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_border_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_border_no_align_corners) { @@ -970,8 +952,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_border_no_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_reflection_align_corners) { @@ -991,8 +972,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_reflection_align_corners) { test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_reflection_no_align_corners) { @@ -1012,8 +992,8 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_reflection_no_align_corners) test.AddAttribute("padding_mode", padding_mode); test.AddAttribute("align_corners", align_corners); test.AddOutput("Y", Y_shape, Y_data); - test.ConfigEp(DefaultCpuExecutionProvider()) - .RunWithConfig(); + RunTests(test, GetExecutionProviders(20)); } + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/tensor/grid_sample_test_gen.py b/onnxruntime/test/providers/cpu/tensor/grid_sample_test_gen.py index e4d58e79243ef..c60e55617774f 100644 --- a/onnxruntime/test/providers/cpu/tensor/grid_sample_test_gen.py +++ b/onnxruntime/test/providers/cpu/tensor/grid_sample_test_gen.py @@ -76,6 +76,6 @@ print('test.AddAttribute("padding_mode", padding_mode);') print('test.AddAttribute("align_corners", align_corners);') print('test.AddOutput("Y", Y_shape, Y_data);') - print("test.Run();") + print(f"RunTests(test, GetExecutionProviders({opset_version}));") print("}") print("\n") diff --git a/onnxruntime/test/util/default_providers.cc b/onnxruntime/test/util/default_providers.cc index 40b40136af1af..b404c12db3582 100644 --- a/onnxruntime/test/util/default_providers.cc +++ b/onnxruntime/test/util/default_providers.cc @@ -8,6 +8,9 @@ #ifdef USE_COREML #include "core/providers/coreml/coreml_provider_factory.h" #endif +#if defined(ENABLE_CUDA_NHWC_OPS) +#include +#endif #include "core/session/onnxruntime_cxx_api.h" #include "core/framework/session_options.h" @@ -118,6 +121,19 @@ std::unique_ptr DefaultCudaExecutionProvider() { return nullptr; } +#ifdef ENABLE_CUDA_NHWC_OPS +std::unique_ptr DefaultCudaNHWCExecutionProvider() { +#if defined(USE_CUDA) + OrtCUDAProviderOptionsV2 provider_options{}; + provider_options.do_copy_in_default_stream = true; + provider_options.prefer_nhwc = true; + if (auto factory = CudaProviderFactoryCreator::Create(&provider_options)) + return factory->CreateProvider(); +#endif + return nullptr; +} +#endif + std::unique_ptr CudaExecutionProviderWithOptions(const OrtCUDAProviderOptionsV2* provider_options) { #ifdef USE_CUDA if (auto factory = CudaProviderFactoryCreator::Create(provider_options)) diff --git a/onnxruntime/test/util/include/default_providers.h b/onnxruntime/test/util/include/default_providers.h index 9f78e0a0d4eb2..738fc66d775c6 100644 --- a/onnxruntime/test/util/include/default_providers.h +++ b/onnxruntime/test/util/include/default_providers.h @@ -35,6 +35,9 @@ namespace test { // unique_ptr providers with default values for session registration std::unique_ptr DefaultCpuExecutionProvider(bool enable_arena = true); std::unique_ptr DefaultCudaExecutionProvider(); +#ifdef ENABLE_CUDA_NHWC_OPS +std::unique_ptr DefaultCudaNHWCExecutionProvider(); +#endif std::unique_ptr CudaExecutionProviderWithOptions(const OrtCUDAProviderOptionsV2* provider_options); std::unique_ptr DefaultDnnlExecutionProvider(); std::unique_ptr DnnlExecutionProviderWithOptions(const OrtDnnlProviderOptions* provider_options); From ae3d73c9818c34af42c785ff2bd9558007ba315f Mon Sep 17 00:00:00 2001 From: satyajandhyala Date: Fri, 23 Feb 2024 00:21:15 -0800 Subject: [PATCH 38/68] [JS/WebGPU] Fix Split and Where to handle corner cases. (#19613) ### Description 1. Fix Where operator to handle Boolean input less than 4 bytes. 2. Fix JSEP test harness to use tensor names consistently. ### Motivation and Context --- js/web/lib/wasm/jsep/webgpu/ops/where.ts | 3 ++- js/web/test/data/ops/where.jsonc | 34 ++++++++++++++++++++++++ js/web/test/test-runner.ts | 4 +-- 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/where.ts b/js/web/lib/wasm/jsep/webgpu/ops/where.ts index cfee07a9239d7..a6375847fc42f 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/where.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/where.ts @@ -27,7 +27,7 @@ const createWhereOpProgramShader = const expressionA = `a_data[index_a${x}][component_a${x}]`; const expressionB = `b_data[index_b${x}][component_b${x}]`; // eslint-disable-next-line no-bitwise - const expressionC = `bool(c_data[index_c${x}] & ${0xff000000 >>> ((3 - x) * 8)}u)`; + const expressionC = `bool(c_data[index_c${x}] & (0xffu << (component_c${x} * 8)))`; return ` let output_indices${x} = ${output.offsetToIndices(`global_idx * 4u + ${x}u`)}; let offset_a${x} = ${a.broadcastedIndicesToOffset(`output_indices${x}`, output)}; @@ -38,6 +38,7 @@ const createWhereOpProgramShader = let index_c${x} = offset_c${x} / 4u; let component_a${x} = offset_a${x} % 4u; let component_b${x} = offset_b${x} % 4u; + let component_c${x} = offset_c${x} % 4u; ${resStr}[${x}] = ${typeCast}(${expression(expressionA, expressionB, expressionC)}); `; }; diff --git a/js/web/test/data/ops/where.jsonc b/js/web/test/data/ops/where.jsonc index 047fd6fd7511b..990120dd3708e 100644 --- a/js/web/test/data/ops/where.jsonc +++ b/js/web/test/data/ops/where.jsonc @@ -168,5 +168,39 @@ ] } ] + }, + { + "name": "Where with no attributes", + "operator": "Where", + "attributes": [], + "cases": [ + { + "name": "T[1 1 2 1] T[1 4] T[1 1 2 4] float32 broadcast 1", + "inputs": [ + { + "data": [true, false], + "dims": [1, 1, 2, 1], + "type": "bool" + }, + { + "data": [1, 2, 3, 4], + "dims": [1, 4], + "type": "float32" + }, + { + "data": [5, 6, 7, 8, 9, 10, 11, 12], + "dims": [1, 1, 2, 4], + "type": "float32" + } + ], + "outputs": [ + { + "data": [1, 2, 3, 4, 9, 10, 11, 12], + "dims": [1, 1, 2, 4], + "type": "float32" + } + ] + } + ] } ] diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts index ecc7d4b4a09a5..a4adf5c4ce144 100644 --- a/js/web/test/test-runner.ts +++ b/js/web/test/test-runner.ts @@ -627,8 +627,8 @@ export async function runModelTestSet( try { const feeds: Record = {}; const outputsMetaInfo: Record = {}; - testCase.inputs!.forEach((tensor, i) => feeds[context.session.inputNames[i]] = tensor); - testCase.outputs!.forEach((tensor, i) => outputsMetaInfo[context.session.outputNames[i]] = tensor); + testCase.inputs!.forEach((tensor) => feeds[tensor.name] = tensor); + testCase.outputs!.forEach((tensor) => outputsMetaInfo[tensor.name] = tensor); const [start, end, outputs] = await sessionRun({session: context.session, feeds, outputsMetaInfo, ioBinding: context.ioBinding}); if (context.perfData.count === 0) { From f4306004321efe9a0e65a19a707bf2266ffd7b16 Mon Sep 17 00:00:00 2001 From: cao lei Date: Fri, 23 Feb 2024 06:02:05 -0800 Subject: [PATCH 39/68] Enable streams for DML EP. This change is to revert PR 19481 since the bug 19480 is fixed by PR 19515 (#19609) ### Description Enable streams for DML EP. This change is to revert PR 19481 since the bug 19480 is fixed by PR 19515 ### Motivation and Context Enable streams for DML EP. This change is to revert PR 19481 since the bug 19480 is fixed by PR 19515 --- cmake/adjust_global_compile_flags.cmake | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/cmake/adjust_global_compile_flags.cmake b/cmake/adjust_global_compile_flags.cmake index a56864ebf4644..8161ea574b8cc 100644 --- a/cmake/adjust_global_compile_flags.cmake +++ b/cmake/adjust_global_compile_flags.cmake @@ -92,13 +92,8 @@ if (onnxruntime_MINIMAL_BUILD) endif() endif() -# Enable stream for all the non-minimal build, except for DML. There's currently a bug -# in the allocation planner when reusing buffers and more than one streams are used that -# make it possible (although rarely) to reach a reference count of 0 for a buffer that is -# still being used. Since DML doesn't benefit from multiple streams, disabling it is the -# safest option for now. -# https://github.com/microsoft/onnxruntime/issues/19480 -if (NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_USE_DML) +# Enable stream for all the non-minimal build +if (NOT onnxruntime_MINIMAL_BUILD) add_compile_definitions(ORT_ENABLE_STREAM) endif() From efbe2b84556c195e7d7f3353321eb3f410a1e645 Mon Sep 17 00:00:00 2001 From: Markus Tavenrath Date: Fri, 23 Feb 2024 17:45:17 +0100 Subject: [PATCH 40/68] Fix cuDNN v9 build by replacing removed cuDNN v6 RNN API usage by cuDNN v8 RNN API and reenable RNN tests for CUDA EP (#19419) Replace deprecated cuDNN RNN based API by cuDNN v8 RNN API and re-enable RNN tests for the CUDA EP. ### Motivation and Context The deprecated cuDNN RNN API might vanish soon and in addition for the current CUDA EP RNN implementation all RNN tests are disabled due to failures. With this change the deprecated API has been removed and the new updated implemented doesn't fail the tests anymore. --- .../core/providers/cuda/cudnn_common.h | 4 +- .../core/providers/cuda/rnn/cudnn_rnn_base.cc | 350 +++++++++--------- .../core/providers/cuda/rnn/cudnn_rnn_base.h | 55 +-- onnxruntime/core/providers/cuda/rnn/rnn.cc | 3 +- onnxruntime/core/providers/cuda/rnn/rnn.h | 1 + .../core/providers/cuda/rnn/rnn_impl.cu | 91 +---- .../core/providers/cuda/rnn/rnn_impl.h | 14 +- .../test/providers/cpu/rnn/rnn_op_test.cc | 24 +- 8 files changed, 240 insertions(+), 302 deletions(-) diff --git a/onnxruntime/core/providers/cuda/cudnn_common.h b/onnxruntime/core/providers/cuda/cudnn_common.h index fdd14dedad47e..2cbeb13696270 100644 --- a/onnxruntime/core/providers/cuda/cudnn_common.h +++ b/onnxruntime/core/providers/cuda/cudnn_common.h @@ -24,12 +24,12 @@ class CudnnTensor final { operator cudnnTensorDescriptor_t() const { return tensor_; } + Status CreateTensorIfNeeded(); + template static cudnnDataType_t GetDataType(); private: - Status CreateTensorIfNeeded(); - cudnnTensorDescriptor_t tensor_; }; diff --git a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc index 99c1f48e21c74..b61b104790fe5 100644 --- a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc +++ b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc @@ -9,40 +9,49 @@ namespace onnxruntime { namespace cuda { template -void CudnnRnnBase::SetWeightBias(const cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnn_desc, - const int pseudo_layer, - const cudnnTensorDescriptor_t x_desc, - const cudnnFilterDescriptor_t w_desc, - const cudnnFilterDescriptor_t filter_desc, - const void* reorganized_w_data, - const int lin_layer_id, - const T* pos, - int& offset, - bool is_matrix, - cudaStream_t cuda_stream) const { +Status CudnnRnnBase::SetWeightBias(const cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnn_desc, + const int pseudo_layer, + size_t reorganized_w_data_size, + const void* reorganized_w_data, + const int lin_layer_id, + const T* pos, + int& offset, + bool is_matrix, + cudaStream_t cuda_stream) const { int numDims; - std::vector matDims(3); + std::array matDims; + std::array strideA; cudnnDataType_t dt; - cudnnTensorFormat_t tf; T* mem_offset; - if (is_matrix) { - cudnnGetRNNLinLayerMatrixParams(handle, rnn_desc, pseudo_layer, x_desc, w_desc, reorganized_w_data, lin_layer_id, filter_desc, (void**)&mem_offset); - } else { - cudnnGetRNNLinLayerBiasParams(handle, rnn_desc, pseudo_layer, x_desc, w_desc, reorganized_w_data, lin_layer_id, filter_desc, (void**)&mem_offset); - } + CudnnTensor tensor_desc_matrix, tensor_desc_bias; + ORT_RETURN_IF_ERROR(tensor_desc_bias.CreateTensorIfNeeded()); + ORT_RETURN_IF_ERROR(tensor_desc_matrix.CreateTensorIfNeeded()); - cudnnGetFilterNdDescriptor(filter_desc, 3, &dt, &tf, &numDims, matDims.data()); + T *mem_offset_matrix, *mem_offset_bias; + CUDNN_RETURN_IF_ERROR(cudnnGetRNNWeightParams( + handle, rnn_desc, pseudo_layer, reorganized_w_data_size, reorganized_w_data, + lin_layer_id, tensor_desc_matrix, (void**)&mem_offset_matrix, tensor_desc_bias, (void**)&mem_offset_bias)); + CUDNN_RETURN_IF_ERROR(cudnnGetTensorNdDescriptor( + is_matrix ? tensor_desc_matrix : tensor_desc_bias, 3, &dt, &numDims, matDims.data(), strideA.data())); + + mem_offset = is_matrix ? mem_offset_matrix : mem_offset_bias; int count = matDims[0] * matDims[1] * matDims[2]; + + if (strideA[0] != count) { + return ORT_MAKE_STATUS(ONNXRUNTIME, StatusCode::INVALID_ARGUMENT, "Stride is not packed"); + } CUDA_CALL_THROW(cudaMemcpyAsync(mem_offset, pos + offset, count * sizeof(T), cudaMemcpyDeviceToDevice, cuda_stream)); + offset += count; + + return Status::OK(); } template Status CudnnRnnBase::SetCudnnRnnWeightBias(const cudnnHandle_t cudnn_handle, const cudnnRNNDescriptor_t rnn_desc, - const cudnnTensorDescriptor_t x_desc, - const cudnnFilterDescriptor_t w_desc, + size_t reorganized_w_data_size, void* reorganized_w_data, const T* W_data, const T* R_data, @@ -51,18 +60,22 @@ Status CudnnRnnBase::SetCudnnRnnWeightBias(const cudnnHandle_t cudnn_handle, int w_offset = 0; int r_offset = 0; int bias_offset = 0; - CudnnFilterDescriptor filter_desc; for (int layer = 0; layer < RNN_NUM_LAYERS * num_directions_; ++layer) { for (size_t idx = 0; idx < W_lin_layer_id_.size(); ++idx) { - SetWeightBias(cudnn_handle, rnn_desc, layer, x_desc, w_desc, filter_desc, reorganized_w_data, W_lin_layer_id_[idx], W_data, w_offset, true, cuda_stream); + ORT_RETURN_IF_ERROR(SetWeightBias( + cudnn_handle, rnn_desc, layer, reorganized_w_data_size, reorganized_w_data, + W_lin_layer_id_[idx], W_data, w_offset, true, cuda_stream)); if (B_data != nullptr) { - SetWeightBias(cudnn_handle, rnn_desc, layer, x_desc, w_desc, filter_desc, reorganized_w_data, W_lin_layer_id_[idx], B_data, bias_offset, false, cuda_stream); + ORT_RETURN_IF_ERROR(SetWeightBias(cudnn_handle, rnn_desc, layer, reorganized_w_data_size, reorganized_w_data, + W_lin_layer_id_[idx], B_data, bias_offset, false, cuda_stream)); } } for (size_t idx = 0; idx < R_lin_layer_id_.size(); ++idx) { - SetWeightBias(cudnn_handle, rnn_desc, layer, x_desc, w_desc, filter_desc, reorganized_w_data, R_lin_layer_id_[idx], R_data, r_offset, true, cuda_stream); + ORT_RETURN_IF_ERROR(SetWeightBias(cudnn_handle, rnn_desc, layer, reorganized_w_data_size, reorganized_w_data, + R_lin_layer_id_[idx], R_data, r_offset, true, cuda_stream)); if (B_data != nullptr) { - SetWeightBias(cudnn_handle, rnn_desc, layer, x_desc, w_desc, filter_desc, reorganized_w_data, R_lin_layer_id_[idx], B_data, bias_offset, false, cuda_stream); + ORT_RETURN_IF_ERROR(SetWeightBias(cudnn_handle, rnn_desc, layer, reorganized_w_data_size, reorganized_w_data, + R_lin_layer_id_[idx], B_data, bias_offset, false, cuda_stream)); } } } @@ -72,6 +85,7 @@ Status CudnnRnnBase::SetCudnnRnnWeightBias(const cudnnHandle_t cudnn_handle, template Status CudnnRnnBase::ReorganizeWeights(const Tensor* W, const Tensor* R, const Tensor* B, + size_t& reorganized_w_data_size_in_bytes, IAllocatorUniquePtr& reorganized_w_data, CudnnFilterDescriptor& target_w_desc, CudnnRNN& rnn_desc, onnxruntime::Stream* ort_stream) const { @@ -91,19 +105,16 @@ Status CudnnRnnBase::ReorganizeWeights(const Tensor* W, const Tensor* R, cons TensorShapeVector dims_w({w_size, 1, 1}); ORT_RETURN_IF_ERROR(target_w_desc.Set(dims_w, CudnnTensor::GetDataType())); - TensorShapeVector fake_dims_x({1, input_size, 1}); - CudnnTensor fake_x_desc; - ORT_RETURN_IF_ERROR(fake_x_desc.Set(fake_dims_x, CudnnTensor::GetDataType())); - // Prepare the weight data - reorganized_w_data = GetScratchBuffer(w_size * sizeof(T), ort_stream); + reorganized_w_data_size_in_bytes = w_size * sizeof(T); + reorganized_w_data = GetScratchBuffer(reorganized_w_data_size_in_bytes, ort_stream); // In many cases, this allocation is bigger than needed, leaving part of - // the buffer unintialized. non-zero garbage data leads to wrong result + // the buffer uninitialized. non-zero garbage data leads to wrong result // in call to cudnnRNNForwardInference() // TODO! refine allocation size for each case. cudaStream_t cuda_stream = ort_stream ? static_cast(ort_stream->GetHandle()) : nullptr; - cudaMemsetAsync(reorganized_w_data.get(), 0, w_size * sizeof(T), cuda_stream); + CUDA_RETURN_IF_ERROR(cudaMemsetAsync(reorganized_w_data.get(), 0, reorganized_w_data_size_in_bytes, cuda_stream)); const T* W_data = W->Data(); const T* R_data = R->Data(); @@ -111,8 +122,9 @@ Status CudnnRnnBase::ReorganizeWeights(const Tensor* W, const Tensor* R, cons auto* ort_cuda_stream = dynamic_cast(ort_stream); cudnnHandle_t cudnn_handle = ort_cuda_stream ? ort_cuda_stream->cudnn_handle_ : DefaultCudnnHandle(); - ORT_RETURN_IF_ERROR(SetCudnnRnnWeightBias(cudnn_handle, rnn_desc, fake_x_desc, target_w_desc, - reorganized_w_data.get(), W_data, R_data, B_data, cuda_stream)); + ORT_RETURN_IF_ERROR(SetCudnnRnnWeightBias(cudnn_handle, rnn_desc, + reorganized_w_data_size_in_bytes, reorganized_w_data.get(), + W_data, R_data, B_data, cuda_stream)); return Status::OK(); } @@ -128,22 +140,31 @@ Status CudnnRnnBase::CacheCudnnRnnWeights(const OpKernelInfo& info) { bool get_R = info.TryGetConstantInput(RNN_Input_Index::R, &R); bool get_B = info.TryGetConstantInput(RNN_Input_Index::B, &B); + bool has_bias = B != nullptr; + if (get_W && get_R) { CudnnRNN tmp_rnn_desc; - ORT_RETURN_IF_ERROR(tmp_rnn_desc.Set(DefaultCudnnHandle(), + auto proj_size = hidden_size_; + ORT_RETURN_IF_ERROR(tmp_rnn_desc.Set(W->Shape()[2], // input_size hidden_size_, + proj_size, RNN_NUM_LAYERS, cudnn_dropout_desc_, cudnn_direction_mode_, rnn_mode_, - CudnnTensor::GetDataType(), - GetDeviceProp())); + has_bias, + CudnnTensor::GetDataType())); if (get_B) { - ORT_RETURN_IF_ERROR(ReorganizeWeights(W, R, B, w_data_cache_, w_desc_cache_, tmp_rnn_desc, nullptr)); + ORT_RETURN_IF_ERROR(ReorganizeWeights(W, R, B, + w_data_cache_size_in_bytes_, w_data_cache_, w_desc_cache_, + tmp_rnn_desc, nullptr)); } else { - ORT_RETURN_IF_ERROR(ReorganizeWeights(W, R, nullptr, w_data_cache_, w_desc_cache_, tmp_rnn_desc, nullptr)); + ORT_RETURN_IF_ERROR(ReorganizeWeights(W, R, nullptr, + w_data_cache_size_in_bytes_, w_data_cache_, w_desc_cache_, + tmp_rnn_desc, nullptr)); } cudaStreamSynchronize(nullptr); + weight_cached_ = true; } @@ -158,17 +179,72 @@ Status CudnnRnnBase::ComputeInternal(OpKernelContext* ctx) const { ORT_ENFORCE(nullptr != X); // optional inputs - const Tensor* sequence_lens = ctx->Input(RNN_Input_Index::sequence_lens); // [batch_size] - const Tensor* initial_h = ctx->Input(RNN_Input_Index::initial_h); // initial hidden. [num_directions_, batch_size, hidden_size_] + // [batch_size] + const Tensor* sequence_lens = ctx->Input(RNN_Input_Index::sequence_lens); + // initial hidden. [num_directions_, batch_size, hidden_size_] + const Tensor* initial_h = ctx->Input(RNN_Input_Index::initial_h); const Tensor* initial_c(nullptr); if (rnn_mode_ == CUDNN_LSTM) { - initial_c = ctx->Input(RNN_Input_Index::initial_c); // initial cell. [num_directions_, batch_size, hidden_size_] + // initial cell. [num_directions_, batch_size, hidden_size_] + initial_c = ctx->Input(RNN_Input_Index::initial_c); } + size_t proj_size = hidden_size_; int64_t seq_length = X->Shape()[0]; int64_t batch_size = X->Shape()[1]; int64_t input_size = X->Shape()[2]; + // we thread a single input as sequence_lens of length 1, require to expand to [batch_size]? + std::vector sequence_lengths_temp; + if (!sequence_lens) { + sequence_lengths_temp.resize(batch_size, gsl::narrow_cast(seq_length)); + } + + const int32_t* sequence_lens_data = (sequence_lens == nullptr) + ? sequence_lengths_temp.data() + : sequence_lens->Data(); + + // cuDNN doesn't support 0 sequence inside the batch, find the 0 sequence and set it to 1 + // there's a ZeroMask kernel to reset the result to 0 for the 0 sequence + int64_t zero_seq_count = 0; + std::vector zero_seq_index_cache(batch_size, 0); + + CudaAsyncBuffer sequence_lens_buffer(this, batch_size); + int32_t* seq_len_array = sequence_lens_buffer.CpuPtr(); + + // 0-len sequences are not supported by cuDNN. + // Replace them by sequences of len 1 and mask them out with SetZeroSequences + for (int i = 0; i < batch_size; ++i) { + if (0 == sequence_lens_data[i]) { + seq_len_array[i] = 1; + zero_seq_index_cache[zero_seq_count] = i; + ++zero_seq_count; + } else { + seq_len_array[i] = sequence_lens_data[i]; + } + } + + // Calculate the zero position cache for reverse direction if it's bidirectional + // The cache is for Y_h or Y_c, and the 1st sequence for Y, no need to do it for other sequence in Y since + // we hacked the 0 sequence to 1 + if (zero_seq_count && num_directions_ > 1) { + zero_seq_index_cache.resize(zero_seq_count * num_directions_); + for (int64_t i = 0; i < zero_seq_count; ++i) { + zero_seq_index_cache[static_cast(zero_seq_count) + i] = + static_cast(batch_size + zero_seq_index_cache[i]); + } + zero_seq_count *= num_directions_; + } + + // Prior to cuDNN 8.9.1 the sequence lens buffer must be passed to cudnnRNNForward and thus is must + // be copied to the GPU always. + ORT_RETURN_IF_ERROR(sequence_lens_buffer.CopyToGpu(ctx->GetComputeStream())); + // Starting with cuDNN 8.9.1 the sequence lens buffer is ignored by cudnnRNNForward and thus it must + // be copied to the GPU only for the ReverseBySequence kernels. + // if (reverse_) { + // ORT_RETURN_IF_ERROR(sequence_lens_buffer.CopyToGpu(ctx->GetComputeStream())); + // } + // optional outputs TensorShapeVector dims_Y({seq_length, num_directions_, batch_size, hidden_size_}); TensorShapeVector dims_hxy({RNN_NUM_LAYERS * num_directions_, batch_size, hidden_size_}); @@ -177,25 +253,6 @@ Status CudnnRnnBase::ComputeInternal(OpKernelContext* ctx) const { Tensor* Y_h = ctx->Output(Output_Index::Y_h, dims_hxy); Tensor* Y_c = ctx->Output(Output_Index::Y_c, dims_yc); - std::vector dims_x({batch_size, input_size, 1}); - std::vector dims_y({batch_size, hidden_size_ * num_directions_, 1}); - - CudnnTensor x_desc_temp; - ORT_RETURN_IF_ERROR(x_desc_temp.Set(dims_x, CudnnTensor::GetDataType())); - CudnnTensor y_desc_temp; - ORT_RETURN_IF_ERROR(y_desc_temp.Set(dims_y, CudnnTensor::GetDataType())); - std::vector x_desc(seq_length, x_desc_temp); - std::vector y_desc(seq_length, y_desc_temp); - - CudnnTensor hx_desc; - CudnnTensor cx_desc; - CudnnTensor y_h_desc; - CudnnTensor y_c_desc; - ORT_RETURN_IF_ERROR(hx_desc.Set(dims_hxy, CudnnTensor::GetDataType())); - ORT_RETURN_IF_ERROR(cx_desc.Set(dims_hxy, CudnnTensor::GetDataType())); - ORT_RETURN_IF_ERROR(y_h_desc.Set(dims_hxy, CudnnTensor::GetDataType())); - ORT_RETURN_IF_ERROR(y_c_desc.Set(dims_hxy, CudnnTensor::GetDataType())); - IAllocatorUniquePtr x_reversed_data; const T* x_data = X->Data(); if (reverse_) { @@ -203,6 +260,7 @@ Status CudnnRnnBase::ComputeInternal(OpKernelContext* ctx) const { x_reversed_data = GetScratchBuffer(seq_length * batch_size * input_size, ctx->GetComputeStream()); ReverseBySequence(Stream(ctx), gsl::narrow_cast(seq_length), + sequence_lens_buffer.GpuPtr(), gsl::narrow_cast(batch_size), gsl::narrow_cast(input_size), reinterpret_cast(x_data), @@ -226,115 +284,82 @@ Status CudnnRnnBase::ComputeInternal(OpKernelContext* ctx) const { y_data = y_alloc_data.get(); } - const int32_t* sequence_lens_data = (sequence_lens == nullptr) ? nullptr : sequence_lens->Data(); + const Tensor* B = ctx->Input(RNN_Input_Index::B); + bool has_bias = B != nullptr; CudnnRNN rnn_desc; - ORT_RETURN_IF_ERROR(rnn_desc.Set(GetCudnnHandle(ctx), + ORT_RETURN_IF_ERROR(rnn_desc.Set(input_size, hidden_size_, + proj_size, RNN_NUM_LAYERS, cudnn_dropout_desc_, cudnn_direction_mode_, rnn_mode_, - CudnnTensor::GetDataType(), - GetDeviceProp())); + has_bias, + CudnnTensor::GetDataType())); // Prepare the weight data + size_t w_data_size_in_bytes = 0; IAllocatorUniquePtr w_data; CudnnFilterDescriptor w_desc; if (!weight_cached_) { const Tensor& W = *ctx->Input(RNN_Input_Index::W); const Tensor& R = *ctx->Input(RNN_Input_Index::R); const Tensor* B = ctx->Input(RNN_Input_Index::B); - ORT_RETURN_IF_ERROR(ReorganizeWeights(&W, &R, B, w_data, w_desc, rnn_desc, ctx->GetComputeStream())); + ORT_RETURN_IF_ERROR(ReorganizeWeights(&W, &R, B, w_data_size_in_bytes, w_data, w_desc, + rnn_desc, ctx->GetComputeStream())); } - // CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED works with CUDNN_RNN_PADDED_IO_ENABLED, so that it will auto fill 0 for the shorter sequences - CUDNN_RETURN_IF_ERROR(cudnnSetRNNPaddingMode(rnn_desc, CUDNN_RNN_PADDED_IO_ENABLED)); + CudnnDataTensor x_desc1; + ORT_RETURN_IF_ERROR(x_desc1.Set(CudnnTensor::GetDataType(), seq_length, batch_size, + input_size, seq_len_array)); + CudnnDataTensor y_desc1; + ORT_RETURN_IF_ERROR(y_desc1.Set(CudnnTensor::GetDataType(), seq_length, batch_size, + ((rnn_mode_ == CUDNN_LSTM) ? proj_size : hidden_size_) * num_directions_, + seq_len_array)); - size_t workspace_bytes; - CUDNN_RETURN_IF_ERROR(cudnnGetRNNWorkspaceSize(GetCudnnHandle(ctx), rnn_desc, gsl::narrow_cast(seq_length), x_desc.data(), &workspace_bytes)); - auto workspace_cuda = GetScratchBuffer(workspace_bytes, ctx->GetComputeStream()); - int64_t zero_seq_count = 0; - std::vector zero_seq_index_cache(batch_size, 0); - int64_t zero_seq_index_cache_size = 0; - - if (CUDNN_RNN_RELU == rnn_mode_ || CUDNN_RNN_TANH == rnn_mode_ || nullptr == sequence_lens_data) { - CUDNN_RETURN_IF_ERROR(cudnnRNNForwardInference(GetCudnnHandle(ctx), - rnn_desc, - gsl::narrow_cast(seq_length), - x_desc.data(), - x_data_input, - hx_desc, - hx_data, - cx_desc, - cx_data, - weight_cached_ ? w_desc_cache_ : w_desc, - weight_cached_ ? w_data_cache_.get() : w_data.get(), - y_desc.data(), - y_data, - y_h_desc, - y_h_data, - y_c_desc, - y_c_data, - workspace_cuda.get(), - workspace_bytes)); - } else { - // cudnn doesn't support 0 sequence inside the batch, find the 0 sequence and set it to 1 - // there's a ZeroMask kernel to reset the result to 0 for the 0 sequence - std::vector seq_len_array(sequence_lens_data, sequence_lens_data + batch_size); - for (int i = 0; i < batch_size; ++i) { - if (0 == seq_len_array[i]) { - seq_len_array[i] = 1; - zero_seq_index_cache[zero_seq_count] = i; - ++zero_seq_count; - } - } + CudnnTensor cx_desc; + ORT_RETURN_IF_ERROR(cx_desc.Set(dims_hxy, CudnnTensor::GetDataType())); - // Calculate the zero position cache for reverse direction if it's bidirectional - // The cache is for Y_h or Y_c, and the 1st sequence for Y, no need to do it for other sequence in Y since - // we hacked the 0 sequence to 1 - if (zero_seq_count && num_directions_ > 1) { - zero_seq_index_cache_size = zero_seq_count * num_directions_; - zero_seq_index_cache.resize(zero_seq_index_cache_size); - for (int64_t i = 0; i < zero_seq_count; ++i) { - zero_seq_index_cache[static_cast(zero_seq_count) + i] = static_cast(batch_size + zero_seq_index_cache[i]); - } - } + CudnnTensor hx_desc; + ORT_RETURN_IF_ERROR(hx_desc.Set(dims_hxy, CudnnTensor::GetDataType())); + + // reserveSpaceSize is not required cudnnRNNForward, but returned by cudnnGetRNNTempSpaceSizes + size_t workspace_bytes, reservespace_bytes; - CudnnDataTensor x_desc1; - ORT_RETURN_IF_ERROR(x_desc1.Set(CudnnTensor::GetDataType(), seq_length, batch_size, input_size, seq_len_array.data())); - CudnnDataTensor y_desc1; - ORT_RETURN_IF_ERROR(y_desc1.Set(CudnnTensor::GetDataType(), seq_length, batch_size, hidden_size_ * num_directions_, seq_len_array.data())); - - CUDNN_RETURN_IF_ERROR(cudnnRNNForwardInferenceEx(GetCudnnHandle(ctx), - rnn_desc, - x_desc1, - x_data_input, - hx_desc, - hx_data, - cx_desc, - cx_data, - weight_cached_ ? w_desc_cache_ : w_desc, - weight_cached_ ? w_data_cache_.get() : w_data.get(), - y_desc1, - y_data, - y_h_desc, - y_h_data, - y_c_desc, - y_c_data, - nullptr, nullptr, nullptr, nullptr, - nullptr, nullptr, nullptr, nullptr, - workspace_cuda.get(), - workspace_bytes)); - - // Early terminate for this case since Y data is not required, and Y_h is obtained correctly, no need the following code to retrive Y_h from Y data. - if (nullptr == Y) { + CUDNN_RETURN_IF_ERROR(cudnnGetRNNTempSpaceSizes(GetCudnnHandle(ctx), rnn_desc, CUDNN_FWD_MODE_INFERENCE, + x_desc1, &workspace_bytes, &reservespace_bytes)); + auto workspace_cuda = GetScratchBuffer(workspace_bytes, ctx->GetComputeStream()); + auto reservespace_cuda = GetScratchBuffer(reservespace_bytes, ctx->GetComputeStream()); + + CUDNN_RETURN_IF_ERROR(cudnnRNNForward(GetCudnnHandle(ctx), + rnn_desc, + CUDNN_FWD_MODE_INFERENCE, + sequence_lens_buffer.GpuPtr(), // should be zero starting with cudnn 8.9.1 + x_desc1, + x_data_input, + y_desc1, + y_data, // output + hx_desc, + hx_data, // input + y_h_data, // output + cx_desc, cx_data, y_c_data, + weight_cached_ ? w_data_cache_size_in_bytes_ : w_data_size_in_bytes, + weight_cached_ ? w_data_cache_.get() : w_data.get(), + workspace_bytes, + workspace_cuda.get(), + reservespace_bytes, + reservespace_cuda.get())); + + // Early terminate for this case since Y data is not required, and Y_h is obtained correctly, + // no need the following code to retrieve Y_h from Y data. + if (nullptr == Y) { + // Mask on output for 0 sequence batches + if (zero_seq_count > 0) { // Mask on output for 0 sequence batches - if (zero_seq_count > 0) { - SetZeroSequences(zero_seq_index_cache_size, zero_seq_index_cache, y_data, y_h_data, y_c_data, ctx->GetComputeStream()); - } - return Status::OK(); + SetZeroSequences(zero_seq_count, zero_seq_index_cache, y_data, y_h_data, y_c_data, ctx->GetComputeStream()); } + return Status::OK(); } IAllocatorUniquePtr y_reorganized_data; @@ -345,6 +370,7 @@ Status CudnnRnnBase::ComputeInternal(OpKernelContext* ctx) const { // reverse output data ReverseBySequence(Stream(ctx), gsl::narrow_cast(seq_length), + sequence_lens_buffer.GpuPtr(), gsl::narrow_cast(batch_size), gsl::narrow_cast(hidden_size_), reinterpret_cast(y_data), @@ -361,8 +387,9 @@ Status CudnnRnnBase::ComputeInternal(OpKernelContext* ctx) const { } if (Y != nullptr) { - // User specified this optional output, so need to copy the reversed data to orignial place - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(y_data, y_reorganized_data.get(), output_size * sizeof(T), cudaMemcpyDeviceToDevice, Stream(ctx))); + // User specified this optional output, so need to copy the reversed data to original place + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(y_data, y_reorganized_data.get(), output_size * sizeof(T), + cudaMemcpyDeviceToDevice, Stream(ctx))); } else { y_data = y_reorganized_data.get(); } @@ -370,23 +397,9 @@ Status CudnnRnnBase::ComputeInternal(OpKernelContext* ctx) const { // Mask on output for 0 sequence batches if (zero_seq_count > 0) { - SetZeroSequences(zero_seq_index_cache_size, zero_seq_index_cache, y_data, y_h_data, y_c_data, ctx->GetComputeStream()); + SetZeroSequences(zero_seq_count, zero_seq_index_cache, y_data, y_h_data, y_c_data, ctx->GetComputeStream()); } - if ((CUDNN_RNN_RELU == rnn_mode_ || CUDNN_RNN_TANH == rnn_mode_) && sequence_lens_data != nullptr && y_h_data != nullptr && y_data != nullptr) { - CudaAsyncBuffer sequence_lens_buffer(this, batch_size); - memcpy(sequence_lens_buffer.CpuPtr(), sequence_lens_data, batch_size * sizeof(int32_t)); - ORT_RETURN_IF_ERROR(sequence_lens_buffer.CopyToGpu(ctx->GetComputeStream())); - RnnMaskImpl(Stream(ctx), - gsl::narrow_cast(num_directions_), - gsl::narrow_cast(seq_length), - gsl::narrow_cast(batch_size), - gsl::narrow_cast(hidden_size_), - sequence_lens_buffer.GpuPtr(), - reinterpret_cast(y_data), - reinterpret_cast(y_h_data), - output_size); - } return Status::OK(); } @@ -399,7 +412,8 @@ void CudnnRnnBase::SetZeroSequences(const int64_t zero_seq_index_cache_size, onnxruntime::Stream* ort_stream) const { typedef typename ToCudaType::MappedType CudaT; CudaAsyncBuffer zero_seq_index_cache_async_buffer(this, zero_seq_index_cache_size); - memcpy(zero_seq_index_cache_async_buffer.CpuPtr(), zero_seq_index_cache.data(), zero_seq_index_cache_size * sizeof(int32_t)); + memcpy(zero_seq_index_cache_async_buffer.CpuPtr(), zero_seq_index_cache.data(), + zero_seq_index_cache_size * sizeof(int32_t)); ORT_THROW_IF_ERROR(zero_seq_index_cache_async_buffer.CopyToGpu(ort_stream)); cudaStream_t cuda_stream = ort_stream ? static_cast(ort_stream->GetHandle()) : nullptr; MaskZeroSequences(cuda_stream, diff --git a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.h b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.h index 1c9483b2afd38..0fa01d3486e99 100644 --- a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.h +++ b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.h @@ -38,26 +38,28 @@ class CudnnRNN { } } - Status Set(const cudnnHandle_t& cudnnHandle, int64_t hidden_size, int num_layers, + Status Set(int64_t input_size, int64_t hidden_size, int64_t proj_size, int num_layers, cudnnDropoutDescriptor_t cudnn_dropout_desc, cudnnDirectionMode_t cudnn_direction_model, - cudnnRNNMode_t rnn_mode, cudnnDataType_t dataType, const cudaDeviceProp& prop) { + cudnnRNNMode_t rnn_mode, bool has_bias, cudnnDataType_t dataType) { if (!cudnn_rnn_desc_) CUDNN_RETURN_IF_ERROR(cudnnCreateRNNDescriptor(&cudnn_rnn_desc_)); - CUDNN_RETURN_IF_ERROR(cudnnSetRNNDescriptor_v6(cudnnHandle, - cudnn_rnn_desc_, + CUDNN_RETURN_IF_ERROR(cudnnSetRNNDescriptor_v8(cudnn_rnn_desc_, + CUDNN_RNN_ALGO_STANDARD, // CUDNN_RNN_ALGO_PERSIST_STATIC, CUDNN_RNN_ALGO_PERSIST_DYNAMIC + rnn_mode, + has_bias ? CUDNN_RNN_DOUBLE_BIAS : CUDNN_RNN_NO_BIAS, + cudnn_direction_model, + CUDNN_LINEAR_INPUT, + dataType, + dataType, + dataType == CUDNN_DATA_HALF ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH, + gsl::narrow_cast(input_size), gsl::narrow_cast(hidden_size), + gsl::narrow_cast(proj_size), // projected size num_layers, cudnn_dropout_desc, - CUDNN_LINEAR_INPUT, // We can also skip the input matrix transformation - cudnn_direction_model, - rnn_mode, - CUDNN_RNN_ALGO_STANDARD, // CUDNN_RNN_ALGO_PERSIST_STATIC, CUDNN_RNN_ALGO_PERSIST_DYNAMIC - dataType)); - - if (prop.major >= 7 && dataType == CUDNN_DATA_HALF) { - cudnnSetRNNMatrixMathType(cudnn_rnn_desc_, CUDNN_TENSOR_OP_MATH); - } + // CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED works with CUDNN_RNN_PADDED_IO_ENABLED, so that it will auto fill 0 for the shorter sequences + CUDNN_RNN_PADDED_IO_ENABLED)); return Status::OK(); } @@ -119,8 +121,7 @@ class CudnnRnnBase : public CudaKernel { private: Status SetCudnnRnnWeightBias(const cudnnHandle_t cudnn_handle, const cudnnRNNDescriptor_t rnn_desc, - const cudnnTensorDescriptor_t x_desc, - const cudnnFilterDescriptor_t w_desc, + size_t w_data_size, void* w_data, const T* W_data, const T* R_data, @@ -128,23 +129,22 @@ class CudnnRnnBase : public CudaKernel { cudaStream_t cuda_stream) const; Status ReorganizeWeights(const Tensor* W, const Tensor* R, const Tensor* B, + size_t& target_w_data_size_in_bytes, IAllocatorUniquePtr& target_w_data, CudnnFilterDescriptor& target_w_desc, CudnnRNN& rnn_desc, onnxruntime::Stream* ort_stream) const; - void SetWeightBias(const cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnn_desc, - const int pseudo_layer, - const cudnnTensorDescriptor_t x_desc, - const cudnnFilterDescriptor_t w_desc, - const cudnnFilterDescriptor_t filter_desc, - const void* w_data, - const int lin_layer_id, - const T* pos, - int& offset, - bool is_matrix, - cudaStream_t cuda_stream) const; + Status SetWeightBias(const cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnn_desc, + const int pseudo_layer, + size_t w_data_size, + const void* w_data, + const int lin_layer_id, + const T* pos, + int& offset, + bool is_matrix, + cudaStream_t cuda_stream) const; void SetZeroSequences(const int64_t zero_seq_index_cache_size, const std::vector zero_seq_index_cache, @@ -167,6 +167,7 @@ class CudnnRnnBase : public CudaKernel { cudnnRNNMode_t rnn_mode_; // w_desc_cache_ & w_data_cache_ are changed in Constructor if we can get the weights as constant input CudnnFilterDescriptor w_desc_cache_; + size_t w_data_cache_size_in_bytes_; IAllocatorUniquePtr w_data_cache_; bool weight_cached_; int64_t layout_; diff --git a/onnxruntime/core/providers/cuda/rnn/rnn.cc b/onnxruntime/core/providers/cuda/rnn/rnn.cc index 4bd22340ef2bb..ed8be63679707 100644 --- a/onnxruntime/core/providers/cuda/rnn/rnn.cc +++ b/onnxruntime/core/providers/cuda/rnn/rnn.cc @@ -1,8 +1,9 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/shared_library/provider_api.h" #include "rnn.h" + +#include "core/providers/shared_library/provider_api.h" #include "rnn_impl.h" #include "core/providers/cuda/cudnn_common.h" diff --git a/onnxruntime/core/providers/cuda/rnn/rnn.h b/onnxruntime/core/providers/cuda/rnn/rnn.h index e4e50046b3725..6221afb003b22 100644 --- a/onnxruntime/core/providers/cuda/rnn/rnn.h +++ b/onnxruntime/core/providers/cuda/rnn/rnn.h @@ -4,6 +4,7 @@ #pragma once #include "cudnn_rnn_base.h" + #include "core/providers/cuda/cuda_common.h" #include diff --git a/onnxruntime/core/providers/cuda/rnn/rnn_impl.cu b/onnxruntime/core/providers/cuda/rnn/rnn_impl.cu index d485855ddb417..94c8036be6cdf 100644 --- a/onnxruntime/core/providers/cuda/rnn/rnn_impl.cu +++ b/onnxruntime/core/providers/cuda/rnn/rnn_impl.cu @@ -8,22 +8,32 @@ namespace onnxruntime { namespace cuda { template -__global__ void _ReverseBySequenceKernel(const int32_t seq_length, +__global__ void _ReverseBySequenceKernel(const int32_t max_seq_length, + const int32_t* seq_lengths, const int32_t block_size, const fast_divmod div_batch_block, + const fast_divmod div_input_or_hidden_size, const T* data, T* reversed_data, const CUDA_LONG N) { CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); int seq_id, offset; div_batch_block.divmod(id, seq_id, offset); - int org_id = (seq_length - seq_id - 1) * block_size + offset; - reversed_data[id] = data[org_id]; + int batch, batch_offset; + div_input_or_hidden_size.divmod(offset, batch, batch_offset); + int seq_id_org = seq_lengths[batch] - seq_id - 1; + if (seq_id_org >= 0) { + int org_id = seq_id_org * block_size + offset; + reversed_data[id] = data[org_id]; + } else { + reversed_data[id] = T{}; + } } template void ReverseBySequence(cudaStream_t stream, - const int32_t seq_length, + const int32_t max_seq_length, + const int32_t *seq_lengths, const int32_t batch_size, const int32_t input_or_hidden_size, const T* data, @@ -32,9 +42,10 @@ void ReverseBySequence(cudaStream_t stream, // kerneral int32_t block_size = batch_size * input_or_hidden_size; fast_divmod div_batch_block(block_size); + fast_divmod div_input_or_hidden_size(input_or_hidden_size); int blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); _ReverseBySequenceKernel<<>>( - seq_length, block_size, div_batch_block, data, reversed_data, (CUDA_LONG)N); + max_seq_length, seq_lengths, block_size, div_batch_block, div_input_or_hidden_size, data, reversed_data, (CUDA_LONG)N); } template @@ -82,60 +93,6 @@ void ReorderBidirectionalDataInSequence(cudaStream_t stream, data, reordered_data, (CUDA_LONG)N); } -template -__global__ void _RnnMaskKernel(const int32_t seq_length, - const int32_t batch_size, - const int32_t hidden_size, - const int32_t* sequence_lens, - const fast_divmod div_seq_block, - const fast_divmod div_dir_block, - const fast_divmod div_batch_block, - T* y_output_data, - T* y_h_output_data, - const CUDA_LONG N) { - CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); - - int seq_id, direction_id, batch_id, offset; - div_seq_block.divmod(id, seq_id, offset); - div_dir_block.divmod(offset, direction_id, offset); - div_batch_block.divmod(offset, batch_id, offset); - int32_t batch_seq_length = sequence_lens[batch_id]; - - if (batch_id >= batch_size || batch_seq_length == seq_length) { - return; - } - - if (seq_id >= batch_seq_length) { - y_output_data[id] = 0; - return; - } - - if ((y_h_output_data != nullptr) && - ((direction_id == 0 && (seq_id + 1) == batch_seq_length) || (direction_id == 1 && seq_id == 0))) { - int hy_idx = direction_id * batch_size * hidden_size + batch_id * hidden_size + offset; - y_h_output_data[hy_idx] = y_output_data[id]; - } -} - -template -void RnnMaskImpl(cudaStream_t stream, - const int32_t num_directions, - const int32_t seq_length, - const int32_t batch_size, - const int32_t hidden_size, - const int32_t* sequence_lens, - T* y_output_data, - T* y_h_output_data, - const size_t N) { - fast_divmod div_seq_block(batch_size * hidden_size * num_directions); - fast_divmod div_dir_block(batch_size * hidden_size); - fast_divmod div_batch_block(hidden_size); - int blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); - _RnnMaskKernel<<>>( - seq_length, batch_size, hidden_size, sequence_lens, div_seq_block, - div_dir_block, div_batch_block, y_output_data, y_h_output_data, (CUDA_LONG)N); -} - template __global__ void _MaskZeroSequences(const int32_t hidden_size, T* y_output_data, @@ -180,17 +137,9 @@ void MaskZeroSequences(cudaStream_t stream, } #define SPECIALIZED_RNN_IMPL(T) \ - template void RnnMaskImpl(cudaStream_t stream, \ - const int32_t num_directions, \ - const int32_t seq_length, \ - const int32_t batch_size, \ - const int32_t hidden_size, \ - const int32_t* sequence_lens, \ - T* y_output_data, \ - T* y_h_output_data, \ - const size_t N); \ - template void ReverseBySequence(cudaStream_t stream, \ - const int32_t seq_length, \ + template void ReverseBySequence(cudaStream_t stream, \ + const int32_t max_seq_length, \ + const int32_t* seq_lengths, \ const int32_t batch_size, \ const int32_t hidden_size, \ const T* data, \ @@ -203,7 +152,7 @@ void MaskZeroSequences(cudaStream_t stream, const T* data, \ T* reordered_data, \ const size_t N); \ -template void MaskZeroSequences(cudaStream_t stream, \ +template void MaskZeroSequences(cudaStream_t stream, \ const int32_t hidden_size, \ T* y_output_data, \ T* y_h_output_data, \ diff --git a/onnxruntime/core/providers/cuda/rnn/rnn_impl.h b/onnxruntime/core/providers/cuda/rnn/rnn_impl.h index 9844e04ff6ec5..ba876011f6b67 100644 --- a/onnxruntime/core/providers/cuda/rnn/rnn_impl.h +++ b/onnxruntime/core/providers/cuda/rnn/rnn_impl.h @@ -10,7 +10,8 @@ namespace cuda { template void ReverseBySequence(cudaStream_t stream, - const int32_t seq_length, + const int32_t max_seq_length, + const int32_t* seq_lengths, const int32_t batch_size, const int32_t input_or_hidden_size, const T* data, @@ -26,17 +27,6 @@ void ReorderBidirectionalDataInSequence(cudaStream_t stream, T* reordered_data, const size_t N); -template -void RnnMaskImpl(cudaStream_t stream, - const int32_t num_directions, - const int32_t seq_length, - const int32_t batch_size, - const int32_t hidden_size, - const int32_t* sequence_lens, - T* y_output_data, - T* y_h_output_data, - const size_t N); - template void MaskZeroSequences(cudaStream_t stream, const int32_t hidden_size, diff --git a/onnxruntime/test/providers/cpu/rnn/rnn_op_test.cc b/onnxruntime/test/providers/cpu/rnn/rnn_op_test.cc index b9875b9553a55..1a31743e2f7e7 100644 --- a/onnxruntime/test/providers/cpu/rnn/rnn_op_test.cc +++ b/onnxruntime/test/providers/cpu/rnn/rnn_op_test.cc @@ -120,15 +120,11 @@ TEST(RNNTest, RNN_bidirectional_bias_initial_zigged_batch) { test.AddOutput("Y_h", Y_h_dims, Y_h_data); // TensorRT failed on RNN tests - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } // Doesn't work with CUDA 11.4 on Windows. Need investigation. -#if defined(USE_CUDA) && defined(_WIN32) -TEST(RNNTest, DISABLED_RNN_bidirectional_zigged_batch) { -#else TEST(RNNTest, RNN_bidirectional_zigged_batch) { -#endif OpTester test("RNN"); int64_t num_directions = 2, input_size = 2, hidden_size = 3, seq_length = 5; @@ -275,15 +271,11 @@ TEST(RNNTest, RNN_reverse_direction_zigged_batch) { std::vector Y_h_data({0.87014002F, 0.09402763F, -0.54269236F, 0.64809889F, -0.19472955F, -0.24271242F}); test.AddOutput("Y_h", Y_h_dims, Y_h_data); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } // Doesn't work with CUDA 11.4 on Windows. Need investigation. -#if defined(USE_CUDA) && defined(_WIN32) -TEST(RNNTest, DISABLED_RNN_forward_direction_zigged_batch) { -#else TEST(RNNTest, RNN_forward_direction_zigged_batch) { -#endif OpTester test("RNN"); int64_t num_directions = 1, input_size = 2, hidden_size = 3, seq_length = 5; @@ -357,12 +349,7 @@ TEST(RNNTest, RNN_forward_direction_zigged_batch) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } -// Doesn't work with CUDA 11.4 on Windows. Need investigation. -#if defined(USE_CUDA) && defined(_WIN32) -TEST(RNNTest, DISABLED_RNN_bidirectional_0) { -#else TEST(RNNTest, RNN_bidirectional_0) { -#endif OpTester test("RNN"); int64_t num_directions = 2, input_size = 2, hidden_size = 3, batch_size = 1, seq_length = 5; @@ -424,12 +411,7 @@ TEST(RNNTest, RNN_bidirectional_0) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } -// Doesn't work with CUDA 11.4 on Windows. Need investigation. -#if defined(USE_CUDA) && defined(_WIN32) -TEST(RNNTest, DISABLED_RNN_bidirectional_1) { -#else TEST(RNNTest, RNN_bidirectional_1) { -#endif OpTester test("RNN"); int64_t num_directions = 2, input_size = 2, hidden_size = 2, batch_size = 1, seq_length = 1; @@ -597,7 +579,7 @@ TEST(RNNTest, DISABLED_RNN_default_attributes_and_forward_direction) { } } -TEST(RNNTest, DISABLED_RNN_reverse_direction) { +TEST(RNNTest, RNN_reverse_direction) { int64_t num_directions = 1, input_size = 2, hidden_size = 3, batch_size = 1, seq_length = 5; // In case of useDefault, attributes, inputs or outputs are not set. From aec2389ad0463d218b8cf3b1e245d4c34e98364a Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Fri, 23 Feb 2024 12:52:47 -0800 Subject: [PATCH 41/68] [js/webgpu] allows a ProgramInfo's RunData to use zero sized output (#19614) ### Description This PR allows zero-sized output. To make the implementation simple, it does not support partial zero-sized tensor. Which means, either all outputs are zero-sized, or an error will be reported. added 2 tests: - op test of `Add` with input T[2,0] T[2,1], and - test_split_zero_size_splits --- js/web/lib/wasm/jsep/backend-webgpu.ts | 32 ++++++++++++++++++++++---- js/web/lib/wasm/jsep/init.ts | 3 ++- js/web/lib/wasm/jsep/util.ts | 11 ++++++++- js/web/test/data/ops/add.jsonc | 22 ++++++++++++++++++ js/web/test/suite-test-list.jsonc | 2 +- js/web/test/test-runner.ts | 10 ++++++-- 6 files changed, 71 insertions(+), 9 deletions(-) diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts index 98990a6fe477b..3e3a191ec3ead 100644 --- a/js/web/lib/wasm/jsep/backend-webgpu.ts +++ b/js/web/lib/wasm/jsep/backend-webgpu.ts @@ -385,11 +385,16 @@ export class WebGpuBackend { // create info for inputs const inputDatas: GpuData[] = []; for (let i = 0; i < inputTensorViews.length; ++i) { - const gpuData = this.gpuDataManager.get(inputTensorViews[i].data); + const data = inputTensorViews[i].data; + // if tensor view data is 0, it means the output is zero-sized tensor, and there is no GPU data for it. + if (data === 0) { + continue; + } + const gpuData = this.gpuDataManager.get(data); if (!gpuData) { - throw new Error(`no GPU data for input: ${inputTensorViews[i].data}`); + throw new Error(`no GPU data for input: ${data}`); } - inputDatas[i] = gpuData; + inputDatas.push(gpuData); } const {outputs, dispatchGroup, programUniforms} = program.getRunData(inputTensorViews); @@ -419,6 +424,11 @@ export class WebGpuBackend { const tensorView = (isTemporary || isPersistent) ? createIntermediateOutput(outputs[i].dataType, outputs[i].dims) : createKernelOutput(validatedOutputIndices[i], outputs[i].dataType, outputs[i].dims); + outputTensorViews.push(tensorView); + // if tensor view data is 0, it means the output is zero-sized tensor, and there is no GPU data for it. + if (tensorView.data === 0) { + continue; + } const gpuData = this.gpuDataManager.get(tensorView.data); if (!gpuData) { throw new Error(`no GPU data for output: ${tensorView.data}`); @@ -434,10 +444,24 @@ export class WebGpuBackend { } persistentData.push(gpuData); } - outputTensorViews.push(tensorView); outputDatas.push(gpuData); } + // when there are any zero-sized tensor in the inputs or outputs, we should report error unless all outputs are + // zero-sized tensors. + if (inputDatas.length !== inputTensorViews.length || outputDatas.length !== outputTensorViews.length) { + // if all outputs are zero-sized tensors, there is no need to run the program. + if (outputDatas.length === 0) { + TRACE_FUNC_END(program.name); + return outputTensorViews; + } + // if some outputs are zero-sized tensors, report an error. + // + // TODO: so far we don't see any use case that outputs include both zero-sized tensors and non-zero-sized tensors. + // If we see such use case, we need to make a change here to support it. + throw new Error( + `Program ${program.name} has zero-sized tensor(s) in inputs or outputs. This is not supported now.`); + } // load uniforms // TODO: add cache for uniform (is it necessary?) diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts index 786ae41646554..b64abf9cc5424 100644 --- a/js/web/lib/wasm/jsep/init.ts +++ b/js/web/lib/wasm/jsep/init.ts @@ -104,7 +104,8 @@ class ComputeContextImpl implements ComputeContext { throw new Error(`Unsupported data type: ${dataType}`); } const bufferSize = elementSize * ShapeUtil.size(dims); - return new TensorViewImpl(this.module, dataType, this.backend.gpuDataManager.create(bufferSize).id, dims); + const gpuDataId = bufferSize > 0 ? this.backend.gpuDataManager.create(bufferSize).id : 0; + return new TensorViewImpl(this.module, dataType, gpuDataId, dims); }; return this.backend.run(program, mappedInputs, outputIndices, createKernelOutput, createTemporaryOutput); } diff --git a/js/web/lib/wasm/jsep/util.ts b/js/web/lib/wasm/jsep/util.ts index c0517ce363644..9a1d5463f7843 100644 --- a/js/web/lib/wasm/jsep/util.ts +++ b/js/web/lib/wasm/jsep/util.ts @@ -56,7 +56,16 @@ export class BroadcastUtil { if (aLen !== bLen && aLen > 1 && bLen > 1) { return undefined; } - cdims[crank - i] = Math.max(aLen, bLen); + const max = Math.max(aLen, bLen); + if (aLen && bLen) { + cdims[crank - i] = Math.max(aLen, bLen); + } else { + // when either aLen or bLen is 0, the other should be either 0 or 1, otherwise it is not broadcastable. + if (max > 1) { + return undefined; + } + cdims[crank - i] = 0; + } } return cdims; diff --git a/js/web/test/data/ops/add.jsonc b/js/web/test/data/ops/add.jsonc index e5b4ff2b53148..dd15134861ef0 100644 --- a/js/web/test/data/ops/add.jsonc +++ b/js/web/test/data/ops/add.jsonc @@ -157,6 +157,28 @@ "type": "float32" } ] + }, + { + "name": "T[2,0] T[2,1]", + "inputs": [ + { + "data": [], + "dims": [2, 0], + "type": "float32" + }, + { + "data": [1, 2], + "dims": [2, 1], + "type": "float32" + } + ], + "outputs": [ + { + "data": [], + "dims": [2, 0], + "type": "float32" + } + ] } ] } diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index b43b1ac37e37d..88555a27be82e 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -1231,7 +1231,7 @@ "test_split_variable_parts_1d", "test_split_variable_parts_2d", "test_split_variable_parts_default_axis", - // // "test_split_zero_size_splits", + "test_split_zero_size_splits", "test_sqrt_example", "test_sqrt", "test_squeeze_negative_axes", diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts index a4adf5c4ce144..7c03e5b915fd7 100644 --- a/js/web/test/test-runner.ts +++ b/js/web/test/test-runner.ts @@ -573,7 +573,9 @@ export async function sessionRun(options: { // replace the CPU tensors in feeds into GPU tensors for (const name in feeds) { if (Object.hasOwnProperty.call(feeds, name)) { - feeds[name] = createGpuTensorForInput(feeds[name]); + if (feeds[name].size > 0) { + feeds[name] = createGpuTensorForInput(feeds[name]); + } } } } @@ -582,7 +584,11 @@ export async function sessionRun(options: { for (const name in options.outputsMetaInfo) { if (Object.hasOwnProperty.call(options.outputsMetaInfo, name)) { const {type, dims} = options.outputsMetaInfo[name]; - fetches[name] = createGpuTensorForOutput(type, dims); + if (dims.some(d => d === 0)) { + fetches[name] = new ort.Tensor(type, [], dims); + } else { + fetches[name] = createGpuTensorForOutput(type, dims); + } } } } From bb43a0f1338b05e93fcbbe5c5cb53ebf017625ba Mon Sep 17 00:00:00 2001 From: Guenther Schmuelling Date: Fri, 23 Feb 2024 15:45:30 -0800 Subject: [PATCH 42/68] [js/webgpu] minor fixes to make tinyllama work (#19564) --- js/web/lib/wasm/jsep/webgpu/ops/concat.ts | 4 +++- js/web/lib/wasm/jsep/webgpu/ops/gather.ts | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/concat.ts b/js/web/lib/wasm/jsep/webgpu/ops/concat.ts index b06c9fb496d15..b142a82e551a7 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/concat.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/concat.ts @@ -154,7 +154,9 @@ const createConcatProgramInfo = (inputs: readonly TensorView[], axis: number): P export const concat = (context: ComputeContext, attributes: ConcatAttributes): void => { validateInputs(context.inputs); - context.compute(createConcatProgramInfo(context.inputs, attributes.axis)); + // 0 length tensors are valid for concat, remove them + const nonEmptyInputs = context.inputs.filter(input => ShapeUtil.size(input.dims) > 0); + context.compute(createConcatProgramInfo(nonEmptyInputs, attributes.axis), {inputs: nonEmptyInputs}); }; export const parseConcatAttributes = (attributes: Record): ConcatAttributes => diff --git a/js/web/lib/wasm/jsep/webgpu/ops/gather.ts b/js/web/lib/wasm/jsep/webgpu/ops/gather.ts index 5c31e6dd86c00..d48bb909f7f8f 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/gather.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/gather.ts @@ -55,7 +55,7 @@ const createGatherProgramInfo = (inputs: readonly TensorView[], attributes: Gath if (idx${x} < 0) { idx${x} = idx${x} + uniforms.axisDimLimit; } - var dataIndices${x} = ${data.type.indices}(0); + var dataIndices${x} : ${data.type.indices}; `; for (let i = 0, j = 0; i < inputRank; i++) { if (i === axis) { From 46c4d7fe4ad457d517fe92db7681c38849c51beb Mon Sep 17 00:00:00 2001 From: Sheil Kumar Date: Fri, 23 Feb 2024 18:20:22 -0800 Subject: [PATCH 43/68] Disable gemm activation for non-float data types (#19612) ### Description Disable gemm activation for non-float data types ### Motivation and Context When a float16 model contains a Gemm+Relu subgraph, the gemm_activation_fusion will kick in and cause the two nodes to be eliminated and replaced with a FusedGemm. This however is only registered for the float data type. This causes model load failures. Disable the fusion for non-float data types. --------- Co-authored-by: Sheil Kumar --- onnxruntime/core/optimizer/gemm_activation_fusion.cc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/onnxruntime/core/optimizer/gemm_activation_fusion.cc b/onnxruntime/core/optimizer/gemm_activation_fusion.cc index c62887da09fdc..50be2cbd48f7b 100644 --- a/onnxruntime/core/optimizer/gemm_activation_fusion.cc +++ b/onnxruntime/core/optimizer/gemm_activation_fusion.cc @@ -56,6 +56,13 @@ Status GemmActivationFusion::ApplyImpl(Graph& graph, bool& modified, int graph_l continue; } + NodeArg* node_output = node.MutableOutputDefs()[0]; + auto data_type = node_output->TypeAsProto()->tensor_type().elem_type(); + if (data_type != ONNX_NAMESPACE::TensorProto_DataType_FLOAT) { + // FusedGemm is only registered for float data type in fused_gemm.cc! + continue; + } + const Node& next_node = *(node.OutputNodesBegin()); if (!IsFusableActivation(next_node) || next_node.GetExecutionProviderType() != node.GetExecutionProviderType()) { continue; From c12a20bef95df5437189687b94e7ba2f1bad1505 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Sat, 24 Feb 2024 14:06:30 +1000 Subject: [PATCH 44/68] Add helper to run CIs for a branch using `az pipelines`. (#16843) ### Description Add helper to run CIs for a branch using `az pipelines`. This can be used to easily kick off multiple CIs for a branch prior to creating a PR. Update run_CIs_for_external_pr.py so the CI list can be shared. Request json output from `gh pr view` so the current state is more easily parsed. ### Motivation and Context --- tools/python/run_CIs_for_branch.py | 116 +++++++++++++++++++++++ tools/python/run_CIs_for_external_pr.py | 120 +++++++++++++----------- 2 files changed, 181 insertions(+), 55 deletions(-) create mode 100644 tools/python/run_CIs_for_branch.py diff --git a/tools/python/run_CIs_for_branch.py b/tools/python/run_CIs_for_branch.py new file mode 100644 index 0000000000000..c507cae0d9f43 --- /dev/null +++ b/tools/python/run_CIs_for_branch.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python3 +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +import argparse +import json +import os +import subprocess +import sys +import typing + +from run_CIs_for_external_pr import get_pipeline_names +from util.platform_helpers import is_windows + + +def _parse_args(): + parser = argparse.ArgumentParser( + os.path.basename(__file__), + formatter_class=argparse.RawDescriptionHelpFormatter, + description="""Run the CIs used to validate PRs for the specified branch. + + If specified, the `--include` filter is applied first, followed by any `--exclude` filter. + + Requires the Azure CLI with DevOps extension to be installed. + Azure CLI: https://learn.microsoft.com/en-us/cli/azure/install-azure-cli + DevOps extension: https://github.com/Azure/azure-devops-cli-extension + + Configuration: + Login:`az login` + Configure ORT repo as default: + `az devops configure --defaults organization=https://dev.azure.com/onnxruntime project=onnxruntime` + + Example usage: + List all CIs + `python run_CIs_for_branch.py --dry-run my/BranchName` + Run all CIs + `python run_CIs_for_branch.py my/BranchName` + Run only Linux CIs + `python run_CIs_for_branch.py --include linux my/BranchName` + Exclude training CIs + `python run_CIs_for_branch.py --exclude training my/BranchName` + Run non-training Linux CIs + `python run_CIs_for_branch.py --include linux --exclude training my/BranchName` + """, + ) + + parser.add_argument("-i", "--include", type=str, help="Include CIs that match this string. Case insensitive.") + parser.add_argument("-e", "--exclude", type=str, help="Exclude CIs that match this string. Case insensitive.") + parser.add_argument("--dry-run", action="store_true", help="Print selected CIs but do not run them.") + parser.add_argument("branch", type=str, help="Specify the branch to run.") + + args = parser.parse_args() + return args + + +def _run_az_pipelines_command(command: typing.List[str]): + try: + az = "az.cmd" if is_windows() else "az" + az_output = subprocess.run([az, "pipelines", *command], capture_output=True, text=True, check=True) + except subprocess.CalledProcessError as cpe: + print(cpe) + print(cpe.stderr) + sys.exit(-1) + + return az_output + + +def main(): + args = _parse_args() + branch = args.branch + + # To debug available pipelines: + # az_out = az_pipelines = _run_az_pipelines_command(["list"]) + # pipeline_info = json.loads(az_out.stdout) + # print(pipeline_info) + + pipelines = get_pipeline_names() + pipelines_to_run = [] + if args.include: + value = args.include.lower().strip() + for p in pipelines: + if value in p.lower(): + print(f"Including {p}") + pipelines_to_run.append(p) + else: + pipelines_to_run = pipelines + + if args.exclude: + value = args.exclude.lower().strip() + cur_pipelines = pipelines_to_run + pipelines_to_run = [] + for p in cur_pipelines: + if value in p.lower(): + print(f"Excluding {p}") + else: + pipelines_to_run.append(p) + + print("Pipelines to run:") + for p in pipelines_to_run: + print(f"\t{p}") + + if args.dry_run: + sys.exit(0) + + for pipeline in pipelines_to_run: + az_out = _run_az_pipelines_command(["run", "--branch", branch, "--name", pipeline]) + run_output = json.loads(az_out.stdout) + if "id" in run_output: + build_url = f"https://dev.azure.com/onnxruntime/onnxruntime/_build/results?buildId={run_output['id']}" + print(f"{pipeline} build results: {build_url}&view=results") + else: + raise ValueError("Build id was not found in az output:\n" + run_output) + + +if __name__ == "__main__": + main() diff --git a/tools/python/run_CIs_for_external_pr.py b/tools/python/run_CIs_for_external_pr.py index df4e70b1e51fe..dcafe898b3bdf 100644 --- a/tools/python/run_CIs_for_external_pr.py +++ b/tools/python/run_CIs_for_external_pr.py @@ -3,13 +3,54 @@ # Licensed under the MIT License. import argparse +import json import os import subprocess import sys import typing -def parse_args(): +def get_pipeline_names(): + # Current pipelines. These change semi-frequently and may need updating. + # There is no easy way to get the list of "required" pipelines using `azp` before they are run, + # so we need to maintain this list manually. + # NOTE: This list is also used by run_CIs_for_branch.py + pipelines = [ + # windows + "Windows ARM64 QNN CI Pipeline", + "Windows x64 QNN CI Pipeline", + "Windows CPU CI Pipeline", + "Windows GPU CI Pipeline", + "Windows GPU TensorRT CI Pipeline", + "ONNX Runtime Web CI Pipeline", + # linux + "Linux CPU CI Pipeline", + "Linux CPU Minimal Build E2E CI Pipeline", + "Linux GPU CI Pipeline", + "Linux GPU TensorRT CI Pipeline", + "Linux OpenVINO CI Pipeline", + "Linux QNN CI Pipeline", + # mac + "MacOS CI Pipeline", + # training + "orttraining-amd-gpu-ci-pipeline", + "orttraining-linux-ci-pipeline", + "orttraining-linux-gpu-ci-pipeline", + "orttraining-ortmodule-distributed", + # checks + "onnxruntime-binary-size-checks-ci-pipeline", + # big models + "Big Models", + # not currently required, but running ensures we're hitting all mobile platforms + "Android CI Pipeline", + "iOS CI Pipeline", + "ONNX Runtime React Native CI Pipeline", + ] + + return pipelines + + +def _parse_args(): parser = argparse.ArgumentParser( os.path.basename(__file__), formatter_class=argparse.RawDescriptionHelpFormatter, @@ -25,7 +66,7 @@ def parse_args(): return args -def run_gh_pr_command(command: typing.List[str], check=True): +def run_gh_pr_command(command: typing.List[str], check: bool = True): try: return subprocess.run(["gh", "pr", *command], capture_output=True, text=True, check=check) except subprocess.CalledProcessError as cpe: @@ -35,23 +76,25 @@ def run_gh_pr_command(command: typing.List[str], check=True): def main(): - args = parse_args() + args = _parse_args() pr_id = args.pr # validate PR - gh_out = run_gh_pr_command(["view", pr_id]) - info = gh_out.stdout.split("\n") - for line in info: - pieces = line.split("\t") - if len(pieces) != 2: - continue - - if pieces[0] == "state:": - if pieces[1] != "OPEN": - print(f"PR {pr_id} is not OPEN. Currently in state {pieces[1]}.") - sys.exit(-1) - - print("Check passed pipelines") + print("Checking PR is open") + gh_out = run_gh_pr_command(["view", "--json", "state", pr_id]) + info = json.loads(gh_out.stdout) + if "state" not in info: + print(f"Could not get current state from `gh pr view` response of\n{gh_out.stdout}") + sys.exit(-1) + + if info["state"] != "OPEN": + print(f"PR {pr_id} is not OPEN. Currently in state {info['state']}.") + sys.exit(0) + + # This will return CIs that have run previously but not passed. We filter the CIs to run based on this, so it's + # fine for the initial response to have no info in it. + # `gh pr checks` exits with non-zero exit code when failures in pipeline exist, so we set `check` to False. + print("Checking for pipelines that have passed.") gh_out = run_gh_pr_command(["checks", pr_id, "--required"], check=False) # output format is a tab separated list of columns: # (pipeline name) "\t" (status) "\t" (ran time) "\t" (url) @@ -61,54 +104,21 @@ def main(): if len(columns) == 4 and columns[1] == "pass" ] - print("Adding azp run commands") - - # Current pipelines. These change semi-frequently and may need updating. - # - # Note: there is no easy way to get the list for azp "required" pipelines before they starts. - # we need to maintain this list manually. - # - pipelines = [ - # windows - "Windows ARM64 QNN CI Pipeline", - "Windows x64 QNN CI Pipeline", - "Windows CPU CI Pipeline", - "Windows GPU CI Pipeline", - "Windows GPU TensorRT CI Pipeline", - "ONNX Runtime Web CI Pipeline", - # linux - "Linux CPU CI Pipeline", - "Linux CPU Minimal Build E2E CI Pipeline", - "Linux GPU CI Pipeline", - "Linux GPU TensorRT CI Pipeline", - "Linux OpenVINO CI Pipeline", - "Linux QNN CI Pipeline", - # mac - "MacOS CI Pipeline", - # training - "orttraining-amd-gpu-ci-pipeline", - "orttraining-linux-ci-pipeline", - "orttraining-linux-gpu-ci-pipeline", - "orttraining-ortmodule-distributed", - # checks - "onnxruntime-python-checks-ci-pipeline", - "onnxruntime-binary-size-checks-ci-pipeline", - # big models - "Big Models", - # not currently required, but running ensures we're hitting all mobile platforms - "Android CI Pipeline", - "iOS CI Pipeline", - "ONNX Runtime React Native CI Pipeline", - ] + pipelines = get_pipeline_names() # remove pipelines that have already run successfully pipelines = [p for p in pipelines if p not in checked_pipelines] + print("Pipelines to run:") + for p in pipelines: + print("\t" + p) + # azp run is limited to 10 pipelines at a time max_pipelines_per_comment = 10 start = 0 num_pipelines = len(pipelines) + print("Adding azp run commands") while start < num_pipelines: end = start + max_pipelines_per_comment if end > num_pipelines: From 9ccdc4961ad76355289ed3a36ccb8307e8dc7789 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Fri, 23 Feb 2024 22:31:57 -0800 Subject: [PATCH 45/68] Stop using apiset in OneCore build: use onecoreuap.lib instead of onecoreuap_apiset.lib (#19632) ### Description Stop using apiset in OneCore build: use onecoreuap.lib instead of onecoreuap_apiset.lib in onecore build. ### Motivation and Context 1. Now all Windows Editions come with Reverse Forwarders. We should just use the normal onecore libs. 2. Many new Windows APIs are only available in [windows umbrella libraries](https://learn.microsoft.com/en-us/windows/win32/apiindex/windows-umbrella-libraries). So these libraries are not specific for Windows CoreOS or Onecore. 3. Going forward we should use "IsApiSetImplemented" to guard our API usages: https://learn.microsoft.com/en-us/windows/win32/apiindex/detect-api-set-availability . After this change, our built binaries can pass apivalidator's check. ``` C:\local\apivalidator>apivalidator.exe -BinaryPath:C:\src\onnxruntime\b\Debug\Debug\onnxruntime.dll -SupportedApiXmlFiles:onecoreuap_DDIs.xml ApiValidation: Summary: "C:\src\onnxruntime\b\Debug\Debug\onnxruntime.dll" is Universal ApiValidation: All binaries are Universal ``` So it will give an easy way to test ONNX Runtime's compatibility to Windows versions. --- cmake/CMakeLists.txt | 6 ++---- cmake/wcos_rules_override.cmake | 4 ++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index c9be4aa65d0cc..ed9043f2adc4a 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -1729,14 +1729,12 @@ if(onnxruntime_BUILD_KERNEL_EXPLORER) endif() # When GDK_PLATFORM is set then WINAPI_FAMILY is defined in gdk_toolchain.cmake (along with other relevant flags/definitions). -if (WIN32 AND NOT GDK_PLATFORM) +if (WIN32 AND NOT GDK_PLATFORM AND NOT CMAKE_CROSSCOMPILING) if (NOT CMAKE_CXX_STANDARD_LIBRARIES MATCHES kernel32.lib) # On onecore, link to the onecore build of the MSVC runtime get_filename_component(msvc_path "${CMAKE_C_COMPILER}/../../../.." ABSOLUTE) link_directories(BEFORE "${msvc_path}/lib/onecore/${onnxruntime_target_platform}") - # The .lib files in the MSVC runtime have a DEFAULITLIB entry for onecore.lib, which in turn links to reverse forwarders. - # We ignore that entry and use onecore_apiset.lib instead, since system components must not rely on reverse forwarders. - add_link_options("/NODEFAULTLIB:onecore.lib") + # The .lib files in the MSVC runtime have a DEFAULITLIB entry for onecore.lib, but it shold not cause any conflict with onecoreuap.lib endif() endif() diff --git a/cmake/wcos_rules_override.cmake b/cmake/wcos_rules_override.cmake index f3d8093629a42..ec2303b073d5e 100644 --- a/cmake/wcos_rules_override.cmake +++ b/cmake/wcos_rules_override.cmake @@ -1,2 +1,2 @@ -set(CMAKE_C_STANDARD_LIBRARIES_INIT onecoreuap_apiset.lib) -set(CMAKE_CXX_STANDARD_LIBRARIES_INIT onecoreuap_apiset.lib) +set(CMAKE_C_STANDARD_LIBRARIES_INIT onecoreuap.lib) +set(CMAKE_CXX_STANDARD_LIBRARIES_INIT onecoreuap.lib) From 0edb03580823c9d9e97ba1a6ea941fcd70a2500b Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Sat, 24 Feb 2024 10:09:07 -0800 Subject: [PATCH 46/68] [js/web] fix suite test list for zero sized tensor (#19638) ### Description Fixes build break brought by #19614 Currently WebGL backend does not support zero sized tensor. This change split test data into 2 parts, and only enable zero sized tensor tests for WebGPU. --- js/web/test/data/ops/add.jsonc | 22 - js/web/test/data/ops/add_zero-sized.jsonc | 31 + js/web/test/data/ops/concat_zero-sized.jsonc | 561 +++++++++++++++++++ js/web/test/suite-test-list.jsonc | 2 + 4 files changed, 594 insertions(+), 22 deletions(-) create mode 100644 js/web/test/data/ops/add_zero-sized.jsonc create mode 100644 js/web/test/data/ops/concat_zero-sized.jsonc diff --git a/js/web/test/data/ops/add.jsonc b/js/web/test/data/ops/add.jsonc index dd15134861ef0..e5b4ff2b53148 100644 --- a/js/web/test/data/ops/add.jsonc +++ b/js/web/test/data/ops/add.jsonc @@ -157,28 +157,6 @@ "type": "float32" } ] - }, - { - "name": "T[2,0] T[2,1]", - "inputs": [ - { - "data": [], - "dims": [2, 0], - "type": "float32" - }, - { - "data": [1, 2], - "dims": [2, 1], - "type": "float32" - } - ], - "outputs": [ - { - "data": [], - "dims": [2, 0], - "type": "float32" - } - ] } ] } diff --git a/js/web/test/data/ops/add_zero-sized.jsonc b/js/web/test/data/ops/add_zero-sized.jsonc new file mode 100644 index 0000000000000..37e08cd7f20ac --- /dev/null +++ b/js/web/test/data/ops/add_zero-sized.jsonc @@ -0,0 +1,31 @@ +[ + { + "name": "Add with no attributes", + "operator": "Add", + "attributes": [], + "cases": [ + { + "name": "T[2,0] T[2,1]", + "inputs": [ + { + "data": [], + "dims": [2, 0], + "type": "float32" + }, + { + "data": [1, 2], + "dims": [2, 1], + "type": "float32" + } + ], + "outputs": [ + { + "data": [], + "dims": [2, 0], + "type": "float32" + } + ] + } + ] + } +] diff --git a/js/web/test/data/ops/concat_zero-sized.jsonc b/js/web/test/data/ops/concat_zero-sized.jsonc new file mode 100644 index 0000000000000..7be8e8c1cc602 --- /dev/null +++ b/js/web/test/data/ops/concat_zero-sized.jsonc @@ -0,0 +1,561 @@ +[ + { + "name": "Concat 2D axis=0", + "operator": "Concat", + "attributes": [{ "name": "axis", "data": -2, "type": "int" }], + "cases": [ + { + "name": "X", + "inputs": [ + { + "data": [], + "dims": [1, 4, 0, 64], + "type": "float32" + }, + { + "data": [ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + ], + "dims": [1, 4, 36, 64], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + ], + "dims": [1, 4, 36, 64], + "type": "float32" + } + ] + } + ] + } +] diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index 88555a27be82e..e96a0aa045bc8 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -1334,6 +1334,7 @@ "acos.jsonc", "add.jsonc", "add_int32.jsonc", + "add_zero-sized.jsonc", //"and.jsonc", "asin.jsonc", "attention.jsonc", @@ -1343,6 +1344,7 @@ "ceil.jsonc", "concat.jsonc", "concat_int32.jsonc", + "concat_zero-sized.jsonc", "cast.jsonc", "conv.jsonc", "cos.jsonc", From c980149c857facc2463668a11944af3c6c12365b Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Sun, 25 Feb 2024 05:00:53 +0800 Subject: [PATCH 47/68] Add log for random exception in Linux GPU Test Stage. (#19569) ### Description 1. check GPU status in docker 2. use stages to make test stage can leverage existing building artifacts ### Motivation and Context To investigate the root cause of the random exception `CUDA failure 100: no CUDA-capable device is detected` --- .../azure-pipelines/linux-gpu-ci-pipeline.yml | 351 ++++++++++-------- 1 file changed, 198 insertions(+), 153 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml index 24319184dd0b8..822bc559d992d 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml @@ -34,6 +34,17 @@ parameters: values: - 11.8 - 12.2 + + - name: SpecificArtifact + displayName: Use Specific Artifact + type: boolean + default: false + + - name: BuildId + displayName: Specific Artifact's BuildId + type: string + default: '0' + resources: repositories: - repository: manylinux @@ -61,163 +72,197 @@ variables: ${{ if eq(parameters.CudaVersion, '12.2') }}: value: 'onnxruntimecuda12build' -jobs: -- job: Linux_Build - timeoutInMinutes: 120 - variables: - skipComponentGovernanceDetection: true - CCACHE_DIR: $(Pipeline.Workspace)/ccache - workspace: - clean: all - pool: onnxruntime-Ubuntu2204-AMD-CPU - - steps: - - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 - displayName: 'Clean Agent Directories' - condition: always() - - - checkout: self - clean: true - submodules: none - - - template: templates/get-docker-image-steps.yml - parameters: - Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda - Context: tools/ci_build/github/linux/docker - DockerBuildArgs: " - --network=host - --build-arg BASEIMAGE=$(docker_base_image) - --build-arg TRT_VERSION=$(linux_trt_version) - --build-arg BUILD_UID=$( id -u ) - " - Repository: $(Repository) - - - task: Cache@2 - inputs: - key: '"ccache" | "${{parameters.CudaVersion}}" |"$(Build.SourceBranch)" | "$(Build.SourceVersion)"' - path: $(CCACHE_DIR) - restoreKeys: | - "ccache" | "${{parameters.CudaVersion}}" | "$(Build.SourceBranch)" - "ccache" - cacheHitVar: CACHE_RESTORED - displayName: Cach Task - - - script: | - sudo mkdir -p $(Pipeline.Workspace)/ccache - condition: ne(variables.CACHE_RESTORED, 'true') - displayName: Create Cache Dir - - - script: | - set -e -x - mkdir -p $HOME/.onnx - docker run -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" --rm \ - --volume /data/onnx:/data/onnx:ro \ - --volume $(Build.SourcesDirectory):/onnxruntime_src \ - --volume $(Build.BinariesDirectory):/build \ - --volume /data/models:/build/models:ro \ - --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \ - --volume $(Pipeline.Workspace)/ccache:/cache \ - -e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \ - -e NIGHTLY_BUILD \ - -e BUILD_BUILDNUMBER \ - -e CCACHE_DIR=/cache \ - $(Repository) \ - /bin/bash -c " - set -ex; \ - env; \ - ccache -s; \ - /opt/python/cp38-cp38/bin/python3 /onnxruntime_src/tools/ci_build/build.py \ - --build_dir /build --cmake_generator Ninja \ - --config Release --update --build \ - --skip_submodule_sync \ - --build_shared_lib \ - --parallel --use_binskim_compliant_compile_flags \ - --build_wheel \ - --enable_onnx_tests --use_cuda --cuda_version=${{parameters.CudaVersion}} --cuda_home=/usr/local/cuda-${{parameters.CudaVersion}} --cudnn_home=/usr/local/cuda-${{parameters.CudaVersion}} \ - --enable_cuda_profiling --enable_cuda_nhwc_ops \ - --enable_pybind --build_java \ - --use_cache \ - --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86; \ - ccache -sv; \ - ccache -z" - workingDirectory: $(Build.SourcesDirectory) - displayName: Build Onnxruntime - - - task: CmdLine@2 - inputs: - script: | - rm -rf $(Build.BinariesDirectory)/Release/onnxruntime $(Build.BinariesDirectory)/Release/pybind11 - rm -f $(Build.BinariesDirectory)/Release/models - find $(Build.BinariesDirectory)/Release/_deps -mindepth 1 ! -regex '^$(Build.BinariesDirectory)/Release/_deps/onnx-src\(/.*\)?' -delete - cd $(Build.BinariesDirectory)/Release - find -executable -type f > $(Build.BinariesDirectory)/Release/perms.txt - - - task: PublishPipelineArtifact@0 - displayName: 'Publish Pipeline Artifact' - inputs: - artifactName: 'drop-linux' - targetPath: '$(Build.BinariesDirectory)/Release' - - - template: templates/explicitly-defined-final-tasks.yml - -- job: Linux_Test - timeoutInMinutes: 180 - variables: - skipComponentGovernanceDetection: true - workspace: - clean: all - pool: onnxruntime-Linux-GPU-A10 - dependsOn: - - Linux_Build - steps: - - task: DownloadPipelineArtifact@2 - displayName: 'Download Pipeline Artifact' - inputs: - buildType: 'current' - artifactName: 'drop-linux' - targetPath: '$(Build.BinariesDirectory)/Release' - - - checkout: self - clean: true - submodules: none - - - template: templates/get-docker-image-steps.yml - parameters: - Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda - Context: tools/ci_build/github/linux/docker - DockerBuildArgs: " - --network=host - --build-arg BASEIMAGE=$(docker_base_image) - --build-arg TRT_VERSION=$(linux_trt_version) - --build-arg BUILD_UID=$( id -u ) - " - Repository: $(Repository) - - - task: CmdLine@2 - inputs: - script: | +stages: +- stage: Linux_Build + jobs: + - job: Linux_Build + timeoutInMinutes: 120 + variables: + skipComponentGovernanceDetection: true + CCACHE_DIR: $(Pipeline.Workspace)/ccache + workspace: + clean: all + pool: onnxruntime-Ubuntu2204-AMD-CPU + + steps: + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() + + - checkout: self + clean: true + submodules: none + + - template: templates/get-docker-image-steps.yml + parameters: + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda + Context: tools/ci_build/github/linux/docker + DockerBuildArgs: " + --network=host + --build-arg BASEIMAGE=$(docker_base_image) + --build-arg TRT_VERSION=$(linux_trt_version) + --build-arg BUILD_UID=$( id -u ) + " + Repository: $(Repository) + + - task: Cache@2 + inputs: + key: '"ccache" | "${{parameters.CudaVersion}}" |"$(Build.SourceBranch)" | "$(Build.SourceVersion)"' + path: $(CCACHE_DIR) + restoreKeys: | + "ccache" | "${{parameters.CudaVersion}}" | "$(Build.SourceBranch)" + "ccache" + cacheHitVar: CACHE_RESTORED + displayName: Cach Task + + - script: | + sudo mkdir -p $(Pipeline.Workspace)/ccache + condition: ne(variables.CACHE_RESTORED, 'true') + displayName: Create Cache Dir + + - script: | set -e -x mkdir -p $HOME/.onnx - docker run --gpus all --rm \ - --volume $(Build.SourcesDirectory):/onnxruntime_src \ - --volume $(Build.BinariesDirectory)/Release:/build/Release \ + docker run -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" --rm \ + --volume /data/onnx:/data/onnx:ro \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory):/build \ --volume /data/models:/build/models:ro \ --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \ - --volume /data/onnx:/data/onnx \ - -e NVIDIA_TF32_OVERRIDE=0 \ + --volume $(Pipeline.Workspace)/ccache:/cache \ + -e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \ + -e NIGHTLY_BUILD \ + -e BUILD_BUILDNUMBER \ + -e CCACHE_DIR=/cache \ $(Repository) \ /bin/bash -c " set -ex; \ - cp /onnxruntime_src/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt /tmp/requirements.txt; \ - ln -s /opt/python/cp38-cp38/bin/python3 /tmp/python3; \ - /tmp/python3 -m pip install -r /tmp/requirements.txt; \ - /tmp/python3 -m pip install /build/Release/dist/*.whl; \ - cd /build/Release && xargs -a /build/Release/perms.txt chmod a+x; \ - cd /onnxruntime_src/java && /onnxruntime_src/java/gradlew cmakeCheck -DcmakeBuildDir=/build/Release -DUSE_CUDA=1; \ - cd /tmp; \ - /tmp/python3 /onnxruntime_src/tools/ci_build/build.py \ - --build_dir /build --config Release --test --skip_submodule_sync --build_shared_lib --parallel --use_binskim_compliant_compile_flags --build_wheel --enable_onnx_tests \ - --use_cuda --cuda_version=${{parameters.CudaVersion}} --cuda_home=/usr/local/cuda --cudnn_home=/usr/local/cuda \ - --enable_pybind --build_java --ctest_path '' " - - - template: templates/clean-agent-build-directory-step.yml + env; \ + ccache -s; \ + /opt/python/cp38-cp38/bin/python3 /onnxruntime_src/tools/ci_build/build.py \ + --build_dir /build --cmake_generator Ninja \ + --config Release --update --build \ + --skip_submodule_sync \ + --build_shared_lib \ + --parallel --use_binskim_compliant_compile_flags \ + --build_wheel \ + --enable_onnx_tests --use_cuda --cuda_version=${{parameters.CudaVersion}} --cuda_home=/usr/local/cuda-${{parameters.CudaVersion}} --cudnn_home=/usr/local/cuda-${{parameters.CudaVersion}} \ + --enable_cuda_profiling --enable_cuda_nhwc_ops \ + --enable_pybind --build_java \ + --use_cache \ + --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86; \ + ccache -sv; \ + ccache -z" + workingDirectory: $(Build.SourcesDirectory) + displayName: Build Onnxruntime + + - task: CmdLine@2 + inputs: + script: | + rm -rf $(Build.BinariesDirectory)/Release/onnxruntime $(Build.BinariesDirectory)/Release/pybind11 + rm -f $(Build.BinariesDirectory)/Release/models + find $(Build.BinariesDirectory)/Release/_deps -mindepth 1 ! -regex '^$(Build.BinariesDirectory)/Release/_deps/onnx-src\(/.*\)?' -delete + cd $(Build.BinariesDirectory)/Release + find -executable -type f > $(Build.BinariesDirectory)/Release/perms.txt + + - task: PublishPipelineArtifact@0 + displayName: 'Publish Pipeline Artifact' + inputs: + artifactName: 'drop-linux' + targetPath: '$(Build.BinariesDirectory)/Release' + + - template: templates/explicitly-defined-final-tasks.yml + +- stage: Linux_Test + dependsOn: + - Linux_Build + jobs: + - job: Linux_Test + timeoutInMinutes: 180 + variables: + skipComponentGovernanceDetection: true + workspace: + clean: all + pool: onnxruntime-Linux-GPU-A10 + steps: + - checkout: self + clean: true + submodules: none + + - template: templates/flex-downloadPipelineArtifact.yml + parameters: + ArtifactName: 'drop-linux' + StepName: 'Download Pipeline Artifact - Linux Build' + TargetPath: '$(Build.BinariesDirectory)/Release' + SpecificArtifact: ${{ parameters.SpecificArtifact }} + BuildId: ${{ parameters.BuildId }} + + - template: templates/get-docker-image-steps.yml + parameters: + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda + Context: tools/ci_build/github/linux/docker + DockerBuildArgs: " + --network=host + --build-arg BASEIMAGE=$(docker_base_image) + --build-arg TRT_VERSION=$(linux_trt_version) + --build-arg BUILD_UID=$( id -u ) + " + Repository: $(Repository) + + - task: CmdLine@2 + inputs: + script: | + set -e -x + mkdir -p $HOME/.onnx + docker run --gpus all --rm \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory)/Release:/build/Release \ + --volume /data/models:/build/models:ro \ + --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \ + --volume /data/onnx:/data/onnx \ + -e NVIDIA_TF32_OVERRIDE=0 \ + $(Repository) \ + /bin/bash -c ' + nvidia-smi; \ + /sbin/ldconfig -N -v $(sed "s/:/ /" <<< $LD_LIBRARY_PATH) 2>/dev/null | grep -E "libcudart.so|libcudnn.so|libnvinfer.so"; \ + cat /usr/local/cuda/include/cuda.h | grep -m1 CUDA_VERSION; \ + cat /usr/include/cudnn_version.h | grep CUDNN_MAJOR -m1 -A 2; \ + ln -s /opt/python/cp38-cp38/bin/python3 /tmp/python3; \ + /tmp/python3 -m pip install /build/Release/dist/*.whl; \ + /tmp/python3 -u -c "from onnxruntime.capi._pybind_state import (OrtDevice as C_OrtDevice) ; \ + ort_device = C_OrtDevice(C_OrtDevice.cuda(), C_OrtDevice.default_memory(), 0); \ + print(ort_device); print(ort_device.device_type(), C_OrtDevice.cuda()); \ + assert(ort_device.device_type()==1); assert(C_OrtDevice.cuda()==1);" \ + ' + displayName: 'Check GPU' + + - task: CmdLine@2 + inputs: + script: | + set -e -x + mkdir -p $HOME/.onnx + docker run --gpus all --rm \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory)/Release:/build/Release \ + --volume /data/models:/build/models:ro \ + --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \ + --volume /data/onnx:/data/onnx \ + -e NVIDIA_TF32_OVERRIDE=0 \ + $(Repository) \ + /bin/bash -c ' + set -ex; \ + cp /onnxruntime_src/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt /tmp/requirements.txt; \ + ln -s /opt/python/cp38-cp38/bin/python3 /tmp/python3; \ + /tmp/python3 -m pip install -r /tmp/requirements.txt; \ + /tmp/python3 -m pip install /build/Release/dist/*.whl; \ + cd /build/Release && xargs -a /build/Release/perms.txt chmod a+x; \ + cd /onnxruntime_src/java && /onnxruntime_src/java/gradlew cmakeCheck -DcmakeBuildDir=/build/Release -DUSE_CUDA=1; \ + cd /tmp; \ + /tmp/python3 /onnxruntime_src/tools/ci_build/build.py \ + --build_dir /build --config Release --test --skip_submodule_sync --build_shared_lib --parallel --use_binskim_compliant_compile_flags --build_wheel --enable_onnx_tests \ + --use_cuda --cuda_version=${{parameters.CudaVersion}} --cuda_home=/usr/local/cuda --cudnn_home=/usr/local/cuda \ + --enable_pybind --build_java --ctest_path "" ; \ + ' + displayName: 'Run Tests' + + - template: templates/clean-agent-build-directory-step.yml From 0fcc6fb7601893bd1e2b53baea4436a7a51b7f8d Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Sun, 25 Feb 2024 14:04:22 +0800 Subject: [PATCH 48/68] Add Whisper model in CI (#19604) ### Description Add Whisper Conversion and E2E into Big Models pipeline ### Motivation and Context --------- Co-authored-by: Your Name Co-authored-by: kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com> --- .../tools/transformers/benchmark_helper.py | 4 +- .../transformers/models/whisper/benchmark.py | 3 +- .../models/whisper/requirements.txt | 5 +- .../models/whisper/test/1272-141231-0002.mp3 | Bin 0 -> 92124 bytes .../whisper/test/whisper_ort_output.txt | 1 + .../azure-pipelines/bigmodels-ci-pipeline.yml | 101 +++++++++++++++++- .../docker/Dockerfile.package_ubuntu_2004_gpu | 9 +- 7 files changed, 115 insertions(+), 8 deletions(-) create mode 100644 onnxruntime/python/tools/transformers/models/whisper/test/1272-141231-0002.mp3 create mode 100644 onnxruntime/python/tools/transformers/models/whisper/test/whisper_ort_output.txt diff --git a/onnxruntime/python/tools/transformers/benchmark_helper.py b/onnxruntime/python/tools/transformers/benchmark_helper.py index c7d93470a729e..c9c815f01e053 100644 --- a/onnxruntime/python/tools/transformers/benchmark_helper.py +++ b/onnxruntime/python/tools/transformers/benchmark_helper.py @@ -589,7 +589,7 @@ def measure_memory(is_gpu, func, monitor_type="cuda", start_memory=None): if max_usage is None: return None - print(f"GPU memory usage: before={memory_before_test} peak={max_usage}") + logger.info(f"GPU memory usage: before={memory_before_test} peak={max_usage}") if len(memory_before_test) >= 1 and len(max_usage) >= 1 and len(memory_before_test) == len(max_usage): # When there are multiple GPUs, we will check the one with maximum usage. max_used = 0 @@ -620,7 +620,7 @@ def measure_memory(is_gpu, func, monitor_type="cuda", start_memory=None): monitor.keep_measuring = False max_usage = mem_thread.result() - print(f"CPU memory usage: before={memory_before_test:.1f} MB, peak={max_usage:.1f} MB") + logger.info(f"CPU memory usage: before={memory_before_test:.1f} MB, peak={max_usage:.1f} MB") return max_usage - memory_before_test diff --git a/onnxruntime/python/tools/transformers/models/whisper/benchmark.py b/onnxruntime/python/tools/transformers/models/whisper/benchmark.py index e57385aa6db8f..11e596cadc2cb 100644 --- a/onnxruntime/python/tools/transformers/models/whisper/benchmark.py +++ b/onnxruntime/python/tools/transformers/models/whisper/benchmark.py @@ -410,7 +410,8 @@ def handle_output(output): actual_output = handle_output(ort_outputs[0][0]) logger.info(f"Generated token length: {len(actual_output)} tokens") transcription = args.processor.batch_decode(ort_outputs[0], skip_special_tokens=True)[0] - logger.info(f"Transcription: {transcription}") + # print to stdout as the output for comparison + print(f"{transcription}") measure_fn(args, generate_fn, ort_inputs) diff --git a/onnxruntime/python/tools/transformers/models/whisper/requirements.txt b/onnxruntime/python/tools/transformers/models/whisper/requirements.txt index c307a3665f8a0..956922dc83d51 100644 --- a/onnxruntime/python/tools/transformers/models/whisper/requirements.txt +++ b/onnxruntime/python/tools/transformers/models/whisper/requirements.txt @@ -8,4 +8,7 @@ librosa optimum onnxruntime-extensions>=0.9.0 protobuf==3.20.2 -numpy==1.23.3 \ No newline at end of file +numpy==1.23.3 +onnx>=1.15.0 +psutil +py3nvml diff --git a/onnxruntime/python/tools/transformers/models/whisper/test/1272-141231-0002.mp3 b/onnxruntime/python/tools/transformers/models/whisper/test/1272-141231-0002.mp3 new file mode 100644 index 0000000000000000000000000000000000000000..6d220f5ede6a7c54893b1dda32b7876c31059fcf GIT binary patch literal 92124 zcmce-^;?wh^FF-0G%QH7^wJGWr%E?ScP!lqh@{}sEse{9G)Q*{NQbm^igdTUKndZI z_xJex3(x+tdF(ycTr=mK*UWi8UPku={_kPr#qnOa)e*gHD8x_fxN@%0Z34hf5hj*b72oR)#g z&MPebTvl1r(A3)A(cL%jZFpjGdTw#~$J*xh{?Y0Af7ds+_YeQBr=>2Vp&-c14}+m= z{hujh*vx223;^KzlO2C2YX<-Ci~paWAD{d~AdTs4@$bUGGEhrU%fcZx#YZn(Qjbp# z+bpz%_%8`9uqmFMXfc%TZQ{JN&JHT@?Rj>YE%-S9$np5dYx(#WhuLU$!KHdZL^`in zyZ7C+3^Y_PWm1iq^;?G6W-ByH+mMe;DMxXFqe@^?hqK5s!Nv4mr2g7ZxES5E0 z?NEjUr%XYGKhD))YNq4gr?urAXd82^MyNG4|}JMwWF8?Ce8$IF;>*A9Wv zBO*fem^5K%ByHKu9aDCr$v!t;j=LgvtUE{iet)?e_E=bS2%`Oe0prw0UGy!!{WV;B zyj1RArUxzCTd2f-)hDcZ8DI>^qE~qN`n8ljx37i7Y4b_mEw@6<+fN)MIlGA#cW%h< zZJI;zj_I8E<@(=FPnRDf{{AfYcHfykYFaS)YAdf)toqyCTm`*Wncy$PW&Klu_Ju6S z9HQ+wp*+YhCcy;_9bD3%R)mr<>fgPJNi88u0cSx};5So^i^B{gPxfrE9NC*#_$OT% zXE+f>NnwZW4#l?K)rTf7cYX5(W&ZbEqo@P2=X1CW6}4GTZQ9*`A22K5f59jEMlV#G zcvg2Vr5!R-GT-F(_WUUZ**_LR*A+WH$fP27Ic1t<2!`*Zz|myZ2o9R0)=7QI^U(!T_}CV5q4btAjC zuP@0_U1rYNF5i09lC3{NF6c9AdXX=vK3~y-TAs?+XdwI2IeFZDaFMExP7}Mo;f^1( zzs$o+>x^xj*%E3|4osEbvdpiKNk}s~bax8QkKS8MCi@b|sZ`m>=H?wl<4}FU>$W3S z&@h46;EjJKCrI_m9@D`K97Ts?S=l`PdrjF^U+6jJZfqtCx+7DgnhM%Mv0>tiDx#i( zqiPwW+qv`RtyhwA11;unkN&!n+x^ax2$fL#6j~f6m!RZen4Lu`f3; zZl{&#eIwQM`>ws_efC?PK=&H|Y9szYXqP~>mGdgml`zk1_qfbAJa>^^GZ7AEcTGZi z7FpU_E4Rnf?Sq*71lR8?ncYEO-cX%Q&IIe}wST%nIX2t3U_P`XVzG~^nqRg29QyLX zPq0(FE!oE?rSz@Ql6rl3kTky@)-5}?L{T(h5v?HWsQAZi_beq5e$kHwI^I1^wp@;S zMbwNTar)7$8d6U-AG2?6T3c9u+xZ#Cp(4xuIgTOAqg97w@|6;$M4ECQL*j4&>3><# zK8JN}J~6KQmu1~5@jSC5tSnd(SFR#DzEMzxKS+yOwp^l{l5oTNW-X zS+!uOnZFe`QV|%=*owMdTs+N#*<$>dP*{K=6>uLJ8daXWW_0Q*{_lRUJtu?M=4Sp~)O-w=`*#AW=f^I>F;DS)#-UqoZ4scgs;#wUTf$?K znk*T=Oxip+2OS`d89L)w&g=U1U=ubtuUex4>Z9vXvXNy%X) zd-ytmqbr3!NN%iMglZ8bV%6tS zGbW}|y`amq<3F%Y67g=D*8VW~b@%U8_sdG&U8XVW352GP#mrz>6-=5O(_Ro?(!#_r`V}aogb{R zs^+i0jJD3Yo*drjRl0E(k5m2UzZpp;k3mAnAxhjuvAo()9F@a}!ziuFPN>P&)WTKX zhV}vVjn;O=(-W*R+*o@U*_g8&RZ|xYZj0jgy|)Cv?fYZL^z7~2XwKZ2O6@7#+%wX~ zJ-k_d)Wxg9Twz$pveM^Hzu<+wTpCN!PIq0@aPfA+M{;@9I4?EkWua*L4bs8E@Y;$c zoUu6WXS%ZiCwEMEPEWY8z;icm|KR`8>17nIP+<87?~1D{UF!fv+qcNnv%uejX4tW^94k-Il{ zX!tF5UTw~{+7euj?UR=A{2%UpRg3AX?{Rb9Rbdbz{WuPG{rV--x%OsjWzg&;eR{UM z_XstPjo~HvONp+X~O=C&qi+;yhsO#>KYtR4|wEx>UA3@1;%qwl1+)ox`x8)&RJPs ztu6V{AO#AgiS&JgnD5P78KEqiRQ-o18OQ37-7&U3U(uYH#1^@Nr^++}~t zs%IC8wYD^sGs>Co=T@&r?=t9z!}IQJOPMJ3%)%Y(1a>tv#TZz6eQ291pKRXdKs{Jk z*bx5RzA|jPFKD+qcdJM#sxZ;+4_XLH%qWkHZ`7@k4Q=07=oi-fQ72WM0{d@QpVJNW zyZm@YTWUaeMcp92&U>yOpI<5Q6J}q$@|}xM@T&de)E~%;vWfzW6Uu@eg_#_E-b^~H z=r{{WV=a-m3>EW)6t-?@5`AYH9S!+>xN8Wg&iJq^qPMKWU5bT9rxiuHIrfSgFT?CA zJX+mW>$h-cf1Uit58HSD&*nF$*1-W2)xwf+75m=~_DFNem`T9?kJmkp99 zUH)v`MRqV`zg_nvGcgXjA4&lE2go9auP~j^!&j&|M?p zIklXgM^Biib(MG%F%TVr8{!!PM`IeNcG1BMrjod|9|S?^;H&^=(spB;R(G^qtN7A&-}({ikok$#pDI z=Kv@Wfenxk(U3F-L<5%F4bTCkmEE)G{*<{HLFoUHPr7@_@XbtrOeQxf@%Vg*jWDnc z=KisB#cU5;XKbIG?Iy$s=?1{TAOHeD0E|$@2c)UOZ~^&nYyzMMA(_td#1Dhyzk^k; z3KKTCW)8@*q87vJ+zZ-8h=CqZ5XCx_5RBISjs|@L@Bml-_g63w771Ds1q>Tt1CZvV z!$<|ylEHkjXYjkl_*DrTFre!Eg1BgajcxvRtVX z*3a6}3=C^P9!$rH_~~D@}_TH+%CwK0UiJGsjkom?N!>Q=&}0 zz)o|bdk1PY%Y;bU@pE{5d~~LY6eA81(-GHB9$EYEeg3D=jOj=ie_WZ`+)gPfLNH^8 z*qPZOmf%F55M?qq;$9{N&@^Zo$*y_B5kfBXl&7t1Ma1>Nz4AXgeYyDAIhE{xC?2O& zX~XLY^}1t8Fk^}2dZUnz?_fsh=0*B3GOr!TQf;SPLI;$hS%UON$B?K-@E=- zmcm>O&M+y`fVp+YXk025T}U>tuPu^T#^*&aXN-inh{kw7NT>IFF_dN;Qy0B*9QpvWax<7Ac@uJIWF-6Kcd8 z683ocrT-*oj690|(>b=X1yVDdy_60@4-5&b1w%MP8NvWSBt~d!Xdd08%U&h7*L(aR zQy&|j%b>MJCA()#(QrMCT|l=U0CNU_Acs8z^3 ztpFzL-XH2OR<{k0A2&m&1ET)xu4Eh2K!<#<{SZ%q^;7K?-qNxUFL0ukNW0lI{0km>!j41;PO z2LD039H$dIx7ohWC9J&d=tR1~Yt(07=W_jqy?NWsz6T)p4i~Hj1IvW`?}SxKc*C}a zj}{y*KMS51uE%^py`HHasn0Kby=z)h*}R}Rlhno2VD>nX^Y_+kMeon+@Xxfb_H{o| zpD*=a^!o(2v5||@o~@-k*sW!(b+qh$T>Tph zvX7i1hI$w?2PNi@30M?NV1ieiD7T2g121}%n(FgspT(^DryN}8^Cq5n9D@UD&jo9n zG&rz$u05gBp-^NtI+#=WfVGv{mt{c}d!D`Qo&0*YmfrHgf1;-W;w9RJDM@{}boMeE z$@%m4hWS-*Sl-(T#2!89K#Z5mw{_~})fs6W7Kd&F?cnF~5<2aaId=aE?4~CT3hvm= zWg3lU?xUQm=XIWOMkzrC1)A`!%j;A*E6eyp062Xg{|1NdG}iS*C9)4S5QUKQ8r4~> zB+AzEv)%T&w~{wHJp`e++T-x?6A~E*iL;ibwIvl;n;iNQ3Hyr@As?P{@Ptze)N9eJ zdNp0(R1pPd;Y<*x3LBGB5!O&}_=d&o4@Q$={ z?=8>tj@p~-lV9~NG@8jWf9awEP3HStZoR+r2l))}=JYny*lM_(?65V9)PLfsHjwh0 zpU*0`)OdKzkSN~QsAm~(WUnX11OWkZAy6YMBmj;B!3&Xupd&~iI5I$_3`>7DM(rgb zA^@(8@PXrmxWoTvjbHMLLHm1Xp3L$ zilo`o6ZT{He`LamT(^o1xwYiBN&iia6ubozpjZ`hP#PzM30ZnpO^^Z{^Q&a~76`m&(c#gyVO23w58P zLm@C_BJZ&vylxpIGf@~RZ`W70&3Vw~XQsG9{C}{3(82s=;=?MuGe?f=A zxXFmZOYne^Km;DPH2|BGP@RAa!4feMOF51v0j|SS=heo~RgY&Tjjt9S+lbKcB?LQ@fLU$g zGeGelDaVhb$GO}G5n;A^sX=`ybt&?SocOW*p|Bxa11j);7)T9Z8#6>U#6Jwlin2zF zK}b`eqvJ>uA!&dNk$-YC9nfz=k^n@2VT3d>5-Wrw&O;k=7NPA(J=7Bc_YYmyDkXJ0 z7IPB#&n}M`g4Q%JMXrLGbX;x2HAI33XaI%ivdW5hEbvDQC(2V87c9X%2w{tg0yKQE z$}av{M%UsBVqj0VJU`rs2ms*43&hL3aRX=7g1@4QI^Dm-xS~h`*{4hVg=> zJ$Jl0lGREsE!Gc&STcyOhZa$MQvVu+f?9fyO$#}%DO#~XS~Gkhl2D* zhVshrh%G#YgL`#6T8xddd^dmH`Uk+dm2~J6?mKq8-=w_K5p*goJYxv4OC>!Bu$+H! z58G1-6J$|_0s6}H$`A+?N!cA&J;a@QgKR+~<* zm)&{AL7^k@FImCj*fRN^p&R@O1s9=t{$Y9f5d|9|vK*lRnXsPBuz`&@#*MgP$^<9R zsPWcL{F`({W>G}G@dBEbHLGz~+qF4R_#9avCc_Mha~`%c98@&au5eqGG2n_Vw?|^a zDhwd#*!ag3?3o~Dyu2MPh7bpw&AKl~V_DO zRfTO5UdkNak3+y0%IVsfl4VPn#>FgE7}Qh^W$4DHjFzw*2L&f;2yaRLixg~uRiOhx z)?L=m$uDi)Cierf_-xX9l5yhB~6Bg$#n*0xEQ zjOA@naY0dvfU+}Ofx!2DWtnwM31+RE+c2;Ppl*7oye{r~47ewMsuxFst^{RWOVL0b zuVZz+FCJObyIK$lQ+F$`0gs*}IdOMG_QEoZ3PJwanLnTkT$85Wcw#WN<8QI)hQ8%3 z`OtCN7>xdc-u`IlkJ55Vkt)YwQn}i*&l`7E45N~%{z^h_M<22TXZ)TK3YGK3)yD=qMER1LdJwA>Z|aOfe=$pHo^V2g2rqF9p?0=%>gA z*aqILZx8jzJ$b`7+fp{WUvEB%b7eoW>~^`N z!6yzfK1{I|DD&TNX0@(${o~TqfhTUorBusBB;#^+uSebqau^wAYwFK*91@NI9uT8c zBnY^N|G2dDMj*G6Ajh-5f1J&+f0bWwKIUNtUxTmc1zehvke2|_!%U8gdAD_L+w zfs|%_FO{(^s#Eb$#361V_k$@>$YOQhoybI|JQ4#?KgYU6)sTo)`+iA^71&>^Hf8+VKSqgwm|GN@|hp zHKkghr`SuJp1cZWffRHbJXPBx&y(^$$~ND9;|XBAzmkqO=-xW`jl8dzXC%LxZer}u z*RGd-xwNwqxR>?*_Hm(x@|c6B-i~t3?q5&XuYc=>Hz@k&){Osumn*{MUg(S=5G?ru zTQ@d&FvrVT!VhlqF1ptQdS`XGg3)cStolGD!CY+l;p{?m+r-N1{!FBAO>xm2?9a6Z zmrvb)z4Y}M|G1|&sn=GwUT&$w1ztw98knjI86=-gjaI2%JmlG1ZY@8;tSygT`Xk1# ziglNFC+|Ogq%J;-o|`Z8Tlr~QlK(AvTlT>WM=s|eIkR?t6!FiV*1jmdb(IdkPxX>W?JyHu;XgiuD|>y)*GdBtwCxJ4^cG^AD7kmW9E z+3Kkf$|$f&s>B7gV7@jiWy4v1KKqtUm5VT|q&8Vo^--tA8U=Cj60MV7xF3q{D%NHDx0*+U2_}7FRsG?ZHJ^eRZG_P zduel}NG2;iKAo4>lx)eHe)f9&;NlG%e$EjC8@3f`hdQJqzok$Q4G~33rZ`L3yOg#^C%P(`co0n9@Kq7Z=P&-L ze0MVqrplwr%`i2c_upbGsr$MqpZLYYpbZ^E7=bUViSj{-jjMkCG`|3 z3kSbOQ(YUrS7v_>Hp|ZXN*)+ev{vUU+j7_=?j+RT)|f(CR;bf>ZOq!%sPg=6siD+L zd627t`tQa|ei0jjOS4v;`TAY9I$9->pN=8euq+B)3qiaT2z64du|Esx<{j&mJi7OyhnObrt z`l=GpVlw8N2zZHpPgvv*F;%g7=|Vse`Z;?124RM`Z7~vL#I7bx9!ET>g{!YbtU8oj zb(W?KX4SMT6GLxkwhttpzZ8_()O~!co8ea|d||;^?(qAnI9bu6%h`*4hUGVVy*STi z%j4ri#bfTH{^PHe!>$L1uQ$0rgB;ETR)inkr~ma&)2vn;wARflEt{NXH{ij?!LY7_ zD?^C^)(FTxm>utxOd$>m>O>IP3k4S?88}*RnOnN&~#BK+N87%@-|Sc`@Y>;oY1833rWyI*&GwSuz~Kg!f{ zF?EL2f!$Jy=!i*9m@dX8I*f-5EfwH_aZ15E>qyXWr@rz1%Sn&7N~S!c7~CDN1b{%= z;NoD4$zxl&g8CVr_|K|;{J&mY_Bpm0H#;j3ZUSfK;U&e#P}ugawK*L)=)C&Dw!CHL zXZNh{eMRENhv-oxG9)Jck2z8@EIFKz1387AAL@Y<28O2x*A;hcCW=Gw;uDR;J5UqS{bOlyxdR3EkwEi3%2h|6S@P zSMKvHC)_u4h+I;c@dJxI{*WPM62Igl@5nqWQL!k)$cub!H#%xgh6b)^)axYit)^ib zLa=sCjAWKZl}dg4H&o?E%I>B|dhb^~rkwhfnW&HH;(8e>4fOy(m{p#MXV~dS&AkI8w^9hNie~r?428!{ZWgmQL|uXKV|m7i7mb+mfk-iKK5mF z4nT>X?j`1#+H#!kZ?u?-pUTm<70ipq-fRykN`TwkE97fODOC+yjNiFAut~NvXB!>? zq_q1X_>2h?V(R0o=Ci*tgMQeZ9GU3@0NYM{8(EmxP@7Ol{;S@gjInPKROk(TQACuw z!n#HVWBu9GdX077`|9$S3L_AW6 zZyDwmFU`lmzXS_L({q^wrN`y6YZiC%b1SzYzs@51H2eOLGxS7Q>-t({ixXod>_sw> zyU8VU|bi7~84PDo7-& z(^`L1E{#y1JcR9=J45oVZf|SG#s$4N)g+=AOQx?&?k(!P>RgvobsAL~-j7UIjdQ1( zH2v3~^^kgFNY}XhP55OWU-^8~?8~*5fj&H1jdSkZ8{HlS{f^crtgGbI!<{f)lm;?= zxbazDV~x^>8)^+Rak}D@4ZL3Yk0p{CEkw{R`G6msgr+0aXCa0spNjcR^*8Wq7+6&p z7rW-;$NC+wL6&WY59SR{RX6^<0Hnu;w-=GD-ByZA0cw70V@D{tSu23I=Z~0v#}6L) z1GAtX^9#aKzrw;#1>DTjb#`^;l256e2$by#hBBp(t*gUaZQ5tKg7o_iR$KNr4&$Gt z>G>V1#-daX>^@|Dl%aTtkd&sh1Lh0s6%&Is#Q69z^JI%v_}2rWb98x zGTO?e(a3hOLY>OFvO8Ai7^td>H5|&RhI%PS(81W!O5b4!j#QpHg>m$pv(xq+ch82nchjF9lP;xAoiNOT(Pbp9 zEI+LX>E)_F0B|Ms9V-dISyz6o`&AwKRk!8JB%=7&2o>|^UqY#Zwfrl@t=$>I48{~Q zu}>Jj$XSW`<)K&Wf?7%{9H|^!IFF$hDwAxK7m3Tcaye_aXR!N=Doj^ZwQ5==GI_g@ zd+nQc9#ahHSdbFRg-1A<85u3Jxt--;%1kE6oceCIDsElNgp$s#)UK499iBhiGuKM= z_pxj5SvU>L0sW)UeKc))ldg$kdAam(Uw=tOl7iU_XFd@^dD}zQ!@GcyAIn&?Txixa zQ-8K&G4?)+%oQeY^~0yh%Bp}udr8aQ|EoJ(S+}h@0ZO#ZO^hoCGc~#9myOI#nSLX` zTa8`N(CE#)>l<%g3p8l{T+1G`H0sz9%s8;ewLEEP%DUnbsiZeuE0Nz?R8Q7F!tJ}i z6yVLRXK-TW*`!1NgR{8{=BM_#7AAWZXT#RXVKf)+Mch|qksHK2vT6Kxo~^D&u;TV+ z<)SjHv2y*wdZY&4`+M$DP*A8?!0S`h{n<=)-FKs1ZdN+QWUH_j9gN|dCY54O{BTTo zjCjPtW@%eY%&WiY0)9@)!ZsHh>k#St-q;<%jUHvi-mLOmoL_?F#Z>xyHnQVDO~5Nq z=|f`G8&Vea><;dnsms}^-oqvSLhb1agJMDr^~CDorAE~3|Y?l1YB>&U!>v+My;CW|c z{UM3Vw*@x|>lc*?eDs>vBm$W}2?k55rCyq+XZEW>Zf(8krb^?lg${~MuDlC94F+{` zv6)5WRi=A)$d#4?j}}{O|FfqozcD5AvOVH6YJ{*x8u$eU*{In+<<6Ff6nPhoHGTZn z68j6Iqz(Uffw-G)cSzbA|Ke|P*_@nn^FixhU}All55LJ-?EHrAEIMi5CBioqn7Qk^ z-p9#(7<_nPH!dp7g#?R6L}_G>rQLm;?^-zRlKlKgJ;4as+fX3bRdG4dq;dS>&trOE{Z==kI0gL=RiQq!y;5-9gK%ab$_a-TgJBJAL&uO0JBhI zJ8pNyKQdZw*9nxb^+Y;K-knzrc_pWg=P;$ioe;39fPo!|KNYxVzrRm`C^ zQOsfbI9tp&Th$OCkzUN9COV>%RxG%~p{n~5Iml}R8RS=`B}gmgP`^zp<{B}0+!f7_ zw^9H1`Qsx}ijE)ygkz%+{r>UJ3A?w+c1lZjHlhFP{L37U6BRQ|*QEkG3TNg`t@F}^ z`adEp(Mbno8up&xe+KM-GJ*w(*$c-Aa)ymmhyD+rncv*j(2ubL>X~^4>`{v1Z{LR4 zL`IUde@S<4CJZgy6Ih8*cCoitJc|uFx()C!&yjL%p{*$I(M(!kpA+j( zu|5s!c&EiwglZ~fgO^5h4~Ns<9gv{ia! zO5LEVmtBwx5G&HNgN{uq5T|k=8F}jjp#ps&R47J*+odBe^}N{Von>rh{ykGc3o(T! z&C&_`C6xH~)6zW{Dt z9<<;R8NR?Co*^#P8S*x)Cy>il0iP2!!V7ha;UQv99C-ae5fP%|Q`b;c`z9y@{EyCi(g;j@ieW~n@ zHGxP3p7M`q^_c%xONEK2&ie0YvAW|{xR76*~#IYu*!px!JSCdd}RRH z9%lQRR4Kc2CE$TTf2K?WuZvV4K4CQ*S_rdNbbYQ?7D0|aPvGudpPI^G> z-tq8Y76YPnpXP1bGecf@G{H3;)3MJCNLM|>qwrD8PkvtDDEMXZPph`%obmp3vj<}H zpQh)fX|UqsbX-+XCQF*r$!fw7rRLwg`VUg`Bj4{w@7*gVLf|&1r!59F*JrikM|XaE z7454&lDR>Hn7&BU6~4-!VhuGWqvB{4f&%s~>aVGFRn^wIG8Xy#KU-W1wfv--UlOkT zrL-lz?E{q#iMy_NTb}GsikO&Q;W6ss zwOZ_B$xx!6{wS@#wGzs=O&VMT=9xtHOA9 zQM&BLwO02_!k`5-2U%GqlbJh^(C>8rI@VT(_)0-BWYi|B92W2_R*uZh=QIr;H^=$= zHqOdrbLZbU3;cju~u|+nb|$pwnF$Lp#^pLZda3a;d|M z%r6P1I;J$It_kXtJ~4~a@D&)m#2*jqx-M^a4kRJvThyz#W+9E}gT*YpGxWsXV;N^f zWwED;otb5fx@v3AwlfAXenl{GEjHqyBZ>bAU-R}d;6s}>2Y9oj2#D+{A>ar^xH+$` ze%qjVWaJzqt)o?8RUyh=J-MwoMO?6)6P_e=?u%LNU@J`sq(F2Dh0w**!1hCc&lU7> zNd)AuAu!oV3^V~5G?ev^uJAA%Ds_Oq4+Ng?9}C3f#xA{qabPD^<$vx|ck;|whiie1 zdA(Cv(WT)@Hg&42=32GF=-`MF{<=vnc3qv5;g2StLJ*tM@GlIj<^@pp^LsIZ*lLMP z-r~h!`T?QJv%+nDkcd4Ix36V@QfN`gOe%+4Y~re59eX5%`K6np^*P$Z@k_N&?yXGS zMw{^hL%4v@l{**gjXeAq1mdYzF#3*}+}%HO9e{uX*s)15@IX-XY*!Vb&JaK$8iouM z0M(lOC%pNB9)Sx;9mL{Okp@zYN;5-Xi>?@$0(n|<>zg6n(FD1;*+@|N5TsKM@OoXE zc0HpN*Jxr47Y_>)48w#?Mza$N^0W8bPx2oFh++F4-O_}F3@>9qjA!hOreDW8NFMf7VhBPzg+2F z(9*sTF%1PT6<(F+hIm-yAOMP{TEfT#01ZNUsfOO!XeXIjYl*{Zv$aUW`yl8#dD8N* zQuzU!2KvbE0cmLXw|q1_tT7>60cWXk#;syr0zq@Z+xvdd7IS)N5fd)?n2;6!B=>~u z3^)NSL81}s=vZE!m3zeIg2#i+C7&eki8ef+VrK0+`Wo8#uACxxW^9~mwN|`AG5P-A z2u0iv#)H}vqL_3XE7qH@(O-!8YQY!g^4i$$p*gfT@Uf|@>MgQ0?Cbr4QIPXv!h+6= zg-46dMQg7xvR3mKmhRv4iqz9Y>@GY7lj?s}f2ax!C{R{Q(4ZC&Q%qKHT1*`;*O!ga zSo>-c_f>s**?Xwf`^3&dMV&g_nIJIJQ;#)ZUu zTaH?0i@@Sx7qnko?inN68WJ_cU)vqRy)NYW=he!PntnEp~O}j`S-8oX*%Rt(% zf{jo_zhc$u*uWJ~=E!E3fv39@>stapCRIoLd{iVP&xqFg8;Qdg>Y&#_iissnlK8A3 z^(V?XS#q`m!HeD}AL;unqfYo!BS>(%dvZ)zr?R@x+Np(m*|KbYlKyuG73FPR8%HN2 z4jDyndX@ZW?X;b-T=Icg>(X$mILYs=poE#zuYctio^bVt!7O#jPu8TG$Rv zA9AWUJGH(l?8x9BRyzO>&|)2m0|yue_>Q|?`mW!RAGtpi6PJ*#x8aT)Rg5gW9<^uY zE}#j@QQ~QnzoClmA!mNCWMSWF5Ipy9go!dHEo1M69ls>#Gs|v~fyV1t`K0vohj;u= zonG_&F~n+dQQd4&!{U1w6!L4#lyEk?%vMR*iT&47&te|!D! z`l$3GAs@b*O`Vk6@)!(Uu|(7hr>LZkZ+QVnGb7&C^+4fYy}Cc+wnoRDulvjoKDDPD zXnUo4lEy8(-71*UXLyTU??zTABqwe!=5_VL{`%ww<-X!ed!KV_67c1!y7@8RuR~~resU|*Q~az5GZR<;U4jF=2D5>$B-KfwR9O7$ zDMR>psBoD`WO^7ByMF**96B^L9Lq8s3omd|dFg4*vMm~_m5o*8&!Cm8Ux+lYc+L}T zZHD#jLabwSZiD&2a|YXrp!c)DNl>Gj*v75(H0(v~zJF2vXr(|3W21hG3q-$4AdYUc zKu!A)wtJAl_aM>grBrG0=G}au$PVLe=X0+%F6T|FFBzFJ(zgpW|2pV=8&$3Pznr-% zjD|<7ZCs4g$W@V|cOXEt>2XFothfV2X%r7rmN*845j?GHiudlY{9%Rgq-3Fb*D9XM z1EbpBg`3yS%G`pvU?q@mlEbY1q8+cG2rcc zk-GXvlvkrx2>Ir82GBcKZvRO>^Al=wkS-_UyGv@Re@9dcd4$%a4bDH#|v2 zS>3?dK|SY@+(FFTUsV(9_Mc}C3zd56S1dd}ltq3ZcMY4;AWzdbIrU9^j16Y3*4b)r z{al$w5N?FZAq&oFjf|sEGnQ4QH%yFB`iXUi`+N}4>LqO&V;?AQr2OIs>bX=Kg?*G{qj_dv}uZ7zjH4n z>;H3t4r}7j6e0U*)6#&#RyptDS$`RVky*f${Wg8zr2OFCt);&`+_BR2QhGwL^2I;} z)ns-Pra)^9>QZq~xa;8!>hbHY5H{)euTr|AxAV^hyMW)RI$B~aZ-1h$Ixen0ovvK} ziNMd8v>3;|iLA1|3H$A6P~F7#ctm}>T9v@UWP# zC;$L$tb0*Km{C)dI6{s#@p3EM`YFZ^g3RiliOx~^7ahM4)RoFvJdL(w#(KU67!rE*7n)V8sa_;_=riPM8d zzB@yAfTgb86KkVVj9}n1utEP`m4+HG%({-SR*w<<1_%Jj`e~u{t^=&EOb#VYxPQ>FjMgh20GW0h*Pf5c=IG6ZoD3NeNl zM)ZWRGeJav#$nP5pW9te6?3DWIPG*WH+g|&&N<;Gzdb=0KzH6=$0#gqA)Y!!zS|Lh z3_GNXE1nqO*sX^dPa$p8t<0m2-@{hhiuOp|D{wMM4{-~cE$A}osJ0=FXsI3!V>d@a zLgD8c5PYDYn6x?w@iiJB*ljIbUK(Lz^!$@+{a;#w=T`8sj{UN_l7ED&XRB%UNB!+)m4G?2SbzTy`GH8W z430!&vdmwZ`PGyl$~C&OaR2d4K*B|`lTD^QeN5bB2jrU3xGJO*DT+o^`sYU`IE1Ji za-8Xj(5_PbOjaa|CeAuVA~`=i{KqMuivPr$#+#6o0m{jETHhE2MKrzOoWKwO>CBeG z8>f}=E$t9g*jryz`YT=KyFExVnIemYDIw1UWjA{Etu2{#lxGj#{kRDU>D3S-EN>wG zrNa2Z)}kst!!`)*fV=yv+gnPy$GCM^UX|@TP`~S;gjoF9@LAl$OzR2bv?Y;T4fDN=z>goGxHgkLZ{3d)b7N>zsKIZ z9H$LGwK!s-elF*ERV-ht5vGd(0|!egsHv&xQj||c+AVo;_I3T2z!4f|f@iv7~5L?>X`J#G8M*nSZtQWvZ^Jo5uUc z$8V44$v>_CG7VExlcWj@3%f-jE0Z|^nAd^tn_Tuf*PN@|(2D5A;cteF1tv*e1Ld}` zb%7WV5@n93b$#b7oJf0VYy|KaNBhS}m+`)-#G567nXLgK2tdGa4l&uN7a9(%fZPmS zu6k+#l_jU`tX;6lk`%$R8B~5=ig7Jo5vAup9yK=$7$ED*sI;%e_)SiBlVx^%qsPco z*!iQ6B(d@8XWRT&FF6Yqwsh@&_R6mK9j??1F#x5o~y69CHy^5wsmy&?MBp3s{?Qu3B1o z{5NS*4pa%BQw>m?J9s*SoV^kvHnGea0_=Vcbox`=Cf9TC63_5=4msCe2IaJa!^3gs+|l);58sN@rzi#ie_Ndt zTQM4PeA&=MDOYA-Bq$GHDLWnoU}zzBrCdx|#0E@vdjs))Ib)AvYC5#_CN)BViU{cu zS5kG$h-6NBv#h;&IxOach(F#)4`9V_^**2={@sv|$YtOn4sQj07|2gu{3MV#lHtkv9}ZFd#xY1>vpcmA3t@zXnWdpfz=m z7~>z@?`b3lEB5Zh)M6#{B&bxhzV652aJg`B>?LK0N0({rSqJ}5$M`;t{Da{?9P0sO z!`X3J(ek9=q}X?#^FJ7ST9!;naIm%lIx~JNiLhg`qUXVJ&;)@nTw;s{z&axla0vl@ zjuidZ>2Sgy{?a$`pCT42!$IgodDpsT-1NWppoUDCAdW5&o()sP(#1$(gT29E0%*Cx z^nmN@C9vmc9x!qwI>-qTO{z@)n-@C2G}?dNRx8N8Wf=(_*5DH?6V~9bEFa0oENP!rJiW0 zJbOwRs(ME~%}|%?T>59&-z6 za7KEZczi)~1AnU8iZ&?rs5D>HA9&p`Zr-(L+2nH?wBNfjkahjiJPz{cM(G!(MiZ_-o(#_m*+I?m>4cQX(Jn|#=$E?*q?!zcFv+oDW5*s8{3N4h-iRsO#M2O0%1+R&ZFq+d0w$J zR2Zc1%paUJGjhLsJVFt2wOp0`M6gO?M22&TSx1FYg+00uYa+>M%vjaMLN61)ir4&- z)3>$6JGj)k(?vq`_LLglcmFex@VM#BewMd~X83!!r%RxrkI~H4`I*%8cKIh-jp`@<;p4hz=$3Cs)H?bb2(fwN8;7oHK5St>fgNZ zjiY70Jcw$h;#*(m-;9Ovn~zCMV@U?SeKbG0QEgecnf2PwOD7%ya9*8h*8#WoV+l-;an(?CLRFL0Vu#zZ0d%K z+%uYWoZ$A@A7A+}W!9BU-&%&F05+A7K@zz@G%gvUhPEKPOL%i!5FN({SI26^N`47% z#P?c%2B!xs5^%Og+6PrctZ>CB@X7SpVUJORT6?2u;zh&cc;o=ja-?q7s$RUx?eaEq zBrNq96F~X)u`!R)qJH-TVvSLEcR%Fknb1#EN?mzB!F_@B@rqbvvkj>t&@g8gb%z3# zGi>XWg|lkS?r3sN{BBrJG9(_>V;86tPuHW^Ll1-ypme$ou&dVE^bJDg-sIO`>h)WrG){*|3o;!FnSSf;pNmYE&wn}f&hcSvqpDI!`|cI(v`KN zpK9Ru37GX3H~L&ujtHR_OO^xviPL4mpMcN7lj3kCun&G6H{2H&$;AkJj4c$|O$MvO zO+nQeU>qXUA_7$~rV-!6F*q2@@RPgisBfc>?xYjDFz5aUFII<@4gyOh{l_iG4^mSf zJI)3V3Lp%e)6&o{anO!vZH_r^ZTtTWOF6BVy%WUl?JJSt@I2?3}R z1%k%zq5g1}!9k&|zI*H)0EYn(!UfQE)>QMXk2gT1qY#RsSYYLMT-KD*GyM8AsYAly zSWIwI0u)_&$|f8PCkL=h$vQki3aH`@LV!j{0G`H8HyA*+VIh;i&R$+1ivxsrW5{O~ z3vk1`VdbR5a{9NdeTx!6>xp&>a z%L!fiKcUmd!qkJGW{URf7Cy293!dPj{&;HQA>4Xk`1eHGz#dhGjbS?z0h+UemCN?x z6zOP9AVB)26Q(7hgEa;N`1V4WAw+#leWLgf7%HSYnJgl#S%F1x7i!kS5nA*jR+ryM z;bo@b&Wq9F8vmmyloQ~IE2dG0usc!#AcYhFuP2jXgZ7iDhFFLX{at;1v@&wT+bwe3 zj=s;MUp$?hRVF?>WGd-+HPtU6JBE+aW=k9dpg52+Kzo1#(LEt{9SaTvh9MweEONjk z6arAZD(v8SWvYm#geVXZo#lEi*zzNF7uW-Y>AglbD%URVi*0prpku(H<_H7~=ML z;`w{nstG6+06?n7?`0@i10mtnCI?BNG_{O{?#_8yDLnt%n^P9hYY;7{1x z@f7enC7?|4wD3)AJ^Iz*5!)bHMsa~R@~ULyk;M7ph$BOD;z6y>;+xVxvx?V`RztO1 zb=7>e2XfTxhDTDr`nVqd`t#?XM{N7;6=c!>sK|JB_2dbO$+v;i3qcI#Pn6NpY`_KC z{j>b=%TC()Aub-&wy_qt!S^lbUKb*;zlxupIKyhm7{J(N*YTs~vY-1NLB|-97Sb;l z%XevvbSP_Zt7FDyMw}wB1QLm*pA)Luh1gV=e?|7O^|KXlcz3#Q?^3{1Dt}|cLAuI| z&O0wuy;WM%yef-OY!C|E!5Gk(c|4|*^sO^*^ke7GZlAti-hP~3JQR2SKxrHd40HdA zT4m3hR1xDG2oR##LEFCPjkh|`orv;DL(jE{q9g0);@_K2nYz;R>JCuO(bRKiWG10q z`{7uOtKzB-ufIA5xV{@~C~lYhXk7?U#SKR!r`@)e_r*~uBFeHSd`3H+)T-g`izy5JcJ6;`x zBi7M!ez<6YWAd6ETBh*WH^lpgqwe6fb3kU6;!^+2t0AsUSoA$tF|dC^s+g)~ZG|!9 zQ;tSrOs{fMZQ1)no?14+N3~36=(AAxrguznppK?HKJAogjUO`y$1}H&Y?rWz zVHgve0V2n4;5x_O@M#^|(p(_G=l9!MJdQMhtqY?onG1tX|2=Flle(3nL-OXke67j-OIsQas1afsWAmC4sBdS7@TM z55~vmvvy?T9pz~<&)0Gp6(J=}G9shrC=*U(EI$r%)p~RkqI2lL{+W{c`J>qvUBCJq zXbV|}FN8ZnZU)_B6>7jz+DO3UX^J~BWoS|YX4t7Eju7U#K+>_f5kt`0r*?`nmu?Z z1uU*>`)BvWL+53`bz|$1unLc-dV7EG^QJ!SBYH2z*$O?RpG{GlhA@3}m3+bX{@A%V zdY!uRV5RcDS$2}bqUFt{H1FyI-5KDdH}t)_g^TVMh&If%EzxK{c(dt)yLpfLCyVCA z`sG;{4Ux*q)Q_5eA=U0T>NPJ#LO#z{u9Yu9rk?u7j}6$~xKw<6y?1x)gyBDV{|kU` zFk^ADVBFtIsf=o7OXFB$4jx~BrZQ&=8G3UU2b;0gE^U5&cAUL(upjf}lcSc7a#7g_Er5U{n`91Dn= z|M{iJ{B-T#Riq-z?~2y@9bUZU-x~P^F1aEAGlB*S2DABD;I zOvVo4GE5V#T7aT)fN){Mha$l+k5E zp1KXs80$tHTZbX^INAP$cmIh5f{n@H2rM|1#J6h1UGI6vlK@)n^Gt`iC0kX;%E6J9 zhS?WM90Bxrdgj%GfUvO^Iz-GX1fucD!eRHT?r9F|X06|G11{bD2b??d&2zrzo%()hDew4<64QN z`$FsRwlm$(sKhyFzU!gx$}-hmRl0yrKK@UOmNAj-tx=3{0Xt(BDE-@6Mb8{<0h3_* z^O(PxuO4Ovjhwz@8KH{^Kw0H;O!`PMspOv{_K9AE!uYHIA~g5{BRMEYu~6=ysQ(~B z#3VTE$vxN#3?M}kpl&4($A`PMTeJ-Q*B%#VjtCv6z~AmAJX9rmvBj7SU4GoTW=*%+KZmz4y-$pZv*g8Wg>KroRHX%B7*&2` z4^T|^uIqe_{Se`x@K~BK0ky=X$WJHnJuM>#KUgsg5s#0Wc?bE_RH)O}oiq6uMuBhs zLHP0wErMaF+2$cMYL&Lf={Ze@%BG8N$F)uHqvHM+y%8v2&{6a+7|9s{w{fI19h z7-B<0eH*85HYtnmDpmXn&2&8@JcXd&mCTwob#eV?*e||oZ6@ml>3#KFl^r*Xv40d? z?#CU!;Vf(K(X^Gg6p7GGiMshCduf}@*FalX;IU}8Ba)e*3Z>9#+9Q=iv$&7u4Wi^Z zR%s^r78iWj-t1P{Mqd04jIC&od(C$~`u)cw9m)CO+E?+x4DINDXPUf1Q@}MRbT8`WOTDwx6+{-S%psaKiSK8d0MRibN zA5n%E>$iP<*_zT<%GH)Kpxe6*mXEc=e)1Mu0W3!@EfEZV9GS=|>WpWMe#+CzjIJ9^ z{`G`XK1#87-2ZW5n;{zpRUT}+w7J5f#rAy*R2MV>^Q?-VV^S8Gd^xIbAa zBnVS*Sz8}+LMyi-l||PC0^o*j_HvgzJ=-1r_oCuuAKqK$MCC|WLAHuP%9Ux6UBUog zZ^}!vD>PhAjfsj+sicvFkt6Nwmr^#_xtGLL1hQ$nK#r5Az}n&Kvf??p)&~VYP0IR+;%`x&3Aw#a-{4q7fvLkse=G(HBdq?I#tAF1Kmcwx3z9?O*l{DIXPV zV~TR`Jdr3f@a1jE6&}m2ngmI)Xf#7zZ5KQ)+MR-NCxE0(r=_uuh_D9Mg|_Lbp--f6 zzt1#mEFc`{ucF0+8_%z=mCKlsf8@y<9nAd72&(7yXtV#T_j#~+pPLQ$K}WH9EO#mV zNYa>ElB%SLz9X|%kxq4RY`jp;?~TN9RPiRCRxtQUiP0Z-=CTg>IA zUud2j&lp6ra6bo&X9tuVRLn>jI6QU2R7kO<6bz~8oQ3yV+vkAsCbJC&vWoKkSGAug|hlF9uIG=N3S^qEnkb)K85 znvE%R&(Xi>!R7;Atjl}-OuS|v&vDSt?5B@ysMrr?jHKzPb|?=Kt3Sl;hfAehi5hON z<9-PLx8$?P#H#sE!?yA7pPl81TlSHl@&k(|s2sSAES}&Zz!w+B?%B)bB`7ewwEp)G zIX7vhtM}QWiSMrldIkB&g+BN9|%Y%CywaQL`j5F3wxm+Rg zy6Ok3@5?0SMz1!dtelLA))QhU#oi#lzJ)N%rHHJKHQS{zn-4Kj(ZampF$4Z#5{Ec7 z6sxg5dQa}C$rG(?j8>5Sd$E|uLBo#DFuw9{n1;RVsZ3z!eZM9ub7FBbq<;7E?q6tr z_J-{Ba>aq&FH(Gxeqg!0Miz61sJVU$w-R10dAtwo6)StB(Lr`fIYQzw<504s;edX2 zblJl_T@&V`vaz%!rB?)F%(bH=iNu}*v%4i*>M!i0up7J#jm6u}0t~(}m@O8);r#*$ ztMqgL0V~BOsaIa_c`TQd#N`NR#T!vBOEFSQ$?KpT7~c)MbD2lj`mMN>1Zgj)`s*-c? zrwC~KaOmCb(SqvQye-HFZ%81+U-FAsOkFQYp1~CiBC3vlce9e-Dt|G!z3|M2d*;n_ zwMU6x8Ewh82fx;Z6bbHiIG>ICuBNKXC6Y(Ek)=YBPdOO-`&nnQSX1Bha_Wh?4Pt-6 zQdQ)eK2_i0oR%D)(bam%+mf8pj~#*f$;7M6@vAUnud3redsV5e=O;eHUuV&}QY?K! zfYLs~l)@MOIr*Lbub< ztAd1RF7oQF2XZGjX1#{^V(nbseOs*ZlE5l4wG<6>p=8};=*m z|3u?SdmoTD={@|<(>yuEZyBspob^gi-HnA)RWFriDzWRONsZ5a|j2eAkrg|cnW9p~e=3`7n z^CHay`kLm5L{?8pS26HqeE$5RBtHf*GniR`2~g zH@=))`(7-_w1j3peR%v`7Un~K@XE)|@vAGA3!zVUYfRnA(odrHFB!BQLuEoUwcux0 zV;0@l`Z{Su*-hF}1EX>i9O|LAyhTDSIq@mzWrksVavM`e(v~=v(EFz0mX{q|mNr1O zPrIF&;GpnHirJA^rWYFH@&*&2JX3+dO@ZC~2TWVth9fn32YV0eUW1_;abPOb(-oC? z>d&mZqnt#4HC}#D5GXgk_&eE7!SihNfKWi<&Z@;*Dl;ebi|_9Tj?W?IC*pS$HaK;I z5jwxUY1NW#m0c4Vc8W>8z|UEM^Op0S_Fov(N`(zg4aYbsF=)Vfta?6W*!zYaP`*JCK@}K+$&9wQO+X;w%8QHY z|DA1Y*vTkp>Kk|0$uTa1zYIrSy?A94c0KkG8x!2V#MulsIB7NJ^PRmJi;Gh#RC|{S z?hRzqb*6fZo5pTe18zA_*8fBV!E^)m+a(e;F&5J zr5GhQwP>6klUos!>2#E8fZLNITmT`Z&_(jGfZ*yeu#6b_o`z~gEs8j(9>h=e!RATs z9I|`{>$tQSzyFq;RmZGM`0T$QeaRt4(WF<>Hu=>gNEB%{RFD}Ie&?3&T{Wp^{6>~` zW4SNW>T^74K0QoVdh>YxiV}-ey5fWLDU1*CQU$uW#t{nh2n-2u)wNFF(oXK!n&@#Qq>Uo2{uh(NdpQM=w>4yyl)C_n?~*@h^$NT4f5ykT#=3Uzc+=qA4Tm z%St2rv>J}&U{=|9*7q&zIr6Ww@vd9*<#pSdqx=_Nr6xid9d*k0=5aAD^y*ov23cIY z5sd>tSe};M;fEht%Ip-?VTuK| zPTtqBLL(|#6E0LbwyOPox!T#XSe_;wU4U!}(IkH}nX~=7dWRx^vAS;Tqu$G7IiEL} zJlc)QCrAHq1!+A0#(T_N&-)Ru5k7`L*u2lfyJl%=9knkJqI$m9m6$Jj&LB#l`N6TP z{rBmQ&}g@R%0t;#(K*`8NsLwB)~h`|lla6XkH7UL#kxL~zDjaT5q!*o2vSKy?Y;c# zdUto9qBZ(Dh16|+E!>%m(y@-b+A`_<*glcq@T-LT*~`EBYyXlNo&{d7UAZ4F1g-g4 z95mN9OfY^MC84#;3P@pb2FxYPiWA47ix7)WWMFLnkZ+aY88qK?Lq>=1Jg@5Qb# z8?M8hW*~5>g3YDB>4$TrCvVr74grjq#+7AOoO zjR-;|)5FMt8m<71Ehy`34;FyMp%66$he5HN!;ml(00Y7IU0`xD&%jC;m8#|RE@c6n z1CW#yIAY64S?@%Z|Q?2En&tPJ1Etmc^sZWWuIuc;r zR@DarP{KfyOnCR=jr(=MiGUQ;IwK4e=^UP{kr);fj=;kZ0foXBxiEA%Qz!(^*i`r3 z3l*{ey5O0H%vUO}?jmL$eGA%HG%q4dX**2ytIlv=2(hkInP*GHDZrp1NEKL*Dvzdv z5`1!qb}VTc+3SoWpTl#Ak3vekpE^KddXaR)P!(HSO|y zk8`F;R0Goe%KVM}L)j8V$yb@a!di`WM&px6VMB9c@tOdby}I(JG#S%F7((tPWte@P zT2^Yb-&J_?*mc}>S#{gKSNi_rUC=%5 z=k7eANP|&1%FldVf_->`#?Fehkdx3Pj z++Hkgrc54W<1a=Igt44`w@LvlE7~VQszb$r`r!`l@U5J+Gp@ja?6ZWjZ^)jMS;uHB znV!)%bg$GLu!(=XGg}qQ-VGp<&-K*8m-KV?P?(&f+xzpvtJ<{Ns28=p_Fh>y9Jw#S z1wgXUK(|#$l*)C&!zMR=7$1z2X=GG-B|!&2{b8Q|IN4}G!J0MFNB$x7Usrq0Dvx+* z6+izah04y__YIzfuD`eE_3=}oM5nX14q}xr(N)D^7u}-KF~2gom?QdwUuSU*ly3PI zKHR}YA$qEo*Q1kPmua=g23vpCu^P|vq*K`G_Ls=erV_}LC2~o@d=l9V{_uO3b9;4z;R^_I>V-tjcQRIuPXKH}pS(J)YdPf}|^ z``dMw_AoulSZd||nV$2jFv`qQOKVT>{o#>Qq!z8Ae!PmBZrBa;1TubW!oz-|=1C~gS=IA$mKko#Tr)=S;eQV_yGf{+8ZAXt;t*z|Yv$2D;mAp!G5 z=WtI>f=Ev`a!r~(WKSd=#3AYh(DlPF@2ot&9=fb6#7hI`IA~Ag^D(bf$Yj$5L@04_ z)I1te9$igesXeo5vyJ_4lpF3)G5`I;?})2=PH}7h=Bpd3;LyCEkXUK7E0&ZsNOVv! zjP@Rm31uT8uJb&HapU0M4=m*LNk5-mO6-2ysS=9g-}*uAdaj>o{Kq;PY66P_evgXY z!mtt2Bx4}J=%c=B_IQ@C;_i5MfLyQ4qf!8q+4$Yzo$Tv2vlmHwd9Oa!`k2QnBK`fX z>9I<&)&s36fTj3h2yE;=VjTe76-xxvO@xEp6V@ttUzMoE?Mq&pKO7$ zPq1u2Oc(CFZKo65YVb?3w#%5>8yuT6yV+nz$ag7wh`Dlg*sgu`4Tsx^CNeD#{l?zO zesE|2d87qx)wEXSL87*ix(D8}Qf_L=HposXN*W$Inxg{-cS^cND%y-b%mF)1GGFGw z+wk-{-C#`u*b4v#!H5H31%&CeV#A2vlg7z9Pp7}p+tvAMGVDH8QCOWTY8+{@9uU`U zb!pH?SIOJw{&z|D{~Q=F`c#J!>WMi|!W%C}*yc_Y<0nQ~+zU%#_lXa)q{!US|9hnz$v7e*?l)Vu`%=$$4W34KYqB@62rZ$#v)fMW^~>|Az# zN^)GLV0xwQG>uFiU-QG)PX-Y*u^gJPd%WHD*rI}RJVIvY5hb|gesh;fucQ`5Tmb3E zBZaE%kUZ;Co$PIph`}SxEaYHcN`ts!VW4QmsYDi5E@Kw4WMfiOCb9zHv>gD31oebZ z{+`5bFwA#% z?c^1k1WobdJ*q~nW)B=@nDjBcZI6Dgmfdc-f3Gyj#X^C^WSQ`t60!YyG9iPYH*N#J zo+8M-3;l6*QLS`-HMT@rJi4|tj#czdEIH`ia|6?=gaqQ?s^3}TI=H~Bl^_qX|gnyoKQIX|m-48333{f;V&)fbNL8lH}aYJG!zT&~+=fY$FL z>&v=ch2HhIXlb2(9!#&XaH|Ry{`T+d1TSnfZz#GG`o18WSVaBN8xjjRn* z6`38ZZF8H8O8m>+mFYhVA&VdSrem(StWo-2T{u-Dam1c+)@7@^jgFMG=o~sl*OZ6x z?jP;*Dw7Xffi>J5chZ_4h*3YMn#iL8{-Xq4xW6DwXM2r3uRS5>^v^FBVE@HHDLl6?jqbTdFgRmFxWu8TVYIV9EsJHD-#Gf1+ahFN$GIfN)G2 zqRBuWPf1|o2|7$yW<6TUl{=apmJ96r+5K5=A88>>|JK926?otpomc#v0j21&v+IzN1^cp~lxXBS`PhoNgT^Y(dOVSzHoa zB)HThcULpUp_g;e4u%WHao{AJYIWbuP%EKy*Vfj3{#7A<>Wg<-fy%BVQ8(1Qh#Z<3 z3gLm_L`C4pM-brHgd_Uv*?Qmzh&pUvkcI*dlOo5s;bOd*Ffc+`AgkfrU*}49eFfda zv$9x{yf;Gmp~a$YE*6qL1o)2ZY4xqI@}53^_x_d!{;3^@s1y%|8^njkFrXnYvejqh8bJY6I{-Sk@y!c!DNNYizbU{dQCb|nD5-uF2rE%w@B8KZXSzDc(Sxa;Z#sTg*dt2x zIInnTAI^O*w3w=$G3O_A0fG2S4G056K?t4ha;z~>H&_)MkJk<5!-BG2_A>^bUJSLu zW++-qb4LtG>>*Re)yLAaOx@2lQ0wY&B3uRFPZH&G033n~&ny)JnRtP_Cyf_$(6EI5FTut^yV0YDNJ z2*m(UCMYNi42q%&gA>D`fcmg|;c^*lD1r@<*`8*&bH&q5GGIo%#P;`vL@dkE$6vGa zUHtQHp|Z9~tj*=S_v%W?DMIh@6+n|>LPkW5SaMK!6RtjQPqxtcqb&kpkccui7D-C0 zP$2DF$j?jRr+Ny2ZJ^LRB!ZFrmuNTFxj@PSFHgD%N-m@H+utRZ;toNGDAOG1MPw_+ zP<^l_(e>`JX|s6>6cQeGGg!iAOSk?t_|5*~?JD@RoN;~PCT?%~e|mZJ#BHMc$X%xY zHeKE4HfDc$dQN)WOAOwBZO-#JKJBimLphdJ^^@_G=P%lX$9msgT+#WiH1{K3wRq3Q zSB~O0eqt9P<6a9&t1AYIt`o}vcZ_$u&q?SO@XBtqAX4c?Cm%TY=E4qeT3zyw?c(l6q4t)4y&ExJx8Bx^^JVfJ-&j2IX_=iKa?}iW4%L5F%Lu?nz8H`o zzOto|ogTN6%v%kX=^+csGMBxJySLAEDGCYVNS{FLxjpXwNCvdTe|9-;)Z^=o5jnD1 z^0ocPpr7Ao#kbH=?44`K@uHAb4CSwTl~-9Kc^y6nyMFl;aH3uzY8T&fSzwKo-6pX14Y90FCh09? zrtHo8UU|j69&8ryI`4Yd>Z`>Gm;$`~=b!n_saL@M%Puv1aCEw}f@X9Lr!TW$koV+< z|35GY4;yD-h%u(`dIUs9B0 z;eY_GUmL~Gl`Adpw+*+P@n~|z?~hG}NmJXO&Q){uG0PS_D(*JXT2T!U@gn)p+AiVT_TN;2nf zJ1!-aWa^^y>D5D6tfz!O2kM+QmmdG}vbPcJ<$_QJ#t8h8m~2?p5zG7vw@X_PaTfE7 zHcAZAHvxVy>wEow0Pp?Plfln$oOaGa`vx%-S+; zaC32$0Jc~hP)51`+#3Z?q%#|f%QTHP)s(_PBh`K*0=ph}|CZf5qRJ?fA3ym1MKpxC z;q@rFP}cI7A;goezu%w67>%?U+D>)6HFS4ba{PY&??m{j{;S(5ZZ1w+vN<}BU-XBr zRjG0WuqE6n zEhL#r+qYesPVzTt-zrtUC5IEX#?LO^DAC|3X52)b!bvTbx^+v2@cmb?$fLHb@4^fP z@JZ#M8}4G$W6c_aqx}3A_`~nASi9||#y;--5tRDbe;S)aZ@kF(8FiZa{B+&yN!@;2 zD~WjaP>$ypCV9`d8$Dqu65G|Ty3IVy6QuRc`CdNmV*OQ@pA8;-Ka_T(YU}J}grR!G zSS*TMO7AS}2!;m5HHaub3s-%;RsVAFgqW8+xNwE1ZGt3kHGTHDrYWqtZ7%QmbhDz- zvn~zQW@;Vlu`l00(nQdPnm+eB#`$%o5=AaR*-ul$e;Pc{dq5_pM0sAb+1g1j>o^<0 zBVl?PU0pbw<6R~8F(yyAdB3jQ00kq$F2*(9jHCrZb(Ts>pHkZ{=ll@)dLkT0bc37c zo=_;8Ld|HD|Gm6=G+9+r5T&SM`#V`Obt!U#DZqBqRDY{4A};BB{K!`F&9<9(vsFe# zkkj1=!NKPU0|T-(M!XchtDdsq*?A0b4k{(JqJX1g4fcQ_GMPP$A5$bTdlk0<;^A`$Y*MG48XoA#@JXr27WzckLFjD&Cgv? zN)io6Ru(0z4if78)Xzd*zHJ~?Y7SlUj#=gLueb9#wl2r>K6{#T6(gEs6PJ>Tql+Jw zLqj=BSDxbR{`t1b4XUG~AfvuIC`%;RA*7sAHLmMly>eQydH!E^EoQOQ=L8~Dfoqmd zO~YsYx^Aa2>Su?Fz^A6*dIJ*1N9=5gB?g=#PmdRVn2-_WR1?YPl=~aW$phafvk(i= zkf?xXw-utpgO|M~S*(K)O-;YoccdPCUzSj$YKPrxets769?Far_^+e=zjqeb&hpz* zOYrq^wmz@{8Cb%)%*f6(=K!6jmKOq`MY_??tQ*XNAP^oAQ25%QY4Ca=jja>7a=-6IjHEuqB-+OW4#YL(|GBd&-}H;VN`FMwxH?&w z?4)dHsgT64h&2hK#( z&Ou7}REV6;LqaD?b6OGAgHzfoikbB~H+qZjr{M4cK(IWVCyd4NU)sBiQ7f-1{b%|! zQIAinsq}IXTq(VdhEUC5`pE9Jy*Ra#G2MuWp~9Y_kg>0~yhy48DsK+Q$$Ts_pSHGaQme87_^;=cN~ z1|5$|Rv87op+`Ee6GR=tfqj1Ac)XIrdQc>(`^$u(7?)}hLDDeHq7QC@lbOxKv>j0&sc!m@xr!qk2M{aHS!-ey%?7TK^Jfq#YtlQVnMX+s2Qy?QH($UqPI% zvNl+?@?{Z`5Qy3<6fp504LqE>)kmSgK{GmLT7L0McVC?82F_ZzVi2M5WP`lKy)I2) z|GGp7RUpH%N^)J4xKWq0nL4W^N%_2bW)Iwco9GPx?&I9klR zay5hI(}!}9k=0XmcRC5ra}us6oecO~dHPsfW6LC8up8=FJkd6gsPd@iZUK$=7^MwTk(Fw?^uMFHVRMWtgb5iTUi$ zW?$kTCvkK4Ep(>=QbG1cKduiSJ$dpwhW~?C7C_?d2R(CQRJM8mHs7CzGd&gwAxrx= zN<^_q=OH?%8Lh2rb}&0NZzT}j%16n&eeKE@7FMp<(NGmZ3D-Yq?65x!`d$+v_Zw4& z{-&~3i1c-e(H!}esyOZIIlq_{uGXuo)Jf_?#O6%v#-={5Nb~4=#q8=H3{xuUdu+U? z?{>#k(r)-Hf9&sV)Apyop-K5y`TWaOV~sDKdm1>kl(2PuXkdzqax+=Y{)KLIvmt*~ zRA6okbZ2k5yJlQukR*ln(LzzsRu)?YG<8DRMM;<;CTtjEvGWFy3ume~93V*+`9Ejm zKp?`U^xyECpwZJyiJY3s{E63#19iOV+MI|y$gWFWsw4OVsUzt~xkw_qku%do0lRU8`-wT ztLy~B*B_ar=Zr;YW9vmmbz-dUZNC3judwuHsqD<;Qp+IC=E0j>vVWEHn+N}C;s{w` zG;XnEYis1mS<(bUDm)gLt;v|7AO$UQ(P_0zdxElw(h6Tsy}z1rmGFS65~^G8k^&?a z)a3GaOj$h8}FLI8pB??*8EuB@1CEwb~;KS1sFG zzqK!P@gyEnVA8(V2Lj5NeIa~I$Kq;s2zv9kPP62Qx<9Ngek!a_Z@8=tcWg%>TQG1pp344~m&B{}2R!YPx&akNcEt0i4Jq1lQ-k}gxPva3 z#?~ij=T#rRqD?BAF+rQ=UjHM3Nw<5b7WGdm1r^B8L}!;4XxK%EqsNTvFCFbY3C zn^a&d5-XDXd2IFVTUC?hEEi^d7UmzmaSb_~RQ5tbHqLTIeU8cT5d`$3Yjwb(@Ca=J zaC(PMMw?{1(laK#a9KBXak=pLmTD=Uf-mZ6pNPaVYw?7E4pqzx`kWGjq7YSMYX`Xr z8k|T?8PZv>j6H*JIB6D$=1s;aVfMN{lMX3U!v=t+qi|?|?~&X$NigA)J+wf>d}5cU z`-CvG<5JE|Q6N%uJ!vKO(Z%onuZFiZ<^G>+IK3a4YF!Fai~;G~#&hR97XNnWvke4g zG(~a4%5{p1jBG%N0$gvKSos3c1vaYaVL&&aJ9C&S-9Wz@!o z-_n80dGSA^j?{yte3KR)KLsVFFrA)xGz#dKOI4u9h z*eHfa6W;s*v_lhq;@GW!O;X>mmDNn)dG`JgPgLlCBwYhTC0rA}*=^QV8*bLkZnJIM zwry{AZ8kRBwrjI(ZM*TS_xlGkXU>_KGk6|-fb+iT7yo;GLF7Xi!IYJCc3d%kBJhCA zpnHQ#WO8yra)VMvQ$&CP$CJ>^KH#COg&&*^3)#SuW{5s-M*oAUtTz}``o&8z9sdh? z3B6~HdW4(pS0+vfArb>zCVluI;%cno1E(@!zPLy&zovBGHP7jXL>al zDH0Uv{VnF{e8q&+ab~IxXj zgS#V!l0nc(zyH|QnS@f$WZCw0h*-|4q>GN7CdB><`)yUpfw;2FO*a*8WIluAeP_)b zkEB2jSj}(ag@la%)uS(jf{M!2@+Jk6uJ64FAcAmOtydwK~WNbednH~OvPJdBO8IKGc0Kw?r+(drwNrrMiPus&H{3+7bbWUCEZ)F6Brb5Z$H?PoGvQNyM*4y45>umvj>iw;T}r8?xQT)4l4AO{EL&`gZE8)p?P$?a8K{= z>vN7FqG((^YOBhrVM@|k6n-i}FaG!aEKAMHuA%&2LO|$>T>gGY#*0O`4?O&jI7shw znxQ+7az7K7NXf{vIU8St_ygX)rA*b^GF-0N>X#;+=#40JK(ptpn%9=}VhXWf-Jh=r z)XnPsXl+&|L(sa6nNl`xAti|&9FMW_^!+oz*urc#`{}I#8s)>AGzX8Tk*4_6{3(;s z@x<-QEEX{#us~|VZE};^=VG!jI0q;TNgf4{Iz-u%_cu8l`S2vtOzua+NXmkhnzK;t zStgk$J=8k~c6dsqoF?KhM=Wescs!ole$uJ|0b}EKH-q*=Mza%u0S${sH`JeHO5fyd z#!tPD*_;k@-sWcseDMU3R`@v_;Jq(Yu#k16qFm(#^wrwkveEj=gzR7`gCj5*NUxy& zJgF+9V(nAA@2*ap!+K~))Kgi=sVqIlELko~EBjbCPfNP-C?TZXuVR_RPhPHRoOSS2 zvHtXI`U4Una1rmfUGdQRMvC(*H#d};Jb6SbId`c64?Z~(k#NQOBs@SLS0v;o|I(;^-<|{0FX7l= z+);Lv^4NF0PrX(cGV3Y`$7ez6e9eE$kL?@n~_^wOPQrmztQNapdUNsxmgb)dh5`c54$bW=of>){uZgrc|Z_xHWM z3@P7)GhLE)JM#2XhL1_xvnH$ZP`l&(nyl;X>%5TLuv7wLrfMqoDlifG&2P(eZ!a|c z7RAOlP$KxCa>Feysd#e?(ABVd8c-Q4w2(=xDrIFXHx-eD^*@>ExQ9|c@G2PxZE5Gx zU>W$2Mm3twgGl_=f)O#vFoWzECA|>!h9F8k;PJDHo)i|C*SJO71*4S1=s&EXp zKtxJ)lyawPDM11kcmhn2tyfl3B-@W|n`5pWv#1Sd%`2ZSF4UeCrw*cOOE$}#u+^o_ zlQ2_~NnA-bAsX{~=q|Hzs3NP9E%6B@?E=-KoIVGfb~m-KgGilKAoh`}u-MjBVw!lZJ-54QP9?m?-L z^+>(_Oo&j4c#IGqPCuAK-7Kc!S{s2u9oiz$sA~!W9b^Jv2Ge%}g+V#N=re(mkkAo9 zo1mW%T8zIxWA5sI7$zSxCn5jTXhrsp|Ad}CEW6^htCue!VuMNds|!j4drRqIhG6_t z^zp$11ELKjS)d_$Gbw#62+XV;S7_f0Dez{V~j)O+$?I8P?X$jeOM^>y%e?6OC?VQ-nJD3;?$MgFrv^_k*`0O z6r(o4!D<62=}-bz0_+jf5CcK{-_lS5W`j9+alSy}`up>y5d#DQTd2}t0>J{)7%;$m zuZNM{csH_#c`uh2It_=u8D%eDRLk~Xk9TW4IZ|K&T_XGi#Cur~rNMhxdhy{WU;;3D zUI0+wS$#9#5E=XI2b77TN%?;d@L*5yLr(O$Lq(H|b_^ZUMzaqY_jM#iSBU#pCR-9U zCaJ0r)=dH%!9;@xM(uQoan#HI+(W868Of!OuxbfGX`WOxv?vQ=H*@?4l7ig6jxJizt3yY{^G_s@rmY5ZWkJA};WyrYfNZqCppY;3n|tahrf z!|Z&l`SJc(QcNXn`S&6C=75Ix;X&D_&dox59N>o#L_uY>ds}BbH!gYfjx-;_)T+On~Wn}r?Gm9GfHve z9~d<4v^-!(ukCF?+Y3vDnp4k;&=`|iLEvj(ezU04Tx;7^e2mH&Bv6W;VRfc=wLEJ{ zV0G$k$j5(H3K!hBn`;n*qF96d{=U{WRj-&cL_LSN;oVt|!eTf@QwpdOs82sW%=hOe3QU|?NZ;nAZyk8(A;sS2H z(oCdZo`7LMAe=ukkGrptl=2I6XE#R6LB}MCelz7-LSY^wheGLM#BO!h$N9(lmX}L@ z6_Qc^Q+U|COZ;;aN0Z)SXs~vJ;|8*(Rseswpg{2lwzN1s6vYLA307Z(XRD#<>h@oc zwXQ&ZMbC8cQ>j~A>$SSR5(1v7C($X2(nS$8)|b9h^DtDw5(7Ydu?VezdwyVFF>U&RB2nR3_IV|!vMi$2BSj!GGhESg}uhZ<6 zS9kR(2>T{eJ2%DS0wV`A%>KQF61S0es#dbI2`lnp+hh$WFgDmze9ek-rraO!l#g9t zYJ=2Qvl+y^y&r5c_dfXG_b6X<4@5e7;v~xSsqk5%j&Q``8OX16IRC?VKsULPtvt@GJnT%MB~2F zb-!_JeYWZP5E-JlX@ECo=!tyx=x6)>^FxEn;qP`&_Nmb;G{%~$;lK80p}g=!@Rl1n zRAgZ7BWipUo47#$cvF+zKMv1^fO6RIUlF9-3llz6tIm?hO-JEg zgAU8!4c?x*q0yDvcirdJjcO05GM~0Mr0svJ<< zEjsidbq^PUt^&J9>$%iZ_NXcG!TXC-o8L=+f3#6p_w+G4&?srH%6r&efwa`59u{cc%7rqYxr%aM+Aq9}7&({$BRFmh1= zHfQ|)pn?Et+)3BBSI@!Wp6AmTEj(ff{gFt;^1+uHy}_xOdK-Bs3>E_t1EVUOU5oD@ z8hD0jmdcM-sn?T#ue5@3m5G!{YYcds>)|%1C2xSK%_7Oj*DY&Ai zsdEUT^xg>TbGY~99|}B}gW(>MHe~b?$W!e^PgPRL%59Wr#T-<3g?S^N7T6UIdt~0uzxXs7}G^{Yd!-7;?aTNov7TZii3XSsHM-AiyFNJaf ztJvFpKFy@enei$`ronNPV>f)Q2Js*F=)P8RFU}kzm>~Q9W=K8#f^eAbLp-w`_eKps zD6R3)MrM(VqSE!U=zP}bJME-0LQA}*KT;4l#GmewBPEQkUXrVNkHc<#mb0_F3f8vqv%f>m(}&}lm;fa8R#mgm& zWFFdG2NFKyC__A%j>p%TAA5fvm|()uSS5s>_Ae`GNkdzYuOl9GGX0LUbelJ~eS`0JL|$u}Lx+bPjH5MiH?CKWS9ArL~UsS&{#XbVkA zbER1YDAbO!TGy9Uj4%4BabuU1gu6Mu6uBB=X1B!Bc_ehw79=Ict~Jvp^nd1B2j2&< znhEu9!cjKV;H1*BG@gC;{t%A>@0q3FM~^PVN-p%;es)%jq!UA==Yq5qk>mMYtuZ2h zOk5Iv(fMJ3P1+q9#%_iNn=-A86M~tuNT{13Jx| zav~1h@Zhy>rDF*%vekKF*o#t*Xs^N{>^*Vya(LEIaQ5_KA9!&eyn&q7ST#8MTh{X| z?L=AgbP9WU^r(8YY@KuMU2EkNQSZS9;*E;%1c_NV=!l#2e0>8HJ8j8_M^MeS;tr_K zGJc{iD9?6jSpmuQAu3_Cv(EOxd0;{d-y?t-!kCH?#az&m^9q&~oP%a5gs}Qcq^rx8 zrz%QZ*fsATyVCtbXf{d1mRZs=DpKyj(=s2_?Tps6JWEvnqV7FHOKQS2Ukf&|EEM1E z7FCO?y)wz#nX|F8p!zh;@gaEPMFbUg%Tp`U-7VDkg5=-qI|0Xy{_Doxq~(G;lGX9c zE4W~W?PEZP76=1}0LmE~o|+2Vj-9}PpdM(6|Ngm~RLaQRc!o$m$|@1a+A3DvR!LYE zZ29!KOAm8PLW%J7>8FJ0DwcRVY5E_mp_(o@j*%N8%epKo8kq-;Yz(0f5yeRrs>#J& zbRL?kZ))#Ltu-n&8CU9!$vPQU>JL9wsz+HWfOxT#$|@M6mW-O|_EDMDS!kh#sK(d- z^mE+PyrCiku_O^3du~_N(drO?e371BqW7?p(9NmfO)*U!+o9-+>c|Sz&^;@^z^wSN zm-@gK1^UxZ97B1JNRyJBntrq^ulBN|7X5t@7L4e+kPLcX7(d5u?AwaIKjM)Ck&$C# zbBks7PMJ465{yLOMzRt;qm%P`Nh!$z0g7>R-sL&ZEK0-{2f@!UrmU&L>{4<(aD zG}r{aGs2201vYC;%NJg#THZ4yW=`#Cp0ph3Zc@f6EY3lLd~%3FWC7C02ZkZ8B16<8 zm23+)tk;U?BX2IYs1t_f+a`)*5?9`D=d~M|NDGsRcrN!-^^Ttu{6G?n#ydf&r6(gvjy}5wBFl}6{2-j4V^~pAwEvrU4o!~s7$pphh zJ+5IBGGM#pnvNW1o`4gTbFG>{h8eQ6(%LgQW*ECU+#@u<(7cxazzI7#`msrtcZ*NLSs7y%a{ z5h^3)62#1($p5^sOr}7oPD5o>?yXtrAjT*gm@t6Ai9}hZGZDJHB-*f6*`OonWQA?1 zvf;i7A-5phC_SySexZbwpkn>H&JkVfSv}7qBE0A}EZQivSx}|06w0DnwLy!YZZ`z% z)t_0tdP_#spnbgRepZi0ib&MQjSLgWg;$SWWl@9(5gEusYaj}D|ArhwBDvWlJO{2U zlNnF)-`vFvg;KFbm#4gs<3#f4C7;cz>O>=}kuWsMZ9%lj!I;nWN?NqijRw4}0Dfq` z!Uo<1CE5xFqVrX+;yt7Md^y^W;z-NJEEYyw?qWN&!l&%F()ej^EtBv^%q?`CW+5wX z23Oi~PD)}j$K08hfuuwR0itGBag-}$gfZwvsK1#_D=y2TzU>hNa2yK*>C`Smk$Kw8 zTm=hJ{VMmO(yQobYbZ&VYtse##;2j#Q4B1f=e$Ctv}vd-33JkcLL%9lv*}fV_-wBj z_ad>t*|o-XP=mR$+rk#L75nz~*n8)eF(IlfrSTx&@3Vn0RFU5Hhi)H7h|>HQ)0B_cn5*PLn`5TVD)(6K2W`a zS_aicNI(RND)^jAIw71;$bP(Wt}}Q%lN7cBm17X%NnDhdNocM_qGPRuz7?6F88o^fchADVF4(xpXVJeK zl^YR6iq*s@GjzL?Y426lcL{JUEQ2vvtv+pWKzqBSt7|lG(Dgb_)VcaT@%OdZ#ExFI zK9yQ;rt~k3-dFg-it4%gKgkmX%y(m%>bs%%p3&y9%553#)fO6(YAC`mD8cwMjcIc>wYpKd&H2BFTE&u4F-6VP1!z5%cO?UA#&y$LpbEGvK3Hi@lh*z@r;6jJ205EJ zDi$*5|GLE#3YWKd9vmwoh*&b%6S|@w6d9IgasGzy6iI9UP;xOPCB+v02WqK${!`}P z&WOGh=22_GU=`!Z;jH!X8=TRZizL_PQRC-Cw4@r zg~iZ8>n@Vn{8GtmUPz;ZBfuB_@0=+IK5%IaVSW{O3ZC6sIb(!1QgCXAHL{|ZALZ%S z^{RlVTv!&X?5nVGugiz}${+-=hi5wm)-vn3FCQ^x9sA=qpOGS8l=rKj2j-Wpl>8PZ z-2a#Y#dMBz_ehz}%*VS_3sm?;*KW_Omj>j%h;!L%BwGqp=}O9RYaA#JMMx~M0T&$< zT*h{#e(AvJ3A^6Il`H+Z1|dzcG)U#gY64AEsnA3aa~T`iN@#4h4nAYb35ArQsjs1Y zCnn!4ZN@~`VCJ1qo}fz%u)Q?r!e$M3=?7CbJ{>H`T;9i_H|drsF~{BXl)c6F)>8PE z&C=D%G^Ev?yTq%Sw*+SzTJqeLS$rKqCntUSEu3L6s~mPcV_BzZOH5r5T)-PeSB21j zCM#4qI2FAj#{p879%Q3!fNyi>77#Krt;1RWd4^LIXQ7(92Jtf0^crxU+E(T5Ru39r z3BN1McI&NpkuVnz>uK5bd^89ZLaWVR$qAMP4U3kXOT(swDr_cyo$1(lk4<)pDy=Uy z7w0p|FZ_Elnz}z-G@2r-tug$$$lQ8MP|B6w`-eC6@mJYlJz1xpGa6%Ys>IM=T~qsi zvd{f}*Aap%mH;vQd7p=-p|namV?p6;Vr@V5h?03sS#?c*Yu_)_l^ebK=S}DrUGg8e zVd?{m)Je!6cXhbdmlP%>3bwp}R(<9{2a%9o+$)Q76wfk2Jr@fP1LXgdt1QAg+!rOh za#{<#Q-a_Qh$e>0Hxa|Qjb%xXMe&x3O_fMMF-SHRH&-qvFS@we3wB>_VX{m)+r8ch z&zzh@XD+yU{m3ONs*22wCLS5xq>X5rtI)uc=gyb6rV(4nL{H<6M|qSv@S(wbXQ`euH1(wU!PvC@<(Xc~TPuiRkcp)*jY1*J zVVni*0`>K4o;}M?s#ED4gaWUKk82OkQBq=PYs1^<9gkh4b_Cb)*3d!0>MB1Ck*p<) z+crXQRU)>}c&s{A;?mwII*`bns0ymvH~3$uM`9jBQj1=&rq-KWJ^4)`{fF!bvIA&^ zY>)|jk2c}7)RXvvdWRj8SVGOs6SSO{>5N9vL4i3*j^B!NA|*Ad=$#ZlFjdk9W>jgh zH2pG10UdCPhP=!{nQridg_8X&id*%r8d72yWfB$a*Gss6E?sIAPZR8ToDLt#fAN)2 z5h9IUA9No?K5RYG4aX{UZyns5pf~RAS6V4LPea)Lc;(M{8(zSEJT_zA`glR8k_*EG^G+2 zF=!+)9?F}(N8Htv6l?+|L7uu=MLQ3oC%Xjz(jw7_DR;yE07ylo>=-lv6P`t)FC@EV z==JRF!J;zT5PX7<<^M$ljZvkfRx?(*m9r>GZByFw*8`;cGC&6Y<55W&RRsP++w7K~Sf+C(%4ivWS#$gp z5Pf+qzHHzmuK+<>ph%lSV;Ow5;_$*Rb4B)o(_bBvxc_{qNKuQC|5hTIXMM$bTzp>C z$o!VguvbRimrFn!ObH-@f}dO|zIp|oS^qGC7!aHRH^d4THhbD~%WlKVD5;>d-pQBV zQ6jX*!WEStbpIZ543d-4yC<&EIt8QPdPV6 zfh`%bxD>AV{QX)EaV!f|4A3a5wJ%YEohIF2d(0qy7>Q2$7icatE0%H>J!0i=h(*gg zM@XJ3#Rb+-O5?uw2gk*GXPbky3KDDF0Y{!%cv$tnx3{zRoYRu{M89vSyjY9&I!HJi zM3&o&;Qm?f-iA-Mdiyj_!&R20Eg8T2yV#ih-tE$Ncx{)z>;OzNjWV6K-v3?j#X52W=i-wEIWfx% zeqU5|S`NnJ0^HEr{3)4|sZC^LzC%`FD|-D`^Q13$E7qPz+_=By`$nTYT@Rj!eypZm zT_vPFwmDP=Q8TNSCr%nkUWhajhNRM#R~H?f$ItIh4aQCj-N+j>j8}SdTE6Ipzu9%?0tMth?`q$0urdU!OcDe<39jtTpP!8=488N6;g%j4n4} zJW5S%i6=cXp*DJCew!f6)*jM6+={(;#_Gs{Fv9ASO+p?U$ZO;m!$FIb*Y>6AorozP z@%;AUM*f4Eg9$6CRXsiY+$KBpvz9W?D*aJM>W7Jo3|eG`4i%03S-E!R&^MvbON?lW zK!i7J5Uk8vDO>9#Ysm10vCP91AX>Rx&mM?TWYv!}nh)!2YzYDAPRU(7&t8; zfCc5pNOi{_Jic^}B1|rP(0h!JgwNb6+4J|oTLKqn$#mXN=${vVh8=!Qhb0Xx0-CVj zTA#_1`FT6e!2XA)cg~D~)lE1j7WatL`8T5`nvP6-0E47RT{_ab)(PaH-uK;7PSii0 zoi~hW!Dz0b9k5p59=>3i4$CIv*SW+T(}i<=ST@_9*FN9-DU~vKRo9&S&5fF)Lwl^shSa5--}^?#_>f@;>Ma%p zoE)3prreVF5K>))f+9d$Uw2?+$p6}T=O_-FQZO=!K2X9Kj*4)yyX*b&yBmi#^ZMfg znHKCES75Xd#&y<}-@p00h}JGks!XVD(ao}OQjx&GaTZS#Opc0e1=cznD)BLDA1O`J zMW`Tk_Apbm(3I4Q#E<2BMid*S2X^HVYGf)BmXE%xKh};m4w@j~Q|2?!0&?~@O>r<& zAdNg4xGrcopcgoTBMMusdWyE*)K!>*tIc(V>W8l56%DByM)jmyiXc`yD<4S&LKCNo zP9JvA@ah@E!MoAJxF17q-hOibHcKT|Ne7kIzB3#MNbSFwT9xGrD_OpUTbY!GE+lsuep* zmd0i8*+(7VP0C<>$!7KYQc2PB-e&4MM)JgvJZv?Ev8^)CG1I8h&KR3OP|5}-w5I>DGNtk5B03I1j|5*LFr7=OKuJ9{@ zm$0@RtCL6=6FzMJmvZEw^Nd)}zwdvRPv3V!iuW*qB`C;ErxIVpWGC28i4D#|533D- z=M_&lfqpqkG*)>pF2CDv)s#Qti(zApvtsi#fnaxwkJTmsxQm|}!HZGD{ zpq(Ox&sgxm3%u0z`hijZ5@67rmaMW<{qMJ|S>c5$VfNE2;@^?vKzDN??1tJa z?0mwo?)Pts$&*Sw_l0zEmOm^8CrVH-d>UQepZe=`+|a9H^&cD|iSa{7Y^>f)yTjMK zg5T^Cb=$qL9Z#%U-6Ap!1}mI!5!pO@V0Rci@Fc8<1cYKi-#CcNQ&Qycbn;OlLVYqT zUNEgP%$j z7DQx&WeW%mcb-6QhAj;$emLSCKxB7lf`wh6r*zA>XbmMi$~B-Y@Ig_8lriPq=?-3c zos_MJw_mMRMr$nKQCx^avl;f__9)yVP2^p{i`Wh(M5}EQQ0q};p_sjddNpri+QeG} zav+T@xqPdC&N?d)I8(-W?rswft?}+<2FFEHyGr_~w0`zwrg*WBqdjAk$)WDLOR?-L7XcGS4g_QPc3Z?De<1fglhxQmcVkOEe0L; zYf8LZs_C;Z83WblxrJ2I=~ESll}DS*D@u0hA;IU&4p+x!t${3;3Y_vnwL^J5N=haa?h0bRS7rx4Nzj5fgK8qd zf=}9o`)NRS?-ADHeaA=TW}-|N}>6X&HIYWJ6NoZ zBglR8P4tT?g$*EvD#aZGLP^982Sxd`ErWN73b4^Z&HnxKXg~F-& zL=TjtQ3NQWBqa7AWc>8wnEck5h#ZIg={3%&#`_=bxW{ zA}*mFSln~!E4;v-TRvfzXJ062qx9XD$R?DO?!IZq7U4zTD%3F#F-UpOk%~evl_M|< z1)KEYx0)&`oFPPyA3eo_ok#I3NEit?zJ|blpE)etzK;+APu483_6u?1{+T}Mx?ZL_ zU}wYu1f>x#sopUv?ij=IX$z zz%j^q5!IPa+o)(S_sjp1V0|^+GOVQ$)JWpWl>nc8Pl7Bt#8^6TKdQeh*s&CsWo5Qin>>@*K^1WY-$xCtxRF&SIQkK%t`>d% zyb|o_`gseUR+}!1tML#nM=F&f1(WvA4&9*5F=ZQ9;H=&*StHPRIWmwVguuG zN)|1qOIHMMQT+~!c!yTNlE_4x*REN}aw}Xb8d{=MRrwKrKXpIn5&r1Md42P)OO-1> zY6>sWY^}LN$d-xP#v)oEa-Q#;=ZV*kuAOZ`^dB*lN>}&-FvwA*7ODe!yZ4Ad;vwsZ zBsIF`TS86L_aIAV1aDK`Qu&DwB+OHdZ(( z1!vA*9qJwjKQ$Z1B2cK)iRYA)jiE3ChSokX65R(*`S!7zamLZjZ%|ptIt8^Z*XN{<=kFQG$_VXtsRk_)>%1FWspWfgh^n%uK~9a zn4pK?HZ{>Mp#bRv0L%&B0cglzkFB&okqQ_XP$WPy%hJ6Rr^P|tP&5CaI-o$+T1a_i zcZ5Xos7D8ZF9Pl-PNonf?ALSH5h#Tw1y=fmg32yyRu+n?&=65Gq|CPRG zxt7%TO2cJ|uV4PK`s>`tZCBuIRG+s%4h~4#I$z9fbXa2=5m@i-!$=QNCW)l|LJdhl zI13J_+(=Dh|IA5t7?|gB_~vA2p*JK@IMB#bb8~-~)uSV7hjUM_ss&hzP;*FSzlS}b z`h1H$6nbwa`b?|F4Ha({Ay}Uzx6B4k9{{s3)Llv8w>`V$x-tU zCR?Z}M+rAKZ1_v*nZnCRK}jm@BXI^InBUVfMWx_&Ac|1{IwiE8er`NUYh^^$-@dYR zQqD9d&KS@KV*N9<9Ncf~&piAJPXG)(HQZXNmGi^yCOrQ%;F<#xjC^uo}tn3=Sr}vAu+Nzu0?%pXl00ZmY%xGR!cLO~C@RI1$77HJLhh*JN`D`3| z1O7b#9maG@f}yFhIJi;V$0m6SP8@mfQJLy?aO77hyb>$=i}zOgh~{)LEh0>)yOgP6 zorJk~KKxF4_)<)N`%TYZ$BcqKX?_gcY2z1`Y7B-#A;{CEcwXz<9Xij%;Wu8U5gLT7 zXrj=&6)i0P%u_0G!yta_8zi@VLz_#$I1l1Z0Be~a2;_u@|A7Wew9bv(haKMi_V(`k z_T2$d+d7H^_Fo?4%JWM22A9 zb7(|Fmw=3WA~V=(GbsAG_d3^TAV!^j-j~F`)0;jtCT`mB9FO`CVXo)YZc=QY~=uURxR9&9&6@ zbDk_AP(n0ge?HVoKAez`h2bfr2_fd}i3XA7EuH2bd(P#IY2nmHVU0g4>w7&mJfYF1 zFuYj&5o)jT6qqo`@w+Aa>87YBk-~!3OdiQ^^f?>H2w@{8e@GRcX^<~$Dw zC!&_ZX#q+!bL{Tyg2zs?#@AMFQ;oaZ{dXtdc1J;nuoX!uA3=uoC^@wv4gO2Oj~C*f zkMAuc^~z}^yE9Dq1z<=WN>^6FRRxJ^+58KBj?Ir?DKJ9+YQr}nYJ0>$gtoYaF-7g%5_NoRd zA4d>J%1-(kMmA3|SC2-H%n5`Fph~< zCJB~_$_)G$raaK221+^4d0A48Dl1Ur6&6;)9}E}`*ApI}zGa)PzpmQ%W6!ssY%m?s zOJGRsLSa0Lj#W(IMa2|>w>1DoykcUtWL!=mE6jnfwl03YcHn7v*DP&agnOaFX%LM6 zuQ-uv4r*>x-o*K$i}?+WVMu`=yQ0d{RJ3($TE^M?ycH&!;p>@&rrxC`TYE+L+fJcs zt|qdnlI?D%U3R&Z)uln$7Dc%5kLoAyeCNE!wgwfVWFxcisI0fw=L(l_FdF|k$Ou+& zB_dR3`_)fd%+Mj!75k2(iX2%Ai`R`f^c!yj4AH*}4RfjFIEn%)o4U>Pzd`*`yQ@aV z#)5*__uU>Ll6MTOuR2$dq5UgC357< zDvuaVR{{7p5Q^&Z-__ZBs`I>f`F0SE)5hX<@S8YDez4h1_|eTx?#*P5@nC#%DT8~w zkj`@FZ%a6TUZ`pugpZx114J`L?H1Ob_VU`YM5s7z%HvBgHTGaA7gG0V;1>pnu6%~F zGJ)e73fy3CCw9p>5660010BR9=)wsehDy#J`mTQE4NkHat1# zeiZ4$`U9Zl2a{o;wqw6Xnuo%n=nQgb#5G^=!-3&axeXa~RyfU# zs`sqEb!gwI;j4?wQp8VWp=dnZIf5LePHZH&Pb-_TcK|}k_#CtrHz2ZJi5&ARXz*AZ za`N#Bi6%WTraS;3PrkQS)jr&~@~AtXNy^4b!1g^?3>*E)6 zfb)jtTtkD!QYOLD6*8E@pIChim*RGseCi0`Ot6atDK-UX_DEn(dt!wCfx0~vWg{k~ z(Y$yC)w}{(wBpt)bFe`_@uh78lflkjRR~hf%?b$cP;|4@fADUtH#oG zIX9Fmb~~1E_2r8rQ<=H*VowZIS%UaoNl@_fjg-y|totQZ?P*N2UsXbx(RGnNr*)1S z+ArA+CfjsFfdC4vQhhAhvHW9pAId^xkzxw*MkX9~x=)|5PV?0@mv0>9Bt;H+xvuyK znOmu^*QFq6FqWV@{%(e?`OmAH;rU{OD4q5R%TS6!Qnh>284U9Yc`R&|$#&62<%+Ah zjOER>%^j(2CVZt1OQmIDhqPtIBc9S~-6KWK^E)S%E$BW^th6F5CBn6kU{S=0v3qWP zeqhkKTd{e*|2C)?W>Fr>=xLciQvEs)-6URjn#EvTSn1>%qXEG1MUkt=i2XhrmA2Z; zeGf79Q|2Y=39Kx>d6bqf>t1>;?rrfIv?cOomi0w(D)8}EccPXVg7LOoCEx?Opa#zD z&CiCUbjF0K5&{7Y|=f8X4`8%$9>E5LlX>9+Xy!{=&W)NITz zvwOdB-rH@h>=~}xgc|BRKw>ZW{0)XoA>ndc>{SC|q; zu)qffM~K*`(VAs%4#2j_3j{EWKX(l!2s$p3HMu*a;Me0`)<(D2Z1VXu-talyXkW1p zvbxP_ijZ>o=yDJCjv|KJi~VqyYME{J+v9$Cd#d4V<{U+~+`gSPr{vLo#{v>T19;;L zV7{WzN}|mxlX1knvX{0i+U!QO;*rkk=bBWA3Xl_i`h?${Uo+AP=jY3QwQti-CDgv# zJ^lHiLWG*M)Zbh=tfjgVKS;T@qYYaLD)2D*5>x8zByQ5C>)y#e9CbK;QhW75!NiQ+ ztm>FZD7`h*An~4}{84Q7m{58ro#Xj+p<_^*7(-o{gv;2=kHi>=9RW3$w;qKSU@}sC z)z|;QlGtnXrS3dLAu=xF4Ya$J8t>*=9gjvN9;T3rv5cW1XMwr6^)jd}bLB$Md}x}v zlw>E!zT6?ExX-PF!1}{=3f4IP0{o0g_OB*($c%6fC8_&5!{(CE!s$skZ!oyCBu+9| z0L0?2Z?))hbU%P6(L?Y*@a=k`CdHROZMw_k^i-@nhhDp>ev)cDLJ9j7P$^cVx3ZTH z2{@Zt570^!C@e4oEvRNx)RWX`?rWH3`<>wqSx&$Vzt(}7e zn8b4S*Yc+D9c{U$s)h!}58HY_azm-{!bn3LKzW2LqoVQte`d%(XtCQTY$`QNL8oDu z>@_w9ow2$5t0`DhpDRo7ku|2yd!2oitlJCE!znjlC%~Aw%m=FmgB*McT&SNWQp0ksY+z64>=US^a z-U0-Ei#|;nqO43MKiHS)%J2qj9KijLqpNUe`un0A9RtP)M|aogmeH+rcb7;vGP=7) zN;gQSUnEC&hajnhgkS;64}YJ(;Jv-O^X`53+;arNI6rsxiqFjk_dlF{zUy^@QKZQd zNxdyN{><^^JNqV0kCfp6`YClA=0#@@IlMKz%YD)C@sIFiALj|)$P2+IEAt9C zLF~yN4ePzBE*@}H(EnqM&)cQ-&7`s~ub#xVUdd;yRO31rN4<_UUFgC$<;~SWg@OsU zKz(B4=PW{G6uxKeaTkYuTjzj<`?{41gWrvKn>$?is;fj;6T-yAzclK|J5P1yf)Xq$ z;+>~Ly2j&c0w=u^rs$&#RD+T~8W_0MahD2mFlHJ$7Zc}@6_Q09Tt^6q^bm`&t{QO2 zxWunlU`{d(a)LD}#BKCQUREXl#P}hMx%J9#g7+Ea*>8buTEt5aW~K&Xfra(uaWv5fH;U%!h-vp8Gh{M*;1m>yy8UsT3QxYcx#7n{wb!!DOK4rPSXbudmi zKAcD|pB`M(3BY`o2~W`>K?jOvJs{eT%h}@=I>37BlLG7zhf>gP#E`37DUfi9l!4~O z{#?75!n9L@9lJBRn$FMcPLzmouDo9BWK3 z^Lj1aWybn9??=SDF*kLFyT7Lr*(|RsEWCm`&C<&c9j9x8Y70a1TBX{A=Q7-HYi(Ah ze7S->+xTKxBg@$9%sjsTWNe;r`N$c%{PHJ>e{qiD2?^W?Sqwz8DoDtShJ8CW9r4^dNV?vbE5Z=GD zn1#KERP+y#q-lB@2^bq*Y`s5TnmFKYTtElc%Z+D013b$BjwipYGF@eEI33z=V0M11 z$iBIRL5fC`iiRPC;mZB0CSB-KXx~aUToke0?QG_#?^o;8jGh9Q*L+0fBH;1v@4MhPv{#Rot?U-} zO)f*Kzia9jiU_Bs_D&TJZpfW!&a*ueVsW|OGAC!ZTN354dL_MZ*zxZS_FCQk>wH*m zE)nSUgku3CtD+&~QM8j3X(Cu-Rz5$*cCjvvuqcw7hA12v4!28jG>GkJ`{l%TaZGvG z5fsC`8ye7v85KG55AU3!HglWuaT2noD*JP@8tHpa2Zj0ER3W{_k!&sr`~=z!#yZD_ zx(lCOg0l3n+ml6l?;+7P4egEkv*NhSUP!ORw1CEu!=yu-NGs+u2KpYDbTOT=g|mC3 zb~_7plDN~8A5p>6$X&*+Q^$`ihfVhw8qTEMOq`Ke5ie~61r}v=B1Kk3dUNc$RcYNs zdg+J#qPV+l=wp+@{gDxCXU;P9jY9&2^@5LP(wR~v=kzcc;J zzYK|@T_gzbf0156mlTI+$7%!(mpJ1p{4HUKXk-ttVu*X5Ml2eF!q+sO;M_2TouxIA zo&8q$t=M*CIc&Kvt42GvftGd7P9F}Oa;l!fK|i;z^=Hdaf9uC5K&w-DoyAto+tAZk z>rTcY`i61uV%73obCa}qe!iY~DtPqC$4{biX86gXzN2?UoO?&VQDkjPgh$?~)WNM% z*YWBFFRhd^&Tf7|f??RXa~&7DEZR``rpfz~IwdfB321y&6=cOx-tG;Sg|&2?kBnIH zPDlL1@Ke66m;l=2v`|_Ya8>D1r}lj@ACPT}&Ei^kyz)Y-RhBPy?>JK0ey1o)GT;B= z=%u(pdyqTTTbH+_{u^dG#!g&V-OS`_qx;%EPx^SJ^v`IMu?CO~vbAY+akh8-3_%I~ zfo#}#c!*GH9zxd|D~Au13d7j=r1g@L-GBXJPNb&;Q4|_2E8q{Voqw8+|1iF-boh{y z!C~eRE9@0e|Ai3Zs0@^+bTcc! zGpfl=g3xe)J<==rA^98On8ltE+%jHZ;+ixK^Ad1=G@v;^4uOi$L6;R)FZ3|Olii3Q zPA<_eO6|E)=WxVg7hG<@tu8Bf)o)~A;HMU)gk*XMa@c*rN2f(XC^`+Wr%4C)hJrVD zEqY_hJB7aI`%cz2toE6PxiuAMHH|vF^k|@?lTS1KmezQZ@3wf)&t~Wtc5^TJyrw^= z;Ml}B_h>XKVsw*^w|TJG_51x;K+V#wq3`K;nnw0)SI1jRNq-CpX(0r;fVlr*JKJT^ z!6Sgszx47WI@z1MTKm>7|Ke~+Z6vyqG9INv@xyA2lAPnQdZEm8K*?=`QUH&`sEJ2 znQ|-qctwK^AFggM^u!OBCO&F5bMI>(3Nv*-(=OQV5?-WR*wB%&rshn6*%o72=@L$Fy@o0qF3sL*4xU*5@P9a;%+kM=^gE!mMPcHl*MkI!?=yKqwLmfF4L zrXgKX77EdK5x1#3P-Ec^dq$F6KSH&zN((K~i*&f@T+2?`n@#(u+F~`>G)A%>0{57?_ zL?K*HteV1ceW#kNo|0aCzpkE985)`JNkwfwhB%>QLEAl}i-Y&`(J^{tDZ*f8Td(W`N=N;hQ3quq$fkn6{fokLVS|{RPmy$RJtFRXRU8pY6B6} zvF?V21SW(}BuFd+@k|ns2$)o?fa9A7%SI~Z5UeKxfaNJn;r7Mmlg+Z)4TMl5P80&= zmc;@0$N^NK^dr0NC^7&B#?h!Oepx>A5a(wzN?<<2tKE)R)_?u(>ZRXAts%nTas+Q}nZXmA%2{zLyVb7PqbH5OF)vtR`W?aqHio^6* z`y2+BRX1F%G`q1}*2u_7^NphiH{2ljce~>TAj?}IbmZseXzY>+VDk#qz zLw`veM^4SjkPr2mB7aBo_na{~IMRXZ=nJj$HK9|-;STIk5X0>y!&{F29wL5@B5JH6vZ zsqW%f>E})ALp5Q?)+&tJymo_~pPT!kS?NRSJSCB2{slGZ7NZ8)#v0#lMrkMA>a;?7 z#Jh&l=;;yAlDrbnSDF3o;nU&q`X+22Lz*gaWX0kbhXiA^<|10DyTt{I!=OkXm4xtQ zeRouI>Z842y#6vSfmN?sGOo%2Ou&amUTge|-Rv?YG$+p<7n^`Y-@i-2xvC&Mg5IIA z0_hn*)5B^F7MtCO^y7MlYs3A2Tz%rMR6So*))awf5a)P9Ps6g8cZ%zmdhAq1o`<^s zI-YT6UHo%IX#{| zzkhrzI*>%QH0ogF&Az^Fbhvhw_El|ZSfbqRX9`n?PmmQ=0sshv7e3&cSrHAP3J2FX z*(8H9lxSnlE8Z*VM`BTW08q<--;Nr_OltPU&U{HB zBS!mtT+x8P6>s1mlare`ADpDBnxa~S+wP2D>qb_Al!PxF-V*4aWAeQiW=lEpI{y5Nu4ELAe!ewn>tydTbp{hi(ok~8N){Ntf5 z=u1IT!ipy7-R4lt=XsA2#`iI&>K{4u)w5D&gj`Z2S;J~8x06l7tOan$Q;J<=&>Z{t z)hX1=gcAIjb9;T#jeUQ)U{oa=XIUl=d^iVuUy~;r2YqrbHLYJ?u75o7j^*u;Qv%$itv8Fyb&7KAhMEveF_Xe^#oC4#$#@qjAK6VwB8dR2XxDej-H(8KTmV zNMCH7s2D&n+=wN{Nv-J*lqYg8;+G1N6JQ)3z?A}^uEn@KJq@Gn_Jh`0Nnl_R0mQ7- z@JHd4Fh<}6x)Gcz;#~%C&SS$g{Q{VHWrP&N2PW{pfX>G5#nvPvl)NxzlDVy+#s7Q^cp?A-T|@^TklSNPdJ0OE@5EsD^x+cu)_4mPCYL z9%N^v`;Bt08P+jU#Uyb2hV6HF-7~p3d3gw)J@P!Uu%sNyCQD)i1VC|FF#sx7P|Rce z@O3K_f*6W$R%>pOW1LbOTwp8U$;pQR9nx$9qe8#X{j4(@Z~mT7{i7=Dp~se4?tq8% zbiHGp{*QyBqO`d~D&(QRsVH=QQmrLE(K5q2))P z2e1U%u?REYuGhPeIGEr|*t==e$QRCA2_@ms(|-^8xYLPokE-D?J(A>eHOOlI!t zu`Xfp2fY7CJdg8&S1mgNe(N+K|NXy)_1(yG!V1SR&|YnLxs1M2*lG&~%{;$*GHM=W z{> zM>)mD+w8uKE(U$XfS{u7ay^&VNuDNinW>-)k32Z$D637&Zn|mY!|9&QVO#1bi&U>D z%YTKP%fml4Hv|8uANcJC1x#Cr9PL4m01*IzPXNH6!cvn2NGeK%Bx~62pf_COc3xoo zeAN*)#L)~IlpSoJK!N0tc1m{M;5Y|NT_jSq42`me;Y=tG0!)q1Ux$N2_G10>iPh#chYQSi+(7=#|mli8bOG+gT&dHshdc~iLII|E?6r`oD!WZW`~-0fxIIzWytk>o|;f7{?Fp)(ykb?+3RY_uhVWmrqz z4M}pBs?CZ1H*17qbeR4buVXU?4Z99^=^k$@BiB|VOT!40NA|!E&h97(ZOG?S^S6j{ zuw1$F$+y+#byi&pA#4vmIAe{`tpCk?$Gb=dc8pj&y2 zuxGC%^(>wA%=+=4OOUGO_O636RlKe?=8*coo8bu75zFI zCERo*&9TNl{Q2-G^GJC1G?i%I;_mx+@zat-TXI#+VpU*owzIi#v!5^j@z<{&SL?CH zS)Y!M)1qy+6Gba6c%vr*j5@wUF6S~+I(Sqr*`cFFO}4i?`Bl|1;gBf zjy~aL(|&gRU2uc9i^Emap1%NwMoOyS?WV>!AU^l+t~}?snJs<0ZUj`^5)t5Ooh&a! zPmv^2#&uyyb!4lNK36jSSwJ_UNhDQtlEC|u0)4FtL$5{WPF^PI$Fv?ZHGkV;#X`}- zJNf66cbqtpUNu^FS{iPNEUq+9DqaTX_L_V&RQ#lz!nB;sY0M%LvFvA{A9NA`CR?@j zqunBUzij^Ifw=j}dWME+Jv;6Qoqiz7TtzERa14t(YvTe}*Ydk$4nd-ENHFi~nnFT` z_~d_UKI1yOh<>w?Dh@^cglH_KVwRVWIJ?h~)f~&*E&4)O6YMe56{TjFs{-5pYthVtLJ8hDDZi1a#Y2g3aQGSr1#67sjm zpR$st@0Wkak4CyR8m9NPLX?S=w5wdi?Afa4E}HLrN-Iuf9yX_B)qiTgg|)K@e2$6| zT9MMuRi#*oDQsgb*U5Ayz+Mm7h_-m*Sv~2uaC7x1SS;ki>N1%GT3eMQ;;GZX?Nhq0 zj9eyoeKvW8PDd%rt9Rj7+^2>K7iwhU%|ccq+VsgzSB*8JP=O>d8NvxN=_uL-W^ciE zV)}v3*{oawNqi}BiK*RIet{O3QYSH|{o3K0<; zj(f9dLHPAv_*cHr z)!-=l212sQu(XsW_)%}i%$(^}Xryx2^13Y;eNFWZT1$w)Hol!(^Lwv9(zopUQQ!Tt zB-M%IxlqU^%Z-1Kb01rof0ZMjQ|$O)cIBbv%=+G!e4Q_R6l%5?rdi&r2k%m}$^9UR|O$S5_r>P{h z|6f=3f~$}`zIY;CY<$RwHHkT#eW2S&gA=Yd;Dj?xhh{bOg)WJFKxIBc!$W1iWvHZz zciDBnx*Nl%4skflcrKvgU8yDwN6+396|!|{5S=LRipSjIA^2K~J+4-6O{VKIH5`i*>-#zHls@hJ8|^QB%>pD+SH^ zMISFEn(OPk$A}GPv7b@skpO(B(u=zXEfIL1CDKdnVcWD>Ufau3)xp=wiQFqk;v^Ly zkl?>x>?Oz@Vv5qnluu^xlgwuE=&4ibd!#JLtx%A7w~Wg#3y6p049Uj13E3fl`%D00 zeoqe^;s`7(Fy_$oy0MsiAe;er8DZ_gEsXVETM-dUo9N*KXs2MBTw_Vp^nKr= zPofdUVXh6P2eZ%jqk$ta05Di5!s-*tuz}6KPVO{4$c{#r5D$Zp2t(99qw0JeOl>VI z3&hv!a}Ro*Ycv{> zWTt+Sv*jbdc&`G-0Y=M7{SvDFp=DC5oBSq^Mpl*xV%oT2Q8dWjBrz-5 zQ)s7(MPVyW2?FG^8-DmD24liD^N793yQ;7|!^j1B^u3CCNz}oj7x7WC?2z{}#c}pX zn-=RkDuj;btiIh_NA0|y`x#yxp#y1$p<~Bab5yt4HW|M|cwok8Xu5DG?PwfJ`UF;D z+<||2%cV62dALl$JPzDh0rOP-YR;epb=+xo?9E!1Aw{}?4@(FPPMWlXR2wMs}}=vt-HRz=8O(8q;EFD#2aBQ{9fFj+Zxx%ih`G+4#$^Yo42P zL3dj{kV@3@y`TN!hX4Ge8%IwLHM7bDB?!kZgFI_4S15e?ocN2wPM|82o7h#UFk$k- zex0qE^9^=*f%2HMLx)43$x#(09!JX=`6sKMCfU9CQic}46z~$P8$3RwO>c}CVk|+& zM4OhP1>-5Av4bEQgtO~dYQnj8V$|mh8IpL&IJ7?j=_(&^g3YK7+!Q?t|qb*e2F%}G!<@aC4v|Aof1SJ zfku!2I`!s!i5AtzYxDA6z1l}zq4DzG8Kc|X295Z9Dlu?6Px#GrCPhB`5Nj;jzq|$I zv`y|xQMf>_Hyy9;DsSjE+&tB~C)1$v{sm;Y*x^#WaGVlNPN0pDnjL2VuZ`maZ?46E zXxpB-y1fZAt-Z}v0=`c6#G;#GYZQSF#vJz3S`nN0RG7?wA`wQpy@8Kt6Ol+{=O;TQ zgDzso(&5 ztMq;N{#ETs4+R#GzF7#js3hqF7QiUU%&13kS|w`E@GHFMW)ChqOqyX`@&y1ng3I@# zakk?|K=AmOvTmd}Y{73xaVVii*2PdUU}(MVHw4ai!4%UTs<##zUl{FLz7)b;RkK?u z)+#!$VmzAi(a*1*r^RM(_)*6*)ruVr+ix4-Of3*QdyTJ9iKcWnb4%LPf4E$BFgWXe zUp~8y8*5LzX8z<;Q!_GZYet?*Rj>f~yobtF+{oK@8MoKw_?3H!=WKB=H@7$L`6HtE z7@CiH*bag!_*xIAAe$Lwlff0tc^RMV8t=N68U#b)o#WE+v(tDD8$6t%h@G9Ri*>3J zw}|?`{FXRv5Qr(`k#{K!AdZIGQBn^l5MEMP#|SallhKO>M)MLX*%RZl69e!g1jZ=w z_LS|mIRR!BtN?&kK=*8#*t$)22mnkyql*KE>f$f+lKF8!j#|-{G7wrqtl$U$zQCs% zSu{CB5LeFv1b z2pq@5^um*HI5CX2)YLoap&=s}R2W%TID4PO=6_O{6)roY^zH-%U5=i|PzvTbI(0%> zIhF_Gu1@ekf4SFxKG|uFgwj&mwf$N;qx3>mWeSeX+FEqSI~lZdsRo^Cm%mhf^Z4J- znJcPejb)c2)Bl2vf?(C@;v&pu<3wfG&6(#@g?>li2vOq2dosFarDNrrbm3h70v;q|^k)-mS zj7+yz5nsg;->A{DS9_+sV|Jb5q1J81q!B4~a$U$Oqnf_O&FNQADiRdykwDwF5e*!Y z3#oxu$0SJeDLe zjG+>RN_kElGqL$f50mi40=)-BEw1Q&!_H3sWYM2%1dZZ&MBJ}&^?$kFH17!c2Wpk2 z@fXmpKu$EQfvGTRohYq0046el>cr@Kced z^R6c2GZdC)YP>4{RM`Wyd&_e5)$4S;yyf)6m3&~vABj-1BC zUt&>G*9&%^0~+m<^0FeG_H-#7s+vo*SFWI*Za}aqr#Qu zEmJvaJ(xK!w3=@r^zn?P>@&L@#i$Fa$xCgsMakwnB!#FyzP+P%-`{T?PleMApAe2I$B@ zFW_&e<}toaSg)eNzkSUM+M-Yv!^FAdEj`` zvHIPwZU@s#hf}khXs;+7o_JYBlbYNU$NyJ;e{`JFb%?|j-K$Nyk6Cm~%T$jkHwus) zB81)gsDy$ezK@d$tGmzVM74A?B$OQUeL8$p?63{G&I`NvS!npLVR}_U3PRXAabDmY@tq$5Y6f1$(giM? z%~AK8OP1;(1Gl$p$Jl**Xg}y|en>g;J-fKXGZ1{xRx~f0|7DAZh^)C4q)@&utbbbe z+sAK6);rn!^mi+d%g3mBa9z7vqfPmxpT zpPW`U*KgRqjuc|rNUY-8&+V^{dlp9f;Txv@?{edB>-={K{ieq!A&Khpp_`4R1qw>?! z(%LiE{eAV!jGnx>l)Xx=Pfe`!8F-B8T8(q|E48YA)_WKz1w2KLO={2q^@&|b7DD?K zK}3tSE_P1WQ~Vf>EOHj?)X`3XzW{5JE)x)Ndwlhn%5%Nt%$fix(x-<23QBCwoXqxe z4-&BuW3I>R&U7JPn&sV@J}3hgoc@ErjlVYyulC9`gJK~KrfjpGo7OS^MT&{S4pne} zpwIILdto!PkIy`MJXXH%{CS+@dZ7E5ZO~z9`SbBHbd0Kx(&$?5g%KsU3V!_X@L)-Y zlvp2CD8ooT_d%TvcxrR^AfJT11Ehd{MiIskaVysI)a!rZ((o*kH(5-9mpl1!h$lW* z+FyKcy1{eJJBO4p5kxDr7N?Wwmx?nv^>CcM&Px(C)l%>!5wz|#Ao;p;z8v39tHQu; zD_plS)0SH&iv9dJRWInv_rQTum@NL61OjzO^E(wy5)HZaTJ4rD0Rz8ZVG{FYHz)(-nI-iU?)s|3=g;Y-S6rDb1LMvl3|W%Jgdra?eD)FU{68_tJPx2 z;!oax*B5`Ksh>0F3J2zgo$@1|#BC52J1Ow&VY9#H&AgfvT^*eN>5_BSxyc6;#5jz9 zGfq*NZZQ1M_31suDE%F-!~t$GHI7cRa&qn*li;7n$NL)*`9SiQuTyz=g3bm5iO%#s z1O`aXlp3JhThwpRkFF31Z@75F{CtYI83@+OEO~5fx>TgIp8BDjmG+m#?mkz-Oy=4R zl-pR-dt?2~YegI6+g%b=$wO`uT4-|l{?Y_FO^aRjF3(ww3BFL)`ID1jqB%5;AN18U z)?fWc%4e&uk9k6~<;Z%~*F1CXbvnkA`SCu)|M<<_+1ityL%_&q32O2-+9wZC#TBLC z#Q0j*eelV8nmC1!88=-dT1x*L17W?Q=;6p^15u~%-_DdrBP}z%a8PE>7^*|OR$P`* zFEDuUBpH}E9zKV;R}W*N4)-#7bbjf2hrRKgasFBv1;zuW4bPbUfkL*V%)u&5$fu!k z3$Fn;1sZq)P^0ttb4p+-Mw9?w2_#`~@ZeNclW%OLvpFEwi^;4C+SG99X_3G=QcB#i%wtjZqIv1?%exb!5$5s97VU~9%f_#n_xJ?x zCTnm&7t%2y><$k;RF;Vm1=3R!j{LQs7)kIip1%r95kAe;=25_ASdE456@J)>p@V?3 z=aH((nFlIy3m^v^bXPZ8mE8uF%#ErKG+Lr>_@I6WS?WGINC*EninB)4*A3kidG?oH zB>s)N#vtj{dh#GF*HoLW7vDuG8@_hAW_P_b=>9R&*`~K+p!?mI|7J30NMgQyKeKIV z&Fh#e!ER-2n*^}`E28AL58If^YGHS%ZI=6TkcB3sI0!rH>U&!j-yc&U^WqC(IvVLa zn-1UKRU>+%YZ#;eq*9>QS*00r!e^%bS%~g`IWDR0tKR?4^gyzZupD}fG(HntojVhr zG0!#lwgQY0Qt9~)%GLgS;bEun)V7I4m5M@dRjfZz6v$%z9wr#%=zSatd; z%%qx5D`+?r1ve4)i=759At?uuLtK1mR$st|nkTG&lLRI%W4ft~F2}8uzPnnR7nQdM zkFEJ{wrd2%ty0xDF8`=+L6rqa8(E2`?x zkSfTRd*=#0iU-PJ?|43R80Lp<@Ju>=W)6j@k*`BE`u-G{Zrom>UJ1JQldez5kpX}S z@sD!qyR1H7h~Js*0-=yqGqdBMril&izcW4HUnWd=*q?r&5LbqQ)j}D#(FnUeI+$b~;pTy9JbmZexk!jep^hrjjJp^!iW5RA%4No)fLpu#`j6;%q5xg7 z8@3fa{_v)M+}pGM?ivNgClV8D6bI>a3@-{dCkgxAviYU=#}f$I3AY#N3iBWf4wGtl z&#&8rev7cj*bg~-E|rHy*9j4&0~%s_Pl1hhGb*o*M_OYL4G7owdM*En?wR(oEW;9O z4A$(2cmx`5W!>b!K->zc%`4(}-UdCqEiO=w*8G3>#W63_PK3#(HYl9&RRpShN|^bu z?C^Mk)RTG`1>0k+QOz@B&-#YxD`N8PPX%roA5s73x7#9u<_mgCj8Ur13_d#Z ze)ew`ZyCOfmSDE&@u#^0wL^;Bn(I7OV!yO(Q5AK`7-bEVhfD?Ks2ydW8Ba;IJ#fD6 zvj6eU|M|rZF&SZZtljTSIisQHoTIG2rv<7{S!#GyN<~>lbIsAOgJg&8@X`8Kd#GLc)a-xv)cCm@N%{*SXVskeD2L_J9GCR5JP8a zLmB(_+2)4~{2Q&&xFP~-_Xr$CjBIT(UcK6j$@DxwMwcg&9=o*x*N>s!B9qj|4G#Hz zbe-@KgWk;6d9$B;T0&wGV$w+tkiGx(>3&T$_s}BoG>PU{{OJ7rnqB`=`_rz~wZ?F{ z_1wi6qXD7y&8=&Fm!UaA-ER~LpNn_=qU8t2Fkk=seGU{GAbH!vum=%3q2{y3UH^|i z{3WS$+xEn@sM4Of&arnPS_RlG(ps8`xY z#zv+jBg;IwfX3 zO4%*0&c7W-RiAm6t;{Br2r1SbRa*aMXT*R{d^yyVW{In+uD6l!5J!h0TQynV&u9IV z@CR4OhWx{^T{t&+RX`9$g_?-jnH}-`c9w}Gccz9y=5pLS-=>_`mDVhf))Q5us4y}t z*JGLjlO>MqEwwu9QP@~Cky)ts@#paW ztv~tDcsG>4?1-FC*M>0Rr8u#3O0JVpZr2~(-Q5&$2ucmV&Z#aeRB2_O3!G@vE_|ow zNpDrAsy>CvPkrUkbc5Wy8v|;UBBBLdjJrgVzD5I$#=gV#k&h2~{WR>a&OHlWPvkcO zUn!KnD<+%AC*L=q;Y5rTGtxa$CvzHW3We|s(5P*UQ|f(s4zLNa<9zdqk^mkr{}0Dz zIDO@j=G<7E^5b-%HRwEWdSdp2=QvW4RRktp^0R3h8DwFXQx>Hz0 z`ouH+yH62x_coZV3Jk9#(fopt!~GQr4g-bT(-Wyc3urZ^$P4w(BwNs^HaQtK1i^wt zFCUxbFfg!WXo@E?HLACNJofF*TsGf-6bph!-SEG)9o{%8O#b2J2mr6wuvNW?Bk;@6 zLotbIjg&+tVxi$j!5p$$s+$gjG}wBLeUXDCFV*|kjJRA0402^U>m)QFybmK*)qm5k z5Aj3);n)TD19|70Aq_L^9Kk+ThqD&G0>?iBE1dPBb|jT1UqZ9=|5kZyNLs`ks^b#% z;So;R&IqQU-FTK(yihT4rIL@-cF;^7cez4$a3vt%QeXh6XNDjGt6#5HFjPN`PBw&6K0P;Z& z4r+?8qp(#olHOnhHVi4kQ7CAV4K{G2sw@V|Zo97#UG*lx8V!IJ`BD}psGWswhQ<#s z=im`sx{M2zG`NT0bbLiJI3QP-ytcSE+fS3~nEe*UH`UkYpWO+`=lm7vbSQQuJ&zfGj>~ zvN130Ms27TMO;e4?IIE@j2GmSvQC^v#?v?%{>W_jGuvW}+)dk-PoJh786702tFIGP z->2fKCy?*>z2Z#DE~{SJRc+;5h|BW91vvnjp{!TnW`_!;EQmMwTQBJ zk?EtUBth8T{==UQu@J&O${`4SgyeO;+ZIUGs@yo_96QUWeVt8Mx5@q^4;zeU)Mkl@8ctVPff$p%U>>Ot zBzb1g^gPeyxwEh9RA=#2oBed-L9r?KpmWr1mPx%<@5%=MVJ#V*kr!TZ)OcM0%k#YN zwH9_y(i%5B&x7o;zVGH3j20Mbq^~fKY^^?qBLMrTiJ>nwzqM37{P z;-xr^96$NTwLt^(q((IpN2{%Fir;J2hP&c@SqY|7=qTW6;3iM83SDKXaD@i?H`;H0J6+3l#hX^!)Mu@&DSVNFS!fR;;)RrQro|{H+M1t9 zy|?pewD~ewYjz*%MQO+&9eK3pSrf|{iD?y25LzUf|JO4=fh8Whn`>ABG12+!Ztn8-sk0S%oMQm=)K<&pN|pnM^NSJy3weZ4 zt42ZMr^$*20s$rcRHe5HeJ`WATvStRx6xTyOX95CIhFW_-+!H~S70;Jxy<+3i^Lm< zgzci=kd+w1Hn5qfO3HfNIUfv~=W>VPZy0`743FT0WFtKWj9?mU2(2MAaSg^n`~Oe} zKV7VM6JE>>?fhJ1v{eCOVKA!LB1BQNgCkZCbjPrN_%jnWP44hmd$XyvSVjK8(wnouc(OC~!>7dKg>)GL8Sb>jVFpxax z%Kf*!ke4N2uEQTCI%O}Q&+grQUWcrM%Q>&P#ch9jHdGcXFAmy+{ET1-9}t2@U^^$m zpCL8E2jd3_kO)len3_7IC>BLjwx~Q94>!3~wo4kEZsE5g#O# zpu(W`3{P*!BkLueDodkvgL$_0o)&l$Ul$BXK^p~BO>S3Y@mR<@v1=(S-y)~r@=3#{ z2Xaw9!YMm38Q`Dlr@$<_`74naL?n|9Q1a{|B49AkPvG zIy0gs|5Pa_lvu!3mPed6MOyy(v} zvpFM9vFW}x3^+L$hOIeu1|R>ndmlPGr}12$cbo5T{o{|vd*v!YtXHO&Lcc@F_8X5X zA|~F4YK+mQ(JJ@*#(B0hJpB3m`1fy{$w-IGNy4s{|GV$+zV|qPXc+AZmlUzuXu596 z-w6vn(=kwg@(>9aX%;BB`{ykPd{<}FJIeEZ%Lo;=9qf>clKI_uUHI9vE}Kdi0l@p9 zj{0o|UiWe(AR{A;%v3UF7_Sq=#~!2J zLbuk{Ob$NG&(I_zgSx~rwl692xNC}zKCHbN6>Sp?o~-<~`*=P*z1uD?TvS`@Gv;|V zS@*6yjJb17O8IIn*xQiv{kxCbb&vP1j}Mk#-KIzJ^5ah4aC9g9@pQsL7!#n%O41Zm zgh5gC9+(aMueNn1M=<#1LB|(}9H}-9Q50h23bs~Y3IhQJHj%(6>qxLjrL`k1VW-Z& zd~BITu(j3lB^>w{Is4?tsUnFKN89f+aVs+AlvLaPf|Lk1fJVFK#lheD!%O|B<6 z&LpQ;_U$!;&N~}z(E(W47^>m_=bQw!lgB*aAt{U)7lS{tSw05yI%1pZ~g8tAGPpR-9N$nDpaTC_u@RyXZ4CEwJDJslW8- z35WnvtPhV;?6qtM^*Y5Cj}o_ZkN^F+rSi>IX-(Djgv9_bqHr_-2s1Ps2W)_YRUs}G zNkkAPP9}M?9_%T_E-{0d07_)OL`6n7{x(MQ_sRDZoweZjpZ5b?wJ(Ahj{}O0U9~R zz^_J4IULf71xt-11jceM83;u(e}MU4!lGu3K}cwkWt(c=^7gg@xWtXyNnn$*xv(aa zybvWY7+I#`Ir`y6_O>lfJeQM%tajho#-8%65-pybOcO}W$e@F}&cbt^M}#-(r}TWZ z{LD9&el4jbY5Q@dgL=+7S{{mDc<4+}n?b2{yTK%tH zKGwV?H?Yhb0KD1!Q?tCxm>}afTV}fb{8#JGkG~TSu(wzE*lHW5su+lQPVsW#(y7L| zvOV6tjX^t&j#;g`OtHt8M=$w`?)8f%O#(s7IuXVAF_nvx&%Vh9#^QMR5-61oS|u@a zlf{YC<8py0NnkZ@zsA8f8_Z-9kZBb{VF~DDEKVO7Mh_k=)itODc{$joxJ8j-0zJ^p zVz8{3pFCn6um{SHCd-NPmnZ>_T0Q^}E*vz7gE>tCe?L%xRdsZ^ZRwhwEC2fY)c`~- z0zrodoIs@Ex)_hOQss!hZ173LQUi!{P6H8ogHe)<*TI)P*OwFY zfDMoWp9|o!oFF}W-ur0oA!-bvZjss%0TM_PwD1TI?1V`EaIks`U?bd!0LlZ@Mj(ix zY~eY7IsWW)UQT^&>!P)|5LEuSk-J=dZb1=Vc3CGbmdYOyM*t;7Fd)9r;-ewxX;l$` z04xY^Q!0R+cm^=ozFTsbOM?vS?GBu-l!9;<-2?wSj4d2O(WKLKe zY?lRFVm;4e%3TS&NVZYkz0<`+%ZXq)@72Y2QfyMQ$VH@3KJfz zNp3Mh;~;Giattwrv5}-d`Y`#56x2RVy1emRZ6MwETUO9IO?t)wym?(0xTEH#&g}F_ zP)TUE9*vTLbrD#)OkqM@-Em&*O2D7ptHt$)ffyrvKb47CagG$S)2f;zblTGb-lC1J zS%B@NU0(q+!b#s3z3bh0REb_mYO<1x-N~8V06NSFVem#6n#?XwWH&f7`bn zt@_$j{|nxw@xfT}hsKU9-Zvs|Lvh`RG^Q_Z7wt2%)B1h=sY)Z(r}W8al&;Tu+P8Gi zJ}OoaeTlsIWTz+`_Mi|v8mCCzqHWI%FMY=Z4o_(kX8QHi<-6Odd#zCPSxP%qOq{5t zS%2tnwrQgqm%ctV-h42+oNT)X7cn8F$A-$l{h92RY#kg2%?tL-1NhRn1mLAMDHfil zmr@8PhN=8{Zv3QS`_ymZWqH~FyIQSUw>jtzgqTRNi{j~M8WGy}nnsKWLE@q7KDr3)wKhz(T zB#q=DmvoPb!oiITqffeY02$R;T7yEiQcJd^0THDagk zc>KTmGq*u^KGOujN6j^#u8mM~xk4|VfgKXs5-opl)*DV%wb6r4?Q%hggDbt2rkR8k zi>f4iCP5Vwo#M+|>CwWZEEDFIW7|P{*xHnykJhwIg7Wd$I|R(mX-`L}AEX7&b+ScJacAt<#+ znc`1Lh|@R<+kO=|S3M38th7-g%Q`s5ps&@3YEQLXN7`(bv@AX@e4%urqbHuQVV^X3 z`doB+sqN~napza{M^=rNg3Pl2(z5Q&k?Fb3=bfrVS5`9YHEpF)`^3x=%>uAa23eE{ z&i9^L48zi=SkDK}qxtuO87;~dhY8B^SXP|@1}~K-E-CJ1G;k|KeRfTAocI4>-sbb8 zihR^p|M5q7+}zCY^+oniaWxzM+olO^Srs_-@NN`cr&2aX8QQNPYzdJsvbiL{<)@=# z-iMFahwYcDdVM8-_x@Vv*Y6sYy|ew!9%e;1EsQZbTx8{bEL8{-59F9#yBTW=+rFH(J>{gO5&#|~5zb}i?|RR%E%-c@LLOJf0Vxsxg(Q6atlhoN z_VYtTdsx_Wh$>OfJ8;O}_cJufQiTzU#sVX>s$Nr6fUbF0daKAwz8GwN$Z-G~I|WFYI^$HB4R+O(7>nZ1d85LMt=-g zB~y+2$b`K0^H{t~s0K>w=gmDOnL1iKI2nY7tEVr4yJI)xhwRELl!>IC)Nq)h!kx)N zT$EQ47d5Uz3*j$>}enq!2*h+yCRD&xUDIX#$qo?*Mwm z>FKXxXlc*okU>p{EWkL7WXgCz9C0=dDrYu(TCNW>5;IbHN0$O4BViQuRZfCoiXg63 zQwB1QkpgjZa>B#11q=%Ts7N?);E-m(Y#a@ZS4|v>=0{{oCf$$b?8LE2yKF<<0bnGw zii`joh(S%$o5YQl=uIe+a2rw5eA}S-726AkM7t>}OBMl0Lubfm)6-@E^J!>zAohR+ z6I~M`fUN_o2CK$Cmx#TBL%5ku?BH}snKh)eDmjH1y)|1vNhgGi!p%2XO-iPlgY+R^ z2+`~QxWrG0V`^ej>1C1tV}={WwcRn$_I2{V4^TI8cPO8W3DI%A3KuaG{k?H zjeTc*zm6z$T8xdgJLG7NRid2VO0@;cQ$9H9`0OvOv#JJrn=3IP?+lgZUvbIs!nfh+ z!vo4GZ{CrMy0m|KdNRLs;A|B_WRzbp6{%$L?jk3>gnSueBr$;MD2UmqW!%Os!A4)oMLr>Z~UR`#;1WQf4#6!4CyNc9^hcVm0&i`>pC& zCI{b!zwbAxXZ6c1);yjx;9{V$fnR3|b=$f-M6Ybt<~OW&)uf){Vz^NcYYF70zv$IH zvKt%Q-Y$$?!euVkc?T51V1c9{Sw{D!nVHC9o^boT?^o@eadF2)ZW9!`WG}nQu;(eV zgBTD2%<;(W-U)tXyk^XUqbm0BCmd=fE?lx47qM;mtKD>3G);4LTF_-^_428!<3%3wN>{3!+@DS$cV|m4s#O<6naS?#Xu(n)E zD&|xapD_>$xA3vm8ON~U0Ju|caA0#X4XPus>m_WX;_o1Ar- zb&QXGTMqmq{VpEfaV)GTxQ;;32l2yNyB)>eO=vxXqk4j3K&PAgQ$7ib3T{B2pmWWTrVaN(XDH zqn!vF$2G{2NGZE&Qg1X%8%995ul!nH_t)^&V_P-=OT}*r=4rj^AZBB5&8iptI#h~e zV)PA!;LU1Cfv<^osxXo6zg%W7LX*noUyeWcc1Pfco9pxsMo5vUJvc4qLd-R=SBBxBR zh96=_8|@^)U5GH5fxiIJVQTlPyMg6%9bqEvrY=jy@T#g)69EIN=^-ZHV49CadimBO zw0L=@xhB>5pxD0TRcJmk_9hwqfOd#iOZ9@%3`OdtL*{9005?(Txah$+u|FPpp&jC8 z$~NH(xgn1F>!N~d=(Y(VvnR<}-~Up0Jo~jR)L3v$1Wd@_rR6RRj-i>%2WOM!SierB zh8L*C(-ZQrRFi7a(7egWg=f$}q>(ccjr7V@-c|aAH1D!A${{K=rwMANShWQWNpa$F z%Ge@FE3f7lD;MTKgcu^;!<30SLOTHU=+rcz2C4-5lhfsclbiF6wx27mN*WH{VFXsB z22vxxs!ioA{fMS?9s=IhPcV>k)`TEpzrG?<*buGcD4m+J%GKtW-pt|?m8Vj2Xqnbv zJLzr}?oB7283VddRbKa(!YYpP+BsL1VXh18KZ||XiN-RyeM`=a!ff6|Z*4ub4Z^3oomlr0iDC1^ILzWiL?5yyzg=vLpwp%l!_G1|7xCd z29s0~%mt$Wjn7)@yc2YH0+r70geEWKbPZ;{PpfQ+Xxp`i&?mfGE*-dKl@N2%Nz5%X!DfN#V|J(P0v_;OPZPRxumFX%%U+UOzrgu74{MRbYyyhlt z;Ewix+|IH?`H^+4z9M;>Q2-?t`E)FGx*+xiV^5>?`y#a{3lAz>5Z*Ul#n?sy@FyF? zjs}ky;jD$2GqJ2n2fBe#+wmfGGOSWp5boD{Q>w!$G%+?I4yr!rx~0ahz+ZN>Qq757 zlO#&d+V9(lz9d;%%|A{u&xUxYqvRxym_G%1K8_lXeIsIA^T~NO4!EcoI>rs2{F(&Y7Vq`v> zlmm80O8G9TH=cl!v!h;xgzJleWZ?Fo!Y_0`|8(2p@fA69HTghw|GUyLRTC3guG)Xw zcizUk_V8eA_Y^+fQSBGtwjBqr8krRNl@**2A~01tgcJ8_A!7>t4=XxF*Gku`}m7Z}N zuD-^!T@-0HLDks)Zzgn(Tbj7wKKO<14)5)K%+OKuQN3~E4-+=AqtXa-zm3R1Dn4nZ zaxCj+_M|uFPoK?NoIgLaZ0d&?Zj-%h{IBP}(yqB^azO6JUXQ;}6fsLgNu=e@O|v%y zmgBQj-SOyn_2LoopE6S2alRuxG2&W^5L7HFDIUX`5yGKufz*dg{}-vWb?{+2uoVD# z?w<+PtB&6&XP0Z1N6b<15$9%Lh_`9$kEpfBwOn zUVkc1#&TuE7-}g}5g^}Zf^1gcuP*I;_>JD#uQQHLFi=c?8vlACH+E<4zXuUelONuQ3=9D(ig*4Fb~+FDA3Gz8ObQLv9jBiadRoQ4XcD>L z>5iyM$7-nIA|{g-VdV+tkjojYd5xu@U;Q>VW2JtMt9^oCLH{{ntre`Rv%Qm~=E{^} zw%>u$SJ&0|k76?08pokWA*m8?0$G)!*@Z1@p6dU5pOOE{y;swksie?h&;WBaedUzG zCSzy9JH6FiJnXim)QnSfmR1CD_sXM%(7N~-_@?G{5f2YnKaKqq(+2UOza4}a)POKu z+ZWMvc(p2t@#{*YU5G7P`8jVSR}w(sYa$b#V=C?>f#OPE_7z!`YC^3$N1R^EH)QS9 zk1$!&_BjN;*|54OXZpMU?K^t~PLdTOe)cwp>*%t>Z@wrQKWb>qz*qL-uIKuCb9b{V zKAIQWO3rlcN(!Zbdqd}6u55B5`4zqIJ%+o0LLEl7!NEFqMu#@^BMal8HZ{jZhi z*aU2GeqPpxXSiI%GPwkpiJ1B4?&B988HBhS0ab?matAa;(%)tl=X?ZRWRb$g?e`6} zLYPJM5lG)s1BN(Uj7xk-7ERR!L#4?$M~eSsO+_(NGzc;)iU4V zwvuJf!(SvUyUrz?V2Y|*b0a?|;*W79bB&RCK^AFtHEp^2+Pn$RI-itxs}_4AzJStQ z+=n=)=lss;4U(S%<vb zNUrppx^84%9lWD4bnLbE!r#U_@tIUaaVJAwE_1Nz_ay=v%6X?^Dnc`AE&oT|RFlCS z?)7^Is}Rk_HNMw_%MzX;`1hCI5+qcoere%>91{+eHET-C*X z2^}CR@^bf8`(l$zs@9-4^>=@Gudf4EroASMX9iNBKYmx=!82F{Gd%2G*vlYF>G)qg zp9qPVm*Sr$y$t4JC)$t6U5~hbzdxDcj> z9OHiCORK(`>O(&@p}WnA;_e@p!VuQ>{lqyXn}Yykc1_-uJyHI9A+~%SIoz%{#&b5W z;@W*H4|YPP$53L$@g(Vi+-7jxi0#<8f3T3io0^)=i&$yDW;+2uS?vGA%nu@PdBp7h zUSX^V1<>KaSp4YN;~BMUjQy)`Q|wg^y`KSs@@tW-h$+GOotZq@Xo98$`#c^^t^(<;&h2MPlUUNzk%79gAY1l%rIG%T%}Ksn zbX{3|eq`=1*GakZxO&N&66nY<$iA)#NPNfmIx2rMGM_M%`E`t| zqVGl1o*iV#Frjy`s5h!<=9*$ngl#~VV!Bj%ARRmckt3=9&L@JB8&JZGi77VvYM_Fm zO(C$?<}iVH;o0~}>Md~OzqJ{e)Z6}ngQqZJ*=qtVna2Ql-3&yLG7KcT>YDTpK)6ke zKeRX>#0eZ#i5y1niTJWqhxnu-)ERsPXu+Ml2f?AA67uI?Ej=5_yx4x->C!{JKI*+& z)%=&c)-l41H$vx4=-H3s9`(nWtc1Z?tvfOgtFxC2w0q14U=G`-t*xIIetxWSb56N% z|D5ewz;xxwD;+HKw*`7~{QTiFw(ak98rI%~E*2+?dSaraAw@PmUh-sW#hD9Y87DI) z@3u~TMv(LMSie`bzI@6RiH9_i$W6)D?F4>Q1ttbLj0LYGiWXLgF^}Cu0Q;^?et4Nj z0ye-x`avP{fF=m+La}^!92aIvFozU`$z!tuOJm@uWCJFJAh@-ju$&GY!~!Pbpr7M3 zp>)EBRS+u3M=E)D&IRX3-Js-P627oOUN{ItNOv7J%JkSS!9vAh0##^9t6b@-v?v`Q zI9%JvAPEO&S+}S;$Mjz|C>YXN#JV)y_D_Ky?8rU8$hrpM2>xk9zveCVa5*|y|6H01 z2DtWnPlIe)a`Ghxf22)<(-)o7Ot;ef@W{_p%ZYa&!5YTKW>W|`Qe4P^?aQl9j*9ls z%afudobc$#9b%mDZogDh0Z=Mvi$M)n!TI(M7SiFSW#$hxOhHtccQ~uoCc9~%H=J+V zC@o94q*NTz{(Sq^p>Eb^;a80nNs1-^^Uyg&yuLB6aafw zSjluS5OFT*t36N!2jf*DJ7G4aV5b6LB#B8Hi5WWp#OQJ2P_>bZgQ4d{IC67TQhx6s zimEN0s)$bmsQ5%Afk`GI0TAIf zoLLkGG3?;R6#6QE2E%z_Fko0M>BL3M2RPn*qa!9U4cdSjOqQSS9QOhjU$(KA0v|60 zz+NiZQoTr6moqwV!|?a4)xtD3w{55u+ia~Nq<8m9pOzqniS8|Hs;oSB{ zHCfGZ^pFT@=HRbHPFxn8_N!Omj{^iqLdp;}?TxMV{DZuPYR*E=in8l=oLy&r!ySvGUl9sXB0$m~4%hX+Gf! zv{>>7bvI24A*ZQ1_{%Dg-*DnmP%#d_(yJV{&$hMP6J>of@Yl6Jq<$U-yIar3RScl) zE(EILxbIz}JRRhrnGOpQCjavHspof8zRblxz*8zN+rN&I!t1*QMcF<1uQuQ?mS}GI zQDJVnOX`^%SO-|oMg{(r*Ona!6^bRl@i^8AOR3bZkL(X{-XmKLr*vQ!sCSX#gY^>Z z;H0|>uW^)9LO#Q(2ryYl=i#lm!bEa>A_Vk1*kp`C4U}0Bxkz`$y15Ov`MGI>@Ml^v z;xR9{pId!sXw|4&tJwW8^z+%IXMgsPi44dRbF|k2I7!kE7%0&~DT3``;&D=#ayAtp z6@v;M8xNc+!d_iu2#>^ZXxE$lfb)tRgTZqBu8f+$J`YbC59*lLE*~6E4pdxfT^)3- z-`YIZPh@z?+?Og31PS<@to>BtVx|4rpWxiT_~5}8Ih=P%9P~$*qud4>WXs@4?~~-2 z=Ct2;U}lTnR`%)D`9-npSwvao6nhelJ$uL4975H!8 z2cH*FnE+R304b1UH5_DLLJUphxUK|ETj`^8CJqXS@lLm7A{Nd*DM2no9GocW&-yIxD z;i{9sNYBBM@+20_06-*GLTylJXcTGGgxnmc1`LTw!Q`VF%&{(UX!_b)!5$C7)xo7? z2buO7h`0zxrFh%@2Jr}C32>Qs@o9Vkbkj6g&jDR*6CLUAd0_uJui^CY82Z4z=bG#$yMJ{g+p4 z!4Dii6F@aS7_T`_r zXSfxC^U^mooz?jo-oUa|JbxD!C_aX0*F1G-1Znc9$}%Nld|c{!ioN z_Sz8(Lk;2zcbIbf9@&hM^%Puj6j{ZjlNgl1$9;Di{JnX8n;CE!-Tbk(+!2jI`2@8L529 zHgR~x-r1GS4|?R#7j;=;wI7!s$e-wA*t%#(MwZ+EeEqbGd3e5rr=!L-WUJ6$fB^QwGeOgYfwF_mOa|0zzE+l+`?qE=G1#vBC?(o#8 z_(3J?a=|6`cv+JEsoFYsEx)C%f^SO2?>~K#`g_~`>HChNpwgwC!u8eVGF%J0oG@pr z(DI>CN%JW;aM(hr_q))_e(55#S$wPR=NK(dt*;}h&T)>#;{ID}8jT`Ofox(H?i`j5 zVTqUDr2N(YIIdee?yb4vnOwjztD z0%2K!g=12*PXp#j9UU6|Y}2zl`td!TXZ!XlZL39%)xbO+>cA5ZQ}E9Q~D!6CAxguqJ02 z!BEgPbWz8QEnHiwXnib}f1%vX!5&=P81lYN&%h{L>_Txq+_}(1&j49qT%~JkQTtbQ zN~!kLr>TwM$1-W)fUwTVT4T?p)e!8d3oMH8nWyO54SOQ!fA6&-w1Cm_{Lz;Rt?OT| zl)_=!OH@+zN>&#egCqe1HnW*+GVWd(t(c~Dh7no(-7RU#(((_V`}n;GH4Uk|XY$kO z*t%2(@=Az2Zhd%<;^N>=ssK8ah!&&2cjep;U&-5^PFioU?DRFo6}kCTyMd6XSk?Pr z;3k#p(;gzt1L7tp=0`cqX40Hb;VhH0|2>uwjsgq9OOI9P7j`5<`9D)Q-Iq;y(0aE( zNa`x$dlDaNzb}qZCeKEf&^grUbYEx$vfu4(N@`~|I-qqaBZ8E${B^PYJ=R)Dezg1&Fn z@z+b?>qIchp5m2bY)hvqe$JMmmSr%sT<>^;R4-&ahyP;2l=B*-qEKor>v+RIqISXS zY%c8OBk8h(Em8aG_@~ZIUJPsMr;xdFm*p5n;-C$~b_#}aow!h#?X2I&lu}touDLzG zGQAQXqof}3Kh96IBJ7RkV+#R5kv^V;{F`t5u2qj* z>ql+|ziWSg_`uA$JPF4aOljlKFWaI{QSqS_6I^~^Z_t#x1lRuj;b{}zpgw2zYv~^j zI(*#^XZ!H)%3&W*wLgA&XRe^6(@YwI_bQy^V{X&nmvB$dq~d}DEvDt!A03$GQIZDU zM>^{VflP<__0xXeCn(*+*6pQE827#hMeKi5OW!Q{2Fr)S||H6#2Q7Q+g*xKR<@eXR)NWP zpFA$P4tdxsT&kr>U7Xk}7;7wn)J0C-+ zUXTp`qs~?nljF-?y`|OZ`3-+^ZMZv7p3eL0A}mTpxuNG_;a=2v&yIx&a816kq}N9- zyMVkSrTHeFAJ`TtPww*K-+rCK&0Qv#=CqvoK7+q~hG}XHxbc2?PsL*>qMHP;Q+AR} z#8493ZgEh3oYj6I`Kdw3?S;~9eySE&ZhP}6Fl63u<3`$J?=r@ub_A4UNa}Opd~KYR zqwF!>5Gc;7tE^@;!K)?iXl|VBTD#~f>9AO=f4g2*SHr(a2@mGaG~=2~&8^L~GN)y; z6kvOsw;6a}cCy>LU){E>Y^FlPdur0si}1P(`CWKv=!gkh05B0UAzn=OMUaH8P)8hU z=&K#?`Aaq-dL{e3^(?n9qE%i4SPGa}K#&9-1iMs}_B^4xMU^&Ds=1T)HUGNVSt`i> zuNy+s&TUqQ*L`%;*<$eq`@`20Hy|jbREpyTN6XjFE7Hcu`@!F9st4y8 zMR@YqM5$MNZ&(6vHnnBVXDu0BLl)>q7K-lehbcMv1V_rw+ewgw$PlGlR$xo*Lg5GMXIGJckzkGXhXVwQRDE0(l)`S znhg0wOQw2OWy|5yXUoQ^#aW`t_EC0YP6hM@ZEE`-qc6D{v>%l!9yuPpqREd5-U=Eh zv9~a9H*e6Ib)c1@#h+=~Ov~GQMYhPSIQRWsjXtb2J~@u^#EUEkUn6)&u|-+r&|i4& zh1$ZkX^wJfb;wK%VS_PdsTY(XzNA8?K8J0euh%Vzo75+*Ev{K#UiW$37lLwCVaBu+ zibmu#8Z{qkSVWPad?at+Dn=QB#eVI^P!-rXjRiV!mFsVBH8Ke$qmY=$3i9W~!W@HDPt$x?PrwBP_u7wQy4 z2>Xfr8xRV6U42BM%=4Wv_vAcw3@dP;$=b`=UTk*&pG88#y3?`-$D_`#@6z;tB z*lT+~I2{P0@_APD4UYmj_$W82nW4cXw&~h}gU-Zx!iGc1vOhq_VVuYj$s4Eh5(k0i zcv{NAS?$-lRB383zEjIW@g}+IZ~Kp-L@e=j{iz?LB@*Kq2@6~%BRdrRKQ@Nsq!+Mz zu~gzH90Dt8Ce!-dPzm?u?t~ z)q8kcC51?P{^}N)IUy0}NupJtLC5i-%aG{WC$tRYbSCx~xUAx!0U%_ zFQ;&q8XmjWx7HSCHl66en@s;}ug4~>(JI!%BX17*mZ&<}o2MP6(p%j2|Tq%~_O zV`RdWshFj>#b6;xEN=}a0rr|OaWw_iGUjnHCS2?jLrf$nWSR2=Yt77e>WmBCByFv+ zAES*Ljh>X}|4D96w-t_%Z5_Ygcmv-sFlW=9n>F;%_eg3Dr7*MrPox@7L*w6fRo4W< zzDQrXO|_k*F2$7Wp~wuK4!L$JHJ-!?4%Wl}!x0w2hQ*Zri}M?BHmb%)UM>KpJS7*8 z^q$>9pAYlD)lxKduwTmU`{pIrk=@3dxXR!+$aZMZqIqka`YJnC%Yt?*<;{XSAGd@x%EztI9-^FV)3P|5k;IGLunt zdaMqubST_eq1J^abZ-+1x|E9)U-~et@2rAv8_P%r5*@@ve&z-%AWUYwlZy}U=ky@x0ZT!B?|2#7G z`51`1!$XWWk5{GmKH^;UFthI?o=cb1fz;U5t;nH?459~@W?et*+Gg9dg?(#EgD%<- zjJTsbOpEQJXSZWphI6<@LHGF!H*;n7S&!iF!|l%xFat^AKd4>~j!dyCibP00d>lxl z5gAEfaHl$mmRAVGnI=iF?uZ?z)`N@mW?63Tyf2TzePUIGqSY^qiQ^^BQpn)us=(xt zwj(1dB`d99Y6y{=EK@O{)BRuXW`kFns>-S_{Vt4ZhonfzEF<98sGzZ}KAXEzI@Cge zNBz-^(}4ZQZI&RWsd zF%7I?iBXDiT%#;gUI)+uL_-pT(#mTgkS(YHFD(7_(pyH}mM*!0z=be-8VVg^>**4O_ z<3iz}&{*uh+$z7#6(?>Pj6kBlu%FlC|0bGQr#IeMAH&cin-?<`C`td2w-kcu7Ls^) zhj5ggtisOvy`zyrWvdsPFiwHpA%b9HkYML{Yga(=*RRe&d?G(WQ$a;F8k&KZlz<$p z4!LNSsBzYo^T&ofBdv*#Y4DJmdPX<3rQ{d^*j14?({AQnOkZ~U_FQVjpE!lY`QN>~ zCwD?s#}vx+0l{BArWHUf6ijODrD7{GyW_3fveSm_onRFvGa8#SGAqol>wzOEUzBqa z012}W83>Luzbw$vfu>^}_xmJ{V{*;=fsiH_(OrbPa%Syu%(PlWm(bxI%g&;(uY-_fbl z>fw`V=hc5;(PwZFj1@2BlT(B$Gi3}dNPxfF})L-TXp8eQ-=i}CIE4sx_wtmp{kR>4gpRgRYpK7l_%)j^A z6xt=rqv_W&^rfUIWn-f*GgRO%;B7owHxPrI8LT+Mf0g~#txs)@>I>QgfbEOypx7h1 z{VZkc2^|WHtSZfudnuiy%35{cz&d7_xBQXL9si)c@Mw*yPi>pC_vvl+mrM^t&AYWm z-L{qmk*rL2OA$q}rKxk5_fan6r&P5ri~LWCE=AUcydRG0=Pe(Osp_({vi)lM%F+t= z#E6P7d0zZ3?pgkIYnm}91dI&=Kh!f5gx89g0@1(C$F?F(^*3sQ{rDdg!+wwS-|pm zaxY(MuYLJR$oaY}I!49nT|~(Cr)J!&MgIsBOPqu#a4Gq6Ao1C~TA5s23=&TY6M)*> zye}k!{cvxr6P0{a7m$N96?aIrl#fX!qwOx3)Ke1gxmTr`8pb*Kr32*S`Cl9`C zKg6j#JrR5K;w2WlW1Hp1%>JE_!~tb-8mRq;d6lcRtg?GHUKf`&R^ApeKS=>)?0O!8 zOeL#Wm7vg5V8wvI85k^M1@*No6{Vx|QUCkOA%AHvTnZk(( zC%ReIKWlbm9I|t!90*Dq9_VfR^Qv9Mj59#9B@$gM&IMJIEcWE7*{XXKXi`+z;Qppo z%JU0uMtoRL3es5wPWA}lMvIw3_FANPzW?cu@IXRH=BO0Lfgn9DBK+hKZ~OWG&8%%A zPN)jP=b4>O_&ix^D64(IpPsA){nveJIYHOe?#XO6@!RJJbdM9TUGHYm9~WF$FBzk}>5=R_rSE^dhO5)wgm&txy6A-fY`__jjn{DNN4 z79LhF4MA-5YIGc(An=DzP)UDsbrs);(~Ce7GFy?037j%WrFzAklJrkA_PeR179J9_ zD9DVLEC_#S%p2{Ep`EcogCG=d5|IqGEix<&>Npm_06}wvi!FoeSrrPEBMRBRj$`tLR)kc2Hw4v$14gEA{L2 zr^ubESy!*xq=aVj>NRI{)MnVSmY?&f(Yuy?dG^pyTa%JJmgm4$_pD{MhO^@MsExfP z<0Pv_(5{9$HDp#_UAej>b4p(?BUseEC3~{KBQm44ssRf}9)*FYNcdl<%&Rip)lb9p zN9if1F4&mLN?JsSf3amv;3r0PkBJiyLpz5ps;@Vhp?^9a(}Km?E{4A&ep2Q2A%hkg z!Kl~~ih-iFp+%Qa*a-uxNB5C58Up6fq8z@hE}m03jL7GNtau<;{aQDgnYjq)?I5t2nK^8v!V z^W1p@`*n*hip6z{@qz)K`aklm9kz93)qm~dTCq4oE+UJIBKUj{{9gu{b%amDCe^#2 zJvD#l$n$ggD!bfdGjW(zd=UeHO@)sc*))Ry4npFr*D&`nBfk=IL`GT%q4^?bOI8L8OX}dLT)-#{=!3#d!AWNVJu1lK zqhy8XV$We8qi|u&C~7UN-UJiNNA+Y3RtQNX3JXRLu!gY$*kR-VRu~kZXSNLkZVN1M z3qIxL3)p~p5u!@K;Fk!XlP-`-z21jA^t;k)4d($LI@f7yVw8Hnsn5TB@O_vYl``|o z-+23UfC>h{PeyqMpAUobx+=w;lIEnP);O3bVkiIx0SYw%GV$Xk^|h8O-B|-(@wm9c zoyK3reKqGQ1}ns&xd3V~tg>EAArZ=gKnP+;y2U_jUQm8UdJvmO+i9v}Wi~E^ zje7helkUX^NE-qicI5!3VC*tE>^yvSMK#~ny*pym81R3I60B$dlf(s5Gu!xAxA>^=psY}G`4mG{IG@s8m;Lu$*jdt zXlogadTU|Su5~EpH|qv$L+d6S6p|L_#QFpNAo5S79}-BSZ95TZZQGjWW}``-W=mE6 z+n7ECsN-?FjtR=e1f+;69StTCm^Y2mX2s;E-vs3-(WzUrRZ(!E_F!^Ix6Hj@391U8 zLl1+YVyoSvkW6m!_?^p{4`gzSbrbpbhza_FD;qIM)R72Pb*6H>xuRnO>uf{TNc=TM zM&QT)d_R6^JtJomllUF0@mcm+*H6nzsK@;|OV0%T?}ZnqhzpnHjh2HuG`Cn?F4+J_Y-~e9ioe z)_bwbs`jkipw8a;=as4I_G{-tJ(}!T?_I(+yQqmWBLFVv;EMi&1i(uqNM&xIeYcDVGEIreg&=Eg*uG3>j~6ve8c_9{X00 zTEVc^m$BV*sut7eA)8>aPmf~uyF=O7xSKFLC>&Tqnb5u(NkFVFI7lzwO&Jwo-0@|-Ox+NbY5ifFK}MC6>oLIy8JncUF%b=QND%;Ei&a-U!ydlKC~ zPaZ-5HXBG!jQZoBxDpf!$d2N{0025Z{q#|Lwr@qDBJq_H{m`poLzN1;@j;gw6oLkD zG-H4Kk86Dhn<&ea{S3}vn2oiNQ7?wt9j!%W+(fIND^ajj5wUtgl(t9cJ z4*R`(Z4h{SIQ29rK#3^S4O0=WC2Zv6njz1i(Ku)jPy;)UiBCEXBX${u7D+05w*iOb z^NQfH-~08ze)W?{FK`E1uoNaP=Uz|{^N;YfeY>`G**SP~+cmG|y{D8MzYk{8BRdN^ zL7j&@R2-;xE%CkxaM}uJv&g%aPnbS@%)9^H%c4qD8Qq_Ci>^i}eK!?ug@`f3SXCq@ z$+?CFK$V&aTyj_h5FSDfH@9>Mmwqd)t_dKP4{kpK1fsz)-0~~8-vE)4B6VX?ZSOm0 z{9cC3&C%g`;6+y4hS2bEHLZVBQ(rVPO;A&+jB-88tm6`S$rv-iDzWrAu&Bb{wgCty z|4K&b;ecKGNGovf!c6eRFHj7oO(%cTV@!ev&0HeWtnNVAamwf5XOZX!5$b ziRslGU5HxMi|BkO{LC0lm_B>{2c7r1F=fBKNw?OQqu7s^$q&Pq7<@eV(h2M&&YG@T z6%A5Abh1KDy`z=3R3J?gUNV}};8?7E9$Ykq2oVT9VsYJzZK zO-fimi_vkUUhg?g&qO7S9uLq!$QluY!AGm_vhp1*M;t$fvIfhEkYlF+fF(jFcnU6t z6r>p-#&R4Kt=X=p5&(>v1vo96^V;#@Ss#^{!)Yu7*E%j`mf%Y- z`1*~77H{@%J?~sEuFJEpQDW@6LO?zmLe$U%qIA2C3=^i?3;@$ebOYt*u_>@AXt_r% zUu7QQ8HG}{ntHTUUk?_YOVEsW7B}sW{0b_XA10Uig`_mJ9W8C> zmgW+S$Ez``xuPM3mIlmH38TGWNo6U#%{Ps$Et;6@dez7iEKdt1e-`Czg>S|%ChhRp`SZ=9{h zl=X{@lc;R;DvfFL9#(V01)k=t8Pji7JfW=WjB9r{zM*_mwV&pEy!OpxFHT=s+4;?m z6+!2H#Ga?oubzUX0#_rGHr;*Y7*e=4kz%j=5ez#9MoD65B2Ub4i>(@$V;|$>9OoYw zAMnMpy%hkg2Sh4OeXSV8Aju9|`4^wZcqypfF>A^Jw9pxWiV0(>pwt9i0E1A`K2(Sc zV%CS}vxUnTRTS0nDr1H-D!Ns~P_Q4SVc)31;o;@O!$v5AoVGow;-wt3*OfNY({dg| zci&3?%y28XtE>|U{eBl0dzxb!WuKk*YPq>ZOEgWtht%=@GMF^JidE;hSVpixD~Ijl zQI^84o{L6PzM*%Q22`YJL1^Dc;bHmRmE$H4etQ;P(B8t$jU9yMVSE`Yuqb32nSg3Z;peA;FrY@lE)@zSV)@TdJpwg}s3PFxS#lgrXp{9SO4L zIia5zpcV&?w7GL%DwDdu^DfdvvI;7ic{AoP_LZTY2T1; zQ*b3`%Q7e!aD8n6jT((c|8?_J+Zp_|Cn!vpIg7 zE{LIO`^(#aDJh;{<~-4y{^Abo{BM8RJT#X!_O$zxOQw@&Nt)@h+nSy0{coP~DX&IT zYWz40woEXLcX*?MV(r*BDN6!;fQ^gc{9EVm)_hbp!G&WGf#19X6ne-BF@&5Fe$fnJ zQh_~Ath!N95JfhY?uBc44QbLE6lXBc&x0*^qK2d`(UKQn--y>nz)^z3RqaM|wRBpkaN2Qx0vA8i!fcy z)P80v1pbjQiCd^z0dk7kb;Q=ILXeQmo4@?t$U*4WxP7-c_8S*ql==PLva#$&JA#Uw zlizIsQI&05B|Mr;s$Ra-VRT;z9=ePI&>P$Ooey{?c1IwnZb%?6A!dSaES3bqfwykZjzq zzgj|ZEK-En%kHzDkjx|u-Ki2kkPzbBm-LVJhL~hM zSe5WfuC;An9}90|75S6&%=7DVi|)tUlRZefQ{bR^#gnauN6_<#qciLMwix%p0vB}Q zFGKQ4^`N5(EL$ISNYq6Y>0F}#tgr{SzKnqtK@Ao3}Jq+`;j#jU>V#v2CHts*{87%9H!TPJ*q z=alIWI`ALwZFv9M2e5r08|wA6Yj{JhvBUZeT=NeaJmYtr+7wt#H5oMyMVbdA$t$Q)}<( zELc&DE)|{Uh|1pLOhr`9IgBrrmg~wj-cc^EBs7FrHFa@NFK&e5hT=OGtNP*qNGf6) zI0`R8XaY5`5d$f+sAkKKso%kCdZI@sacEm`B=~i0NyXGQ- z4o=+Mn>{o3ng_1#M?_CVk?w5~=~V8@xPDkpKi=DzI;G`ruZd@9$Uv7|# z&8o@1!H0b;nQS_pH1-C2>-H-`yrmA?RqZt|f3h5~_h!D7YWMwoCI6}Gpr6TN>c6~h zLg6d8=)Gkxj%`6n!z-B!pAM0-Dv>#E9n0CYL(a4xeJQeUCqC}Jd#VZFlb&nN$5_mf z%hlezwPht|izUZJMDhSgaIQ+gc+}OOsO(0HBNrOqMp9UoF#_)3fM`2n1H(RVqkY?jBh&wsX>-g5NVD%S+6}-%551Uq!_Xu$~8pOT&G@*pl&*^&kL>t-h zFQX*)ew8&52qF+``EVS(Oot7y#yzEA2R$c%YN)aUoB;W7-be}u%SsJ`o$j4Z zde89*&zT{SjB?dBht8zgxJ>I*m*|6kdEIV6F1Yc1-VZ9BEXA0^IChOp5+af~KX!(1 zNk)NMS((siy;{?igY}22J!#Y1XKih8iStLQBMN^Nshh}y_(XRY?jSn)&;k+~+(0$x z6k~{JQi*spBMqGz1yqQIQ{=g{s5_$$Q0^+O+V>q}&Me%fb@kf$Ezl1!>Y8nse%I-LqoV3`&`8kgq-?HsdshB&~@|OwN8QY2Z@aoiulzo5wXQUK;F*#&#t*Toi|g8-tQ<=XcT4z*kfAK21v5SS?GH~@VSd_c zMEC&k&a<_WaC4{D86`9*#ZBem^AHYJj(1$+){xxgaT)FnCm-xX}f zUp~PrM{iIAMbB}6xD=_-sqm}lu+s*WC2oCkd4wNQ03p@n=V)eB>0MMHP+GJMGwazY zv^%&*N-CC&SwL1)Sxe|dN5{|#Baz7UGYCZB=@TUK3|Uf9;|*ghvl+zLrX71{aHApt40d{wd$~YHMRMw#6u~CUM?M#R3N0i; zbd-$ezq&eaENIkbLms5pctS$gwj1{vdR!skfS8BUz0bU4LQ zNR@x)LgU*Vq6v!{5ub;E@kR@vW~zm`-$pm9U*{z8#u)^)nk!K;t_1p5kK4TMh**?j ziD=n1^_}_#F#S3YD1y~7q0%@|%>m!WyR){tT7d{NVcg=hVKaR5_vT<(NfZ2rTkr#h zuz$hk%OmQUH0r?3$b1@0l`uxtSzfu^IjtqBS;J(%rQ3!}j)fJUJ}T?WDZ>tNc_Qx* z?D3guf3estKg(A5(k_#!G(}jqkh;2sx_(V<&vg=777O<`eFo#XXp4azA3ly~M`r)s zwS0OsRR4tvNtwDOJM^+g*4(O@O$q3E^K_ZpHhmbs|9F^IBT|jcHRL+kU;{Kxdl0L(>(p={X1RjohI}_z`3?3b`lLUka;3`vBI?rVQ~5O z{8@ln%l)G{-FT-n2u|etzjePNffR-86H@%+?kc>aa%&O=^}9^QT3W6E^oE=Ebbk&+ zz9jEIEZ%_#8DX;RGSzCplNL*c=;=|m1^+?mxw4W-$M$)dDMv4NO6?q?Aef9A=OUE6 zLKGkoBWYA9FP#>MTQJ30#1$0f&*JLF#ePt6RT+E+OS~ost@i=V%0FxXkmeq14?mkG zN2-tEyT(n}!YD!Miv4h>SA za4~+k_Hr?_o|{{T1!177>zgmT{yV1gSkbF|-u6?^yc_5>tH9CCZoqxb7%P=*i^3t$N4Jx8Huj(%lO3u5tNQw}s&f7v8sg+#y2hfgI3ZG)*}g_v*pcj1KRK z-PJ1n`UfSgP&M7?fi%+8Qdhace0FWv=Yw55E7yM9O&IFzqod71_`8DIy#6+@ep}(E zCrxE9+BeHDFZ6i6j(wnqtpy>SO{E}|M14h&-`s48_5-A(pg4q}mOqTv0ww513X36? zZ;`yxCnfsq!zX7dL)_pT9pt{W$9Fmj${CzTgnFF~Jk3EyT$OcnJAR`Y;_^c8w+RM( zoV&6vKb}}2Rm7c&trqHc)5ClbFdQdaCKYFS(bZfnF_gTgc)fx!7fdx)7N)}?r6!NU z3u0R|ggzm?D*&la?vZ$*bTD!fAMf#KsGro z-$Dr&Ko|p11BPXb>5z+4)T^8Jvf(~7S+$l(X=>)2&ry|Kkc8RRUm1|$+>5E*%_Zi| zQK`x_*JH|NCB=D};m%<>kV>V6@9$W~3)(xYEte$>gfmZ~fu>lUVnedhK{!>U_gEss zo;E{2EZfyOv&c7#^#S8sp@$!T@v;(syw0{Vsy2^R)t~{wtI<41**q@!_D%)UTg2#~ zGaG>x&i1+59wR2^C$0b1?k_Ljf?aw(91zdJR6Qj{L{mMWrc^{+gZw0f_oH|W@XKVw zc?3xb#iHp?mWC`>mfLJ+QPx{p0*>#X_F4v574hEr#5$+=d<9aMJE4<k5Jblr*?*#rj zt?T@wt-^u1P9#5Vb&l+^IXtS@wJjlnKXFqzmg};**Sl(;q&RxBm#^V-Jau!O%Qn#E z>jUlZ3Zq%hs9G{;LMTbZ!DR;Ghczt@AAo|Ez@zZzlF~gu5dIE<4I|DL4`KDEgQF@^a#p}WjLW>6s^?!y#LwU|dgdkOjMz40|Lfm_P6 z{OMJs!b)I?%O9bcAf;jo+0odoU>v;R82+~nMie1_;`XUyAVP!4mCi^|bE~L-M2G02 zc{}QpK+65#3dczex*xwfxa`w$cVGl&ms>hy$K&c$N$u7tWw#rq8qmpm1zIAJDotrf1rYr@V6{aY8n0*|NShC4baOF)|2tc<9 z2b5PJ3xbI)u6$zgM~36o+Z2dRWM3bRCD=lFbwIajQ{hYB@nFfu6Dzq{g{Wd%%ca~` zx`-Q#z%!G@xOjX}?3U^m)-N*yQZDd2@w)N#z2$S`2B&F;L1bSH?^6ww!1VXb%RCVTDmz{HRGJ1SB!VWGJzCJf- zQhl~weHbwIMR>tCd!bw~K7~DtSi%&@!u;1Hx778%C4|a`PM5CneJ$oxkY{tI z!q1Rd%bY>(KD`eJ!lAnq{`&0#zhy^6SY9F$_Ih!y_Kbum1U*p67IYKL_y|ZATT@M# z2Wsa{T8`r2&>t^8s}Bwi(Jnhuw~f+!#4b=cFQ2Hhk8o~4HcgMb9plflMatvJ&c7D( z{TJ)L34~IV+@gWk!rL>abgn+ld-=$v!k-tv-ZW98H)Ij3>#Z^;$O6SewJWUOZ3`L?>dj*of!*H$3nA4J5ei-=>!yEDjF z&6D@ueOx$u=-QR58VenI_`_c5Mov1&KJ%^Urb7f$2w?J!FT^I5kup|cdYZ@i7(Qwn zt^~phCDvj<_DvK~G|S1TbF-vdT07n0=@)gFI1> z?+q;@A@{DcyHYahCf^uNj}}?gGp4e3H+BT9e5)QAK!XIM5Hn{e{_g#?okwnY$jjU7 z@27;$ij`S#2kp?@4A1p%cONjhPXQDHSUqwH8qRYC`plj*uH`FY)}B1;N_AJi$bl|_d|Nns{AL+<)O5dAZP z{Qy5(G}&r9tj4mE2`pMNKAHMiPE8^O+zif6EORPLHc5rMaE^OC7c`1aa6+rc>$`TB zOt;2Gi%zrcigojUQ{-wKeE%MFzIOoD{mCtx@~V5vweHR2Fq|Z$K5I^4_xDnSg2>Zf zEP89Dogq9-;vH_Aiq7q;k8ZopsVQnNeMqH$v0ivM+h{&8KEl3vpgU&=_;TuH#z_|& z>OhTSry@Y+MMZ?;a;Pv$426rM6v+1JyLImrZ4Uw;2S+84IyfXNf?YXAFdDoAfbAZoC#>0c7K(cV|T|HkFOU2Vs`QRil6IUz%-C%X5HK@FEdj3g7Esfa<8RuHgtX9z!a zpd(vSs4_xSbMj@=9LB*b_q=@mkamr&nLBwX;0^>+E4ScMILKzt)+s}rNdb<+?KLE) zaKS|5vmxy8^PB@s{pdls%Dp0rZ&$yqRsGKGvv7F)SYd5<`Sr}$_xGEv9lU|;lz@XE zR<7RkOds7WCYj(6RRMI0i2C*;O2>PkZDF;Prv@e@5qN3ij(ShFB}mSt=EYkzw|Rq~ zKZxA+)1L^>Dg+!#?sBH){sVcBX)!Y< z#tUd#uf_0G$Xd7v{P-)f>gvhKE6Gpl3iGkeG=@UYhmoQvWHTC6^>JYGWZ+zK0aJ&a zVP1LC^uAWRexfT*qV`_1TUfZXB*HQRyGPSvb56qjHsM|V^5B9i?Z1Dgo!^?G_U3MX zs#`wdjQlJS<>u^aq=CL5Zx0`+%ar=??YK(HfKmThnIC0J!|4~fFzpcz^ObYOv3HlU z&Un$~1RRM}GwCDkJVTew4eA3IPJ#f5Bbj(Dy|i=>Jfiei2Yg9KXc}PkKKpaM@82&NzgVTq02(8_^Ic&BLoiL z15;5S?t{!|;^(D4C713~2};WE0A5Ga>D$Q(nRkw&rLzv5J(mD%J2evM&m3$2<(Yh# ze|%OEuf@Xc%TBk+pTXFpl+nyV4ALNcM*^e0oKp^$vg%CJ`U?`eWAn`H*<`cLbcOjW zd#q<$rMa`Fva7pY^QjB_u|APzeD;cSmG#fbv=@dcY$_;e!$vvexk!p2rZK&TJ%!{^ zo`$7TZiOKS{_#Cg{Pn9ZxGcCczWe)v%pp+5(nT%fmZ?g!V5h?jtTg#_Aj6mTf)! zCw4bvmNgzL4J?*=c6N7$H1XzH(YH?~{m6&*$ED@zEsKmU{7sZFx&76(tmiuuqw(_o zxco7F>&bc3O5<&#FV_MO_sXJ`1zo|Roz6mvKSb!+js$7<)iJr=XrH}BG%i0g@M~#8 zvg=AU%zlFTIY1+dL=LQ8p=Kn@~WlpnZx-RTJcY#VPYjYvPSSq6O)8wlJ|56|Ort>}REgCsinyfSCW$%P3dcnPy} zj4Y$x73S8yEcs{i4xiin9lwpvsuj^HMan8&AtiIR`x-0dlAN(DfzOm`X$WW5Jw>J>lJ<^_5*ZOqPZCq8N^o9X_B;|8>T;(56S(hOd0?@N zQyKRN_6&3e&Rmo#tiWl<&QnsJJ@XY0!k|W^4DL+^iR>kjNqG9ZAV((xaKvqfXLWJn z3j4-?zQK1D?qF-5XJ5|OVW&y#5+vN@5d%r z%XrB8oE*k|@xI+3=p3U9oA(}OYZ|K@zTX@t%!H)JRtY8)4^}7X8k2sG7CXB?>q_L7)HsQPoZBD7EX2L9{b%zr zB<8Q#UEDZ4bdIIV{#1Q?mkSHoShmFOR-D(YiwNW|YY!0=BbmwA6=~dMuDFaeE=&q? z!V1$s6K2 zB~KTTrEEMw;cHvFWsj?y?kg=n@}ab+W8`q3`_XAYU8tBl=i%Yj-;4u$ z6I!qAVRag1QL%!1x%&2+1#kY@ybn3=w&=D`+N_Z>@Eq&(|J9|2+A*QiQx!$Jk7N^)}+D|x(Ji0 zg+@F-be7KlwYQ$R`!MUM>~=eTzoR1O@JDrArxN)Iu{QJh&FRHr{m#2qL9s#IA_$0u z{#YPYkae`oTg@)LV4jPIOepKYuOnUIbZxD;jlGG07s8(#_=Q-W{O226`~qA5{299c zIP+Y*pJpyi-WIi1v+Dl^0k5}Y`P0!2{Jav^1u1y@%nEMqmc4G<{F^oL&9(D1<)3Ef zgWNpr%Qru6TawRycbJYWG6XGfI|8)aJ!4I0Q-elcbokEF$wWrj;Gc`Wyq8YW=bU7T z+)7o|!S7&Mx~W?lY$fUab0Am6Ml$3&Tkm%iK7Bu-kg>tWKALkSQ7le8 zWZ>OAop)wd)}{J%EyPgS{N@;v0*-1Y90KMXq^jE0xOhL zs`W_Y;41eklQun-hmp>UO$=clqcv7gaEFg=bUOw8njCop(YnQz@RvnP7!_nX6iYdkbyvpe6A{69V6|NkcJIYa*e4i4Tm&fWN5 GUjIMKg6h=( literal 0 HcmV?d00001 diff --git a/onnxruntime/python/tools/transformers/models/whisper/test/whisper_ort_output.txt b/onnxruntime/python/tools/transformers/models/whisper/test/whisper_ort_output.txt new file mode 100644 index 0000000000000..e3dbef248d0b2 --- /dev/null +++ b/onnxruntime/python/tools/transformers/models/whisper/test/whisper_ort_output.txt @@ -0,0 +1 @@ + the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators were trivialities not worth thinking about diff --git a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml index 65866fc9827a5..43dedbc394c38 100644 --- a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml @@ -314,12 +314,111 @@ stages: pushd /workspace/onnxruntime/python/tools/transformers/ ; \ python3 -m pip install --upgrade pip ; \ pushd models/llama ; \ - python3 -m pip install -r requirements-cuda.txt ; \ + python3 -m pip install -r requirements.txt ; \ popd ; \ python3 -m pip install /ort-artifact/*.whl ; \ + python3 -m pip uninstall -y torch ; \ python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu118 ; \ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-fp16 --precision fp16 --execution_provider cuda --input /meta-llama2 --small_gpu ;\ popd ; \ " displayName: 'Run Llama2 to Onnx F16 and parity Test' workingDirectory: $(Build.SourcesDirectory) + +- stage: Whisper_ONNX + dependsOn: + - Build_Onnxruntime_Cuda + jobs: + - job: Whisper_ONNX + variables: + skipComponentGovernanceDetection: true + workspace: + clean: all + pool: Onnxruntime-Linux-A10-24G + steps: + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() + + - checkout: self + clean: true + submodules: none + + - template: templates/flex-downloadPipelineArtifact.yml + parameters: + StepName: 'Download Onnxruntime Artifact' + ArtifactName: 'drop-ort-linux-gpu' + TargetPath: '$(Build.BinariesDirectory)/ort-artifact/' + SpecificArtifact: ${{ parameters.specificArtifact }} + BuildId: ${{ parameters.BuildId }} + + - template: templates/get-docker-image-steps.yml + parameters: + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu + Context: tools/ci_build/github/linux/docker/ + ScriptName: tools/ci_build/get_docker_image.py + DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )" + Repository: onnxruntimepackagestest + UpdateDepsTxt: false + + - task: DownloadPackage@1 + # The model data in artifact is downloaded from openai/whisper-large-v3 in huggingface model hub + # In order to save size, removed .git directory and pickled files, and keep the safetensors model files + displayName: 'Download Whisper Model' + inputs: + packageType: upack + feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' + version: 1.0.0 + definition: 'b583ce7c-1a8f-4099-ae28-5d5f56c478b1' + downloadPath: $(Agent.TempDirectory)/whisper_large_v3 + + - script: | + docker run --rm --gpus all -v $(Build.SourcesDirectory):/workspace \ + -v $(Build.BinariesDirectory)/ort-artifact/:/ort-artifact \ + -v $(Agent.TempDirectory)/whisper_large_v3:/whisper_large_v3 \ + onnxruntimepackagestest \ + bash -c ' + set -ex; \ + pushd /workspace/onnxruntime/python/tools/transformers/ ; \ + python3 -m pip install --upgrade pip ; \ + pushd models/whisper ; \ + python3 -m pip install -r requirements.txt ; \ + popd ; \ + python3 -m pip install /ort-artifact/*.whl ; \ + python3 -m pip uninstall -y torch ; \ + python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu118 ; \ + python3 -m models.whisper.convert_to_onnx -m /whisper_large_v3 --output whisperlargev3 --use_external_data_format ; \ + popd ; \ + ' + displayName: 'Convert Whisper Model' + workingDirectory: $(Build.SourcesDirectory) + + - script: | + docker run --rm --gpus all -v $(Build.SourcesDirectory):/workspace \ + -v $(Build.BinariesDirectory)/ort-artifact/:/ort-artifact \ + -v $(Agent.TempDirectory)/whisper_large_v3:/whisper_large_v3 \ + onnxruntimepackagestest \ + bash -c ' + set -ex; \ + pushd /workspace/onnxruntime/python/tools/transformers/ ; \ + python3 -m pip install --upgrade pip ; \ + pushd models/whisper ; \ + python3 -m pip install -r requirements.txt ; \ + popd ; \ + python3 -m pip install /ort-artifact/*.whl ; \ + python3 -m pip uninstall -y torch ; \ + python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu118 ; \ + ls whisperlargev3; \ + python3 -m models.whisper.benchmark \ + --benchmark-type ort \ + --audio-path models/whisper/test/1272-141231-0002.mp3 \ + --model-name openai/whisper-large-v3 \ + --ort-model-path /workspace/onnxruntime/python/tools/transformers/whisperlargev3/whisper_large_v3_beamsearch.onnx \ + --precision fp32 \ + --device cuda > ort_output.txt ; \ + cat ort_output.txt ; \ + diff ort_output.txt /workspace/onnxruntime/python/tools/transformers/models/whisper/test/whisper_ort_output.txt && exit 0 || exit 1 + popd ; \ + ' + displayName: 'Test Whisper ONNX Model' + workingDirectory: $(Build.SourcesDirectory) diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu index 9b9dc9ecae822..c9038afc0954c 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu @@ -16,15 +16,18 @@ ENV DEBIAN_FRONTEND=noninteractive ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH_ARG}:${LD_LIBRARY_PATH} RUN apt-get update &&\ - apt-get install -y git bash wget + apt-get install -y git bash wget diffutils # Install python3 RUN apt-get install -y --no-install-recommends \ python3 \ python3-pip \ python3-dev \ - python3-wheel - + python3-wheel + +# Install ffmpeg, which couldn't be installed in UBI8 +# https://stackoverflow.com/questions/73597789/how-to-install-ffmpeg-on-ubi-docker-images +RUN apt-get install -y --no-install-recommends ffmpeg RUN pip install --upgrade pip From 430a086f22684ad0020819dc3e7712f36fe9f016 Mon Sep 17 00:00:00 2001 From: Yufeng Li Date: Sun, 25 Feb 2024 08:50:45 -0800 Subject: [PATCH 49/68] fix memory mapping on Windows (#19623) ### Description Windows memory map casts mapped_offset to DWORD directly. It will be truncated if it is larger than 2^32-1. We need to set high dwFileOffsetHigh for this case. ### Motivation and Context The bug was found from #19450 --- onnxruntime/core/platform/windows/env.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/platform/windows/env.cc b/onnxruntime/core/platform/windows/env.cc index 0eb34cbfbc9eb..983cc6089bb4c 100644 --- a/onnxruntime/core/platform/windows/env.cc +++ b/onnxruntime/core/platform/windows/env.cc @@ -459,8 +459,8 @@ Status WindowsEnv::MapFileIntoMemory(_In_z_ const ORTCHAR_T* file_path, void* const mapped_base = MapViewOfFile(file_mapping_handle.get(), FILE_MAP_READ, - 0, - static_cast(mapped_offset), + static_cast((mapped_offset >> 32) & 0xFFFFFFFF), + static_cast(mapped_offset & 0xFFFFFFFF), mapped_length); GSL_SUPPRESS(r.11) mapped_memory = From a9568935a52b3d51ec802a4ab89ab3852129fc1e Mon Sep 17 00:00:00 2001 From: Sumit Agarwal Date: Mon, 26 Feb 2024 11:35:13 -0800 Subject: [PATCH 50/68] [DML EP] Enable DML Graph Serialization (#19505) ### Description This PR adds a feature to serialize all DML EP partitions into DML currency individually for a given a model. This feature can be dynamically turned on by using DML EP option `ep.dml.enable_graph_serialization`. ### Motivation and Context - Why is this change required? What problem does it solve? Useful when user want to capture the DML EP specific partition into DML currency to mitigate the dependency on the framework. --- .../inc/IWinmlExecutionProvider.h | 7 +- .../DmlExecutionProvider/src/ApiTraits.cpp | 570 +++++++ .../src/DmlGraphDeserialization.cpp | 554 +++++++ .../src/DmlGraphFusionHelper.cpp | 247 ++- .../src/DmlGraphFusionHelper.h | 19 +- .../src/DmlGraphFusionTransformer.cpp | 41 +- .../src/DmlGraphFusionTransformer.h | 4 +- .../src/DmlGraphSerialization.cpp | 580 ++++++++ .../src/DmlRuntimeFusedGraphKernel.cpp | 30 +- .../src/External/DirectMLHelpers/ApiTraits.h | 453 +++++- .../External/DirectMLHelpers/DirectMLSchema.h | 112 +- .../DirectMLHelpers/DmlGraphDesc_generated.h | 788 ++++++++++ .../DirectMLHelpers/DmlGraphDeserialization.h | 14 + .../DirectMLHelpers/DmlGraphSerialization.h | 8 + .../DirectMLHelpers/DmlSerializedGraphDesc.h | 73 + .../DirectMLHelpers/GeneratedSchemaHelpers.h | 92 +- .../DirectMLHelpers/GeneratedSchemaTypes.h | 32 +- .../OperatorFieldTypes_generated.h | 1318 +++++++++++++++++ .../External/DirectMLHelpers/SchemaHelpers.h | 54 +- .../src/GraphDescBuilder.cpp | 404 ++--- .../src/GraphDescBuilder.h | 21 +- .../src/MLOperatorAuthorImpl.cpp | 30 +- .../src/Operators/DmlOperator.cpp | 4 +- .../src/Operators/DmlOperatorAttention.cpp | 2 +- .../src/Operators/DmlOperatorBiasAdd.cpp | 2 +- .../Operators/DmlOperatorBiasSplitGelu.cpp | 2 +- .../DmlOperatorEmbedLayerNormalization.cpp | 2 +- .../src/Operators/DmlOperatorGroupNorm.cpp | 2 +- .../DmlOperatorLayerNormalization.cpp | 2 +- .../Operators/DmlOperatorQLinearConcat.cpp | 2 +- .../Operators/DmlOperatorQLinearSigmoid.cpp | 2 +- .../src/Operators/DmlOperatorQuickGelu.cpp | 2 +- .../Operators/DmlOperatorRotaryEmbedding.cpp | 2 +- .../DmlOperatorSkipLayerNormalization.cpp | 2 +- .../dml/DmlExecutionProvider/src/Utility.h | 141 ++ .../dml/DmlExecutionProvider/src/precomp.h | 7 + .../MLOperatorAuthorPrivate.h | 11 +- .../dml/dml_session_options_config_keys.h | 1 + onnxruntime/core/session/inference_session.cc | 9 +- .../test/perftest/command_args_parser.cc | 1 + onnxruntime/test/perftest/ort_test_session.cc | 10 + 41 files changed, 5203 insertions(+), 454 deletions(-) create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/ApiTraits.cpp create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphDeserialization.cpp create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphSerialization.cpp create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphDesc_generated.h create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphDeserialization.h create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphSerialization.h create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlSerializedGraphDesc.h create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/OperatorFieldTypes_generated.h create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/Utility.h diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h index f29cc3afc3cda..88e3dd487d427 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/inc/IWinmlExecutionProvider.h @@ -80,15 +80,10 @@ namespace Windows::AI::MachineLearning::Adapter }; // This is the counterpart to the MLOperatorGraphDesc ABI struct which owns its memory and uses containers. - // Either nodesAsOperatorDesc or nodesAsIDMLOperator can have non-zero size. struct DmlGraphNodeCreateInfo { uint32_t nodeCount = 0; - std::vector> nodesAsOperatorDesc; - - // TODO (jeffbloo): Remove this - std::vector> nodesAsIDMLOperator; - + std::vector> nodes; std::vector inputEdges; std::vector outputEdges; std::vector intermediateEdges; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ApiTraits.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ApiTraits.cpp new file mode 100644 index 0000000000000..bf9800458102b --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ApiTraits.cpp @@ -0,0 +1,570 @@ +//--------------------------------------------------------------------------- +// Copyright (c) Microsoft Corporation. All rights reserved. +// +// This file is automatically generated. Please do not edit it directly. +// To modify this file, edit the schema: dml/Tools/DirectMLSchema.json +// And run this script to regenerate: dml/Tools/GenerateSchema.ps1 +// +// #dml-new-operator-location +//--------------------------------------------------------------------------- + +#pragma once + +#include "precomp.h" + +template +T ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ +#ifndef WAI_BUILD_LINUX + // Clang will instantiate this template even if it isn't used, + // so this static_assert will always fire and break the build. + static_assert(false, "Not implemented for this type"); +#endif +} + +template <> +DML_TENSOR_DATA_TYPE ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_TENSOR_DATA_TYPE_UNKNOWN", DML_TENSOR_DATA_TYPE_UNKNOWN}, + {"DML_TENSOR_DATA_TYPE_FLOAT32", DML_TENSOR_DATA_TYPE_FLOAT32}, + {"DML_TENSOR_DATA_TYPE_FLOAT16", DML_TENSOR_DATA_TYPE_FLOAT16}, + {"DML_TENSOR_DATA_TYPE_UINT32", DML_TENSOR_DATA_TYPE_UINT32}, + {"DML_TENSOR_DATA_TYPE_UINT16", DML_TENSOR_DATA_TYPE_UINT16}, + {"DML_TENSOR_DATA_TYPE_UINT8", DML_TENSOR_DATA_TYPE_UINT8}, + {"DML_TENSOR_DATA_TYPE_INT32", DML_TENSOR_DATA_TYPE_INT32}, + {"DML_TENSOR_DATA_TYPE_INT16", DML_TENSOR_DATA_TYPE_INT16}, + {"DML_TENSOR_DATA_TYPE_INT8", DML_TENSOR_DATA_TYPE_INT8}, + {"DML_TENSOR_DATA_TYPE_FLOAT64", DML_TENSOR_DATA_TYPE_FLOAT64}, + {"DML_TENSOR_DATA_TYPE_UINT64", DML_TENSOR_DATA_TYPE_UINT64}, + {"DML_TENSOR_DATA_TYPE_INT64", DML_TENSOR_DATA_TYPE_INT64}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + + +template <> +DML_TENSOR_TYPE ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_TENSOR_TYPE_INVALID", DML_TENSOR_TYPE_INVALID}, + {"DML_TENSOR_TYPE_BUFFER", DML_TENSOR_TYPE_BUFFER}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + + +template <> +DML_OPERATOR_TYPE ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_OPERATOR_INVALID", DML_OPERATOR_INVALID}, + {"DML_OPERATOR_ELEMENT_WISE_IDENTITY", DML_OPERATOR_ELEMENT_WISE_IDENTITY}, + {"DML_OPERATOR_ELEMENT_WISE_ABS", DML_OPERATOR_ELEMENT_WISE_ABS}, + {"DML_OPERATOR_ELEMENT_WISE_ACOS", DML_OPERATOR_ELEMENT_WISE_ACOS}, + {"DML_OPERATOR_ELEMENT_WISE_ADD", DML_OPERATOR_ELEMENT_WISE_ADD}, + {"DML_OPERATOR_ELEMENT_WISE_ASIN", DML_OPERATOR_ELEMENT_WISE_ASIN}, + {"DML_OPERATOR_ELEMENT_WISE_ATAN", DML_OPERATOR_ELEMENT_WISE_ATAN}, + {"DML_OPERATOR_ELEMENT_WISE_CEIL", DML_OPERATOR_ELEMENT_WISE_CEIL}, + {"DML_OPERATOR_ELEMENT_WISE_CLIP", DML_OPERATOR_ELEMENT_WISE_CLIP}, + {"DML_OPERATOR_ELEMENT_WISE_COS", DML_OPERATOR_ELEMENT_WISE_COS}, + {"DML_OPERATOR_ELEMENT_WISE_DIVIDE", DML_OPERATOR_ELEMENT_WISE_DIVIDE}, + {"DML_OPERATOR_ELEMENT_WISE_EXP", DML_OPERATOR_ELEMENT_WISE_EXP}, + {"DML_OPERATOR_ELEMENT_WISE_FLOOR", DML_OPERATOR_ELEMENT_WISE_FLOOR}, + {"DML_OPERATOR_ELEMENT_WISE_LOG", DML_OPERATOR_ELEMENT_WISE_LOG}, + {"DML_OPERATOR_ELEMENT_WISE_LOGICAL_AND", DML_OPERATOR_ELEMENT_WISE_LOGICAL_AND}, + {"DML_OPERATOR_ELEMENT_WISE_LOGICAL_EQUALS", DML_OPERATOR_ELEMENT_WISE_LOGICAL_EQUALS}, + {"DML_OPERATOR_ELEMENT_WISE_LOGICAL_GREATER_THAN", DML_OPERATOR_ELEMENT_WISE_LOGICAL_GREATER_THAN}, + {"DML_OPERATOR_ELEMENT_WISE_LOGICAL_LESS_THAN", DML_OPERATOR_ELEMENT_WISE_LOGICAL_LESS_THAN}, + {"DML_OPERATOR_ELEMENT_WISE_LOGICAL_GREATER_THAN_OR_EQUAL", DML_OPERATOR_ELEMENT_WISE_LOGICAL_GREATER_THAN_OR_EQUAL}, + {"DML_OPERATOR_ELEMENT_WISE_LOGICAL_LESS_THAN_OR_EQUAL", DML_OPERATOR_ELEMENT_WISE_LOGICAL_LESS_THAN_OR_EQUAL}, + {"DML_OPERATOR_ELEMENT_WISE_LOGICAL_NOT", DML_OPERATOR_ELEMENT_WISE_LOGICAL_NOT}, + {"DML_OPERATOR_ELEMENT_WISE_LOGICAL_OR", DML_OPERATOR_ELEMENT_WISE_LOGICAL_OR}, + {"DML_OPERATOR_ELEMENT_WISE_LOGICAL_XOR", DML_OPERATOR_ELEMENT_WISE_LOGICAL_XOR}, + {"DML_OPERATOR_ELEMENT_WISE_MAX", DML_OPERATOR_ELEMENT_WISE_MAX}, + {"DML_OPERATOR_ELEMENT_WISE_MEAN", DML_OPERATOR_ELEMENT_WISE_MEAN}, + {"DML_OPERATOR_ELEMENT_WISE_MIN", DML_OPERATOR_ELEMENT_WISE_MIN}, + {"DML_OPERATOR_ELEMENT_WISE_MULTIPLY", DML_OPERATOR_ELEMENT_WISE_MULTIPLY}, + {"DML_OPERATOR_ELEMENT_WISE_POW", DML_OPERATOR_ELEMENT_WISE_POW}, + {"DML_OPERATOR_ELEMENT_WISE_CONSTANT_POW", DML_OPERATOR_ELEMENT_WISE_CONSTANT_POW}, + {"DML_OPERATOR_ELEMENT_WISE_RECIP", DML_OPERATOR_ELEMENT_WISE_RECIP}, + {"DML_OPERATOR_ELEMENT_WISE_SIN", DML_OPERATOR_ELEMENT_WISE_SIN}, + {"DML_OPERATOR_ELEMENT_WISE_SQRT", DML_OPERATOR_ELEMENT_WISE_SQRT}, + {"DML_OPERATOR_ELEMENT_WISE_SUBTRACT", DML_OPERATOR_ELEMENT_WISE_SUBTRACT}, + {"DML_OPERATOR_ELEMENT_WISE_TAN", DML_OPERATOR_ELEMENT_WISE_TAN}, + {"DML_OPERATOR_ELEMENT_WISE_THRESHOLD", DML_OPERATOR_ELEMENT_WISE_THRESHOLD}, + {"DML_OPERATOR_ELEMENT_WISE_QUANTIZE_LINEAR", DML_OPERATOR_ELEMENT_WISE_QUANTIZE_LINEAR}, + {"DML_OPERATOR_ELEMENT_WISE_DEQUANTIZE_LINEAR", DML_OPERATOR_ELEMENT_WISE_DEQUANTIZE_LINEAR}, + {"DML_OPERATOR_ACTIVATION_ELU", DML_OPERATOR_ACTIVATION_ELU}, + {"DML_OPERATOR_ACTIVATION_CELU", DML_OPERATOR_ACTIVATION_CELU}, + {"DML_OPERATOR_ACTIVATION_HARDMAX", DML_OPERATOR_ACTIVATION_HARDMAX}, + {"DML_OPERATOR_ACTIVATION_HARDMAX1", DML_OPERATOR_ACTIVATION_HARDMAX1}, + {"DML_OPERATOR_ACTIVATION_HARD_SIGMOID", DML_OPERATOR_ACTIVATION_HARD_SIGMOID}, + {"DML_OPERATOR_ACTIVATION_IDENTITY", DML_OPERATOR_ACTIVATION_IDENTITY}, + {"DML_OPERATOR_ACTIVATION_LEAKY_RELU", DML_OPERATOR_ACTIVATION_LEAKY_RELU}, + {"DML_OPERATOR_ACTIVATION_LINEAR", DML_OPERATOR_ACTIVATION_LINEAR}, + {"DML_OPERATOR_ACTIVATION_LOG_SOFTMAX", DML_OPERATOR_ACTIVATION_LOG_SOFTMAX}, + {"DML_OPERATOR_ACTIVATION_LOG_SOFTMAX1", DML_OPERATOR_ACTIVATION_LOG_SOFTMAX1}, + {"DML_OPERATOR_ACTIVATION_PARAMETERIZED_RELU", DML_OPERATOR_ACTIVATION_PARAMETERIZED_RELU}, + {"DML_OPERATOR_ACTIVATION_PARAMETRIC_SOFTPLUS", DML_OPERATOR_ACTIVATION_PARAMETRIC_SOFTPLUS}, + {"DML_OPERATOR_ACTIVATION_RELU", DML_OPERATOR_ACTIVATION_RELU}, + {"DML_OPERATOR_ACTIVATION_SCALED_ELU", DML_OPERATOR_ACTIVATION_SCALED_ELU}, + {"DML_OPERATOR_ACTIVATION_SCALED_TANH", DML_OPERATOR_ACTIVATION_SCALED_TANH}, + {"DML_OPERATOR_ACTIVATION_SIGMOID", DML_OPERATOR_ACTIVATION_SIGMOID}, + {"DML_OPERATOR_ACTIVATION_SOFTMAX", DML_OPERATOR_ACTIVATION_SOFTMAX}, + {"DML_OPERATOR_ACTIVATION_SOFTMAX1", DML_OPERATOR_ACTIVATION_SOFTMAX1}, + {"DML_OPERATOR_ACTIVATION_SOFTPLUS", DML_OPERATOR_ACTIVATION_SOFTPLUS}, + {"DML_OPERATOR_ACTIVATION_SOFTSIGN", DML_OPERATOR_ACTIVATION_SOFTSIGN}, + {"DML_OPERATOR_ACTIVATION_TANH", DML_OPERATOR_ACTIVATION_TANH}, + {"DML_OPERATOR_ACTIVATION_THRESHOLDED_RELU", DML_OPERATOR_ACTIVATION_THRESHOLDED_RELU}, + {"DML_OPERATOR_CONVOLUTION", DML_OPERATOR_CONVOLUTION}, + {"DML_OPERATOR_GEMM", DML_OPERATOR_GEMM}, + {"DML_OPERATOR_REDUCE", DML_OPERATOR_REDUCE}, + {"DML_OPERATOR_AVERAGE_POOLING", DML_OPERATOR_AVERAGE_POOLING}, + {"DML_OPERATOR_AVERAGE_POOLING1", DML_OPERATOR_AVERAGE_POOLING1}, + {"DML_OPERATOR_LP_POOLING", DML_OPERATOR_LP_POOLING}, + {"DML_OPERATOR_LP_POOLING1", DML_OPERATOR_LP_POOLING1}, + {"DML_OPERATOR_MAX_POOLING", DML_OPERATOR_MAX_POOLING}, + {"DML_OPERATOR_ROI_POOLING", DML_OPERATOR_ROI_POOLING}, + {"DML_OPERATOR_SLICE", DML_OPERATOR_SLICE}, + {"DML_OPERATOR_CAST", DML_OPERATOR_CAST}, + {"DML_OPERATOR_SPLIT", DML_OPERATOR_SPLIT}, + {"DML_OPERATOR_JOIN", DML_OPERATOR_JOIN}, + {"DML_OPERATOR_PADDING", DML_OPERATOR_PADDING}, + {"DML_OPERATOR_PADDING1", DML_OPERATOR_PADDING1}, + {"DML_OPERATOR_VALUE_SCALE_2D", DML_OPERATOR_VALUE_SCALE_2D}, + {"DML_OPERATOR_UPSAMPLE_2D", DML_OPERATOR_UPSAMPLE_2D}, + {"DML_OPERATOR_GATHER", DML_OPERATOR_GATHER}, + {"DML_OPERATOR_SPACE_TO_DEPTH", DML_OPERATOR_SPACE_TO_DEPTH}, + {"DML_OPERATOR_DEPTH_TO_SPACE", DML_OPERATOR_DEPTH_TO_SPACE}, + {"DML_OPERATOR_TILE", DML_OPERATOR_TILE}, + {"DML_OPERATOR_TOP_K", DML_OPERATOR_TOP_K}, + {"DML_OPERATOR_BATCH_NORMALIZATION", DML_OPERATOR_BATCH_NORMALIZATION}, + {"DML_OPERATOR_BATCH_NORMALIZATION_TRAINING", DML_OPERATOR_BATCH_NORMALIZATION_TRAINING}, + {"DML_OPERATOR_MEAN_VARIANCE_NORMALIZATION", DML_OPERATOR_MEAN_VARIANCE_NORMALIZATION}, + {"DML_OPERATOR_LOCAL_RESPONSE_NORMALIZATION", DML_OPERATOR_LOCAL_RESPONSE_NORMALIZATION}, + {"DML_OPERATOR_LP_NORMALIZATION", DML_OPERATOR_LP_NORMALIZATION}, + {"DML_OPERATOR_RNN", DML_OPERATOR_RNN}, + {"DML_OPERATOR_LSTM", DML_OPERATOR_LSTM}, + {"DML_OPERATOR_GRU", DML_OPERATOR_GRU}, + {"DML_OPERATOR_ELEMENT_WISE_SIGN", DML_OPERATOR_ELEMENT_WISE_SIGN}, + {"DML_OPERATOR_ELEMENT_WISE_IS_NAN", DML_OPERATOR_ELEMENT_WISE_IS_NAN}, + {"DML_OPERATOR_ELEMENT_WISE_ERF", DML_OPERATOR_ELEMENT_WISE_ERF}, + {"DML_OPERATOR_ELEMENT_WISE_SINH", DML_OPERATOR_ELEMENT_WISE_SINH}, + {"DML_OPERATOR_ELEMENT_WISE_COSH", DML_OPERATOR_ELEMENT_WISE_COSH}, + {"DML_OPERATOR_ELEMENT_WISE_TANH", DML_OPERATOR_ELEMENT_WISE_TANH}, + {"DML_OPERATOR_ELEMENT_WISE_ASINH", DML_OPERATOR_ELEMENT_WISE_ASINH}, + {"DML_OPERATOR_ELEMENT_WISE_ACOSH", DML_OPERATOR_ELEMENT_WISE_ACOSH}, + {"DML_OPERATOR_ELEMENT_WISE_ATANH", DML_OPERATOR_ELEMENT_WISE_ATANH}, + {"DML_OPERATOR_ELEMENT_WISE_IF", DML_OPERATOR_ELEMENT_WISE_IF}, + {"DML_OPERATOR_ELEMENT_WISE_ADD1", DML_OPERATOR_ELEMENT_WISE_ADD1}, + {"DML_OPERATOR_ACTIVATION_SHRINK", DML_OPERATOR_ACTIVATION_SHRINK}, + {"DML_OPERATOR_MAX_POOLING1", DML_OPERATOR_MAX_POOLING1}, + {"DML_OPERATOR_MAX_UNPOOLING", DML_OPERATOR_MAX_UNPOOLING}, + {"DML_OPERATOR_DIAGONAL_MATRIX", DML_OPERATOR_DIAGONAL_MATRIX}, + {"DML_OPERATOR_SCATTER", DML_OPERATOR_SCATTER}, + {"DML_OPERATOR_ONE_HOT", DML_OPERATOR_ONE_HOT}, + {"DML_OPERATOR_RESAMPLE", DML_OPERATOR_RESAMPLE}, + {"DML_OPERATOR_ELEMENT_WISE_BIT_SHIFT_LEFT", DML_OPERATOR_ELEMENT_WISE_BIT_SHIFT_LEFT}, + {"DML_OPERATOR_ELEMENT_WISE_BIT_SHIFT_RIGHT", DML_OPERATOR_ELEMENT_WISE_BIT_SHIFT_RIGHT}, + {"DML_OPERATOR_ELEMENT_WISE_ROUND", DML_OPERATOR_ELEMENT_WISE_ROUND}, + {"DML_OPERATOR_ELEMENT_WISE_IS_INFINITY", DML_OPERATOR_ELEMENT_WISE_IS_INFINITY}, + {"DML_OPERATOR_ELEMENT_WISE_MODULUS_TRUNCATE", DML_OPERATOR_ELEMENT_WISE_MODULUS_TRUNCATE}, + {"DML_OPERATOR_ELEMENT_WISE_MODULUS_FLOOR", DML_OPERATOR_ELEMENT_WISE_MODULUS_FLOOR}, + {"DML_OPERATOR_FILL_VALUE_SEQUENCE", DML_OPERATOR_FILL_VALUE_SEQUENCE}, + {"DML_OPERATOR_FILL_VALUE_CONSTANT", DML_OPERATOR_FILL_VALUE_CONSTANT}, + {"DML_OPERATOR_CUMULATIVE_SUMMATION", DML_OPERATOR_CUMULATIVE_SUMMATION}, + {"DML_OPERATOR_REVERSE_SUBSEQUENCES", DML_OPERATOR_REVERSE_SUBSEQUENCES}, + {"DML_OPERATOR_GATHER_ELEMENTS", DML_OPERATOR_GATHER_ELEMENTS}, + {"DML_OPERATOR_GATHER_ND", DML_OPERATOR_GATHER_ND}, + {"DML_OPERATOR_SCATTER_ND", DML_OPERATOR_SCATTER_ND}, + {"DML_OPERATOR_MAX_POOLING2", DML_OPERATOR_MAX_POOLING2}, + {"DML_OPERATOR_SLICE1", DML_OPERATOR_SLICE1}, + {"DML_OPERATOR_TOP_K1", DML_OPERATOR_TOP_K1}, + {"DML_OPERATOR_DEPTH_TO_SPACE1", DML_OPERATOR_DEPTH_TO_SPACE1}, + {"DML_OPERATOR_SPACE_TO_DEPTH1", DML_OPERATOR_SPACE_TO_DEPTH1}, + {"DML_OPERATOR_MEAN_VARIANCE_NORMALIZATION1", DML_OPERATOR_MEAN_VARIANCE_NORMALIZATION1}, + {"DML_OPERATOR_RESAMPLE1", DML_OPERATOR_RESAMPLE1}, + {"DML_OPERATOR_MATRIX_MULTIPLY_INTEGER", DML_OPERATOR_MATRIX_MULTIPLY_INTEGER}, + {"DML_OPERATOR_QUANTIZED_LINEAR_MATRIX_MULTIPLY", DML_OPERATOR_QUANTIZED_LINEAR_MATRIX_MULTIPLY}, + {"DML_OPERATOR_CONVOLUTION_INTEGER", DML_OPERATOR_CONVOLUTION_INTEGER}, + {"DML_OPERATOR_QUANTIZED_LINEAR_CONVOLUTION", DML_OPERATOR_QUANTIZED_LINEAR_CONVOLUTION}, + {"DML_OPERATOR_ELEMENT_WISE_BIT_AND", DML_OPERATOR_ELEMENT_WISE_BIT_AND}, + {"DML_OPERATOR_ELEMENT_WISE_BIT_OR", DML_OPERATOR_ELEMENT_WISE_BIT_OR}, + {"DML_OPERATOR_ELEMENT_WISE_BIT_XOR", DML_OPERATOR_ELEMENT_WISE_BIT_XOR}, + {"DML_OPERATOR_ELEMENT_WISE_BIT_NOT", DML_OPERATOR_ELEMENT_WISE_BIT_NOT}, + {"DML_OPERATOR_ELEMENT_WISE_BIT_COUNT", DML_OPERATOR_ELEMENT_WISE_BIT_COUNT}, + {"DML_OPERATOR_ACTIVATION_RELU_GRAD", DML_OPERATOR_ACTIVATION_RELU_GRAD}, + {"DML_OPERATOR_AVERAGE_POOLING_GRAD", DML_OPERATOR_AVERAGE_POOLING_GRAD}, + {"DML_OPERATOR_MAX_POOLING_GRAD", DML_OPERATOR_MAX_POOLING_GRAD}, + {"DML_OPERATOR_RANDOM_GENERATOR", DML_OPERATOR_RANDOM_GENERATOR}, + {"DML_OPERATOR_NONZERO_COORDINATES", DML_OPERATOR_NONZERO_COORDINATES}, + {"DML_OPERATOR_RESAMPLE_GRAD", DML_OPERATOR_RESAMPLE_GRAD}, + {"DML_OPERATOR_SLICE_GRAD", DML_OPERATOR_SLICE_GRAD}, + {"DML_OPERATOR_ADAM_OPTIMIZER", DML_OPERATOR_ADAM_OPTIMIZER}, + {"DML_OPERATOR_ARGMIN", DML_OPERATOR_ARGMIN}, + {"DML_OPERATOR_ARGMAX", DML_OPERATOR_ARGMAX}, + {"DML_OPERATOR_ROI_ALIGN", DML_OPERATOR_ROI_ALIGN}, + {"DML_OPERATOR_GATHER_ND1", DML_OPERATOR_GATHER_ND1}, + {"DML_OPERATOR_ELEMENT_WISE_ATAN_YX", DML_OPERATOR_ELEMENT_WISE_ATAN_YX}, + {"DML_OPERATOR_ELEMENT_WISE_CLIP_GRAD", DML_OPERATOR_ELEMENT_WISE_CLIP_GRAD}, + {"DML_OPERATOR_ELEMENT_WISE_DIFFERENCE_SQUARE", DML_OPERATOR_ELEMENT_WISE_DIFFERENCE_SQUARE}, + {"DML_OPERATOR_LOCAL_RESPONSE_NORMALIZATION_GRAD", DML_OPERATOR_LOCAL_RESPONSE_NORMALIZATION_GRAD}, + {"DML_OPERATOR_CUMULATIVE_PRODUCT", DML_OPERATOR_CUMULATIVE_PRODUCT}, + {"DML_OPERATOR_BATCH_NORMALIZATION_GRAD", DML_OPERATOR_BATCH_NORMALIZATION_GRAD}, + {"DML_OPERATOR_BATCH_NORMALIZATION_TRAINING_GRAD", DML_OPERATOR_BATCH_NORMALIZATION_TRAINING_GRAD}, + {"DML_OPERATOR_ELEMENT_WISE_QUANTIZED_LINEAR_ADD", DML_OPERATOR_ELEMENT_WISE_QUANTIZED_LINEAR_ADD}, + {"DML_OPERATOR_DYNAMIC_QUANTIZE_LINEAR", DML_OPERATOR_DYNAMIC_QUANTIZE_LINEAR}, + {"DML_OPERATOR_ROI_ALIGN1", DML_OPERATOR_ROI_ALIGN1}, + {"DML_OPERATOR_ELEMENT_WISE_CLIP1", DML_OPERATOR_ELEMENT_WISE_CLIP1}, + {"DML_OPERATOR_ELEMENT_WISE_CLIP_GRAD1", DML_OPERATOR_ELEMENT_WISE_CLIP_GRAD1}, + {"DML_OPERATOR_ELEMENT_WISE_NEGATE", DML_OPERATOR_ELEMENT_WISE_NEGATE}, + {"DML_OPERATOR_ACTIVATION_GELU", DML_OPERATOR_ACTIVATION_GELU}, + {"DML_OPERATOR_ACTIVATION_SWISH", DML_OPERATOR_ACTIVATION_SWISH}, + {"DML_OPERATOR_ACTIVATION_HARD_SWISH", DML_OPERATOR_ACTIVATION_HARD_SWISH}, + {"DML_OPERATOR_RESAMPLE2", DML_OPERATOR_RESAMPLE2}, + {"DML_OPERATOR_RESAMPLE_GRAD1", DML_OPERATOR_RESAMPLE_GRAD1}, + {"DML_OPERATOR_DIAGONAL_MATRIX1", DML_OPERATOR_DIAGONAL_MATRIX1}, + {"DML_OPERATOR_MULTIHEAD_ATTENTION", DML_OPERATOR_MULTIHEAD_ATTENTION}, + {"DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING", DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING}, + {"DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT", DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + + +template <> +DML_BINDING_TYPE ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_BINDING_TYPE_NONE", DML_BINDING_TYPE_NONE}, + {"DML_BINDING_TYPE_BUFFER", DML_BINDING_TYPE_BUFFER}, + {"DML_BINDING_TYPE_BUFFER_ARRAY", DML_BINDING_TYPE_BUFFER_ARRAY}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + + +template <> +DML_REDUCE_FUNCTION ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_REDUCE_FUNCTION_ARGMAX", DML_REDUCE_FUNCTION_ARGMAX}, + {"DML_REDUCE_FUNCTION_ARGMIN", DML_REDUCE_FUNCTION_ARGMIN}, + {"DML_REDUCE_FUNCTION_AVERAGE", DML_REDUCE_FUNCTION_AVERAGE}, + {"DML_REDUCE_FUNCTION_L1", DML_REDUCE_FUNCTION_L1}, + {"DML_REDUCE_FUNCTION_L2", DML_REDUCE_FUNCTION_L2}, + {"DML_REDUCE_FUNCTION_LOG_SUM", DML_REDUCE_FUNCTION_LOG_SUM}, + {"DML_REDUCE_FUNCTION_LOG_SUM_EXP", DML_REDUCE_FUNCTION_LOG_SUM_EXP}, + {"DML_REDUCE_FUNCTION_MAX", DML_REDUCE_FUNCTION_MAX}, + {"DML_REDUCE_FUNCTION_MIN", DML_REDUCE_FUNCTION_MIN}, + {"DML_REDUCE_FUNCTION_MULTIPLY", DML_REDUCE_FUNCTION_MULTIPLY}, + {"DML_REDUCE_FUNCTION_SUM", DML_REDUCE_FUNCTION_SUM}, + {"DML_REDUCE_FUNCTION_SUM_SQUARE", DML_REDUCE_FUNCTION_SUM_SQUARE}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + +template <> +DML_MATRIX_TRANSFORM ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_MATRIX_TRANSFORM_NONE", DML_MATRIX_TRANSFORM_NONE}, + {"DML_MATRIX_TRANSFORM_TRANSPOSE", DML_MATRIX_TRANSFORM_TRANSPOSE}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + + +template <> +DML_CONVOLUTION_MODE ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_CONVOLUTION_MODE_CONVOLUTION", DML_CONVOLUTION_MODE_CONVOLUTION}, + {"DML_CONVOLUTION_MODE_CROSS_CORRELATION", DML_CONVOLUTION_MODE_CROSS_CORRELATION}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + + +template <> +DML_CONVOLUTION_DIRECTION ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_CONVOLUTION_DIRECTION_FORWARD", DML_CONVOLUTION_DIRECTION_FORWARD}, + {"DML_CONVOLUTION_DIRECTION_BACKWARD", DML_CONVOLUTION_DIRECTION_BACKWARD}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + +template <> +DML_PADDING_MODE ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_PADDING_MODE_CONSTANT", DML_PADDING_MODE_CONSTANT}, + {"DML_PADDING_MODE_EDGE", DML_PADDING_MODE_EDGE}, + {"DML_PADDING_MODE_REFLECTION", DML_PADDING_MODE_REFLECTION}, + {"DML_PADDING_MODE_SYMMETRIC", DML_PADDING_MODE_SYMMETRIC}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + + +template <> +DML_INTERPOLATION_MODE ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_INTERPOLATION_MODE_NEAREST_NEIGHBOR", DML_INTERPOLATION_MODE_NEAREST_NEIGHBOR}, + {"DML_INTERPOLATION_MODE_LINEAR", DML_INTERPOLATION_MODE_LINEAR}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + + +template <> +DML_RECURRENT_NETWORK_DIRECTION ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_RECURRENT_NETWORK_DIRECTION_FORWARD", DML_RECURRENT_NETWORK_DIRECTION_FORWARD}, + {"DML_RECURRENT_NETWORK_DIRECTION_BACKWARD", DML_RECURRENT_NETWORK_DIRECTION_BACKWARD}, + {"DML_RECURRENT_NETWORK_DIRECTION_BIDIRECTIONAL", DML_RECURRENT_NETWORK_DIRECTION_BIDIRECTIONAL}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + + +template <> +DML_FEATURE ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_FEATURE_TENSOR_DATA_TYPE_SUPPORT", DML_FEATURE_TENSOR_DATA_TYPE_SUPPORT}, + {"DML_FEATURE_FEATURE_LEVELS", DML_FEATURE_FEATURE_LEVELS}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + + +template <> +DML_FEATURE_LEVEL ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_FEATURE_LEVEL_1_0", DML_FEATURE_LEVEL_1_0}, + {"DML_FEATURE_LEVEL_2_0", DML_FEATURE_LEVEL_2_0}, + {"DML_FEATURE_LEVEL_2_1", DML_FEATURE_LEVEL_2_1}, + {"DML_FEATURE_LEVEL_3_0", DML_FEATURE_LEVEL_3_0}, + {"DML_FEATURE_LEVEL_3_1", DML_FEATURE_LEVEL_3_1}, + {"DML_FEATURE_LEVEL_4_0", DML_FEATURE_LEVEL_4_0}, + {"DML_FEATURE_LEVEL_4_1", DML_FEATURE_LEVEL_4_1}, + {"DML_FEATURE_LEVEL_5_0", DML_FEATURE_LEVEL_5_0}, + {"DML_FEATURE_LEVEL_5_1", DML_FEATURE_LEVEL_5_1}, + {"DML_FEATURE_LEVEL_5_2", DML_FEATURE_LEVEL_5_2}, + {"DML_FEATURE_LEVEL_6_0", DML_FEATURE_LEVEL_6_0}, + {"DML_FEATURE_LEVEL_6_1", DML_FEATURE_LEVEL_6_1}, + {"DML_FEATURE_LEVEL_6_2", DML_FEATURE_LEVEL_6_2}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + + +template <> +DML_IS_INFINITY_MODE ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_IS_INFINITY_MODE_EITHER", DML_IS_INFINITY_MODE_EITHER}, + {"DML_IS_INFINITY_MODE_POSITIVE", DML_IS_INFINITY_MODE_POSITIVE}, + {"DML_IS_INFINITY_MODE_NEGATIVE", DML_IS_INFINITY_MODE_NEGATIVE}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + + +template <> +DML_DEPTH_SPACE_ORDER ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_DEPTH_SPACE_ORDER_DEPTH_COLUMN_ROW", DML_DEPTH_SPACE_ORDER_DEPTH_COLUMN_ROW}, + {"DML_DEPTH_SPACE_ORDER_COLUMN_ROW_DEPTH", DML_DEPTH_SPACE_ORDER_COLUMN_ROW_DEPTH}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + + +template <> +DML_AXIS_DIRECTION ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_AXIS_DIRECTION_INCREASING", DML_AXIS_DIRECTION_INCREASING}, + {"DML_AXIS_DIRECTION_DECREASING", DML_AXIS_DIRECTION_DECREASING}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + + +template <> +DML_ROUNDING_MODE ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_ROUNDING_MODE_HALVES_TO_NEAREST_EVEN", DML_ROUNDING_MODE_HALVES_TO_NEAREST_EVEN}, + {"DML_ROUNDING_MODE_TOWARD_ZERO", DML_ROUNDING_MODE_TOWARD_ZERO}, + {"DML_ROUNDING_MODE_TOWARD_INFINITY", DML_ROUNDING_MODE_TOWARD_INFINITY}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + + +template <> +DML_RANDOM_GENERATOR_TYPE ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_RANDOM_GENERATOR_TYPE_PHILOX_4X32_10", DML_RANDOM_GENERATOR_TYPE_PHILOX_4X32_10}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + + +template <> +DML_MULTIHEAD_ATTENTION_MASK_TYPE ApiTraits::StringifyHelpers::FromString(std::string_view value) +{ + constexpr StringUtil::NameAndIndex mapping[] = + { + {"DML_MULTIHEAD_ATTENTION_MASK_TYPE_NONE", DML_MULTIHEAD_ATTENTION_MASK_TYPE_NONE}, + {"DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_SEQUENCE_LENGTH", DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_SEQUENCE_LENGTH}, + {"DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_SEQUENCE_END_START", DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_SEQUENCE_END_START}, + {"DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_QUERY_SEQUENCE_LENGTH_START_END", DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_QUERY_SEQUENCE_LENGTH_START_END}, + {"DML_MULTIHEAD_ATTENTION_MASK_TYPE_BOOLEAN", DML_MULTIHEAD_ATTENTION_MASK_TYPE_BOOLEAN}, + }; + auto index = StringUtil::MapToIndex(value, mapping); + if (!index) + { + assert(false); + return static_cast(0); + } + return static_cast(*index); +} + diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphDeserialization.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphDeserialization.cpp new file mode 100644 index 0000000000000..7d8ed17e7d925 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphDeserialization.cpp @@ -0,0 +1,554 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. + +#pragma once +#include "precomp.h" + +OperatorFieldVariant CreateAttribute( + const DML_SCHEMA_FIELD* schemaField, + const dml::ir::operatorFieldTypes::AttributeDesc* attributeDesc); + +OperatorFieldVariant CreateActivation( + const dml::ir::operatorFieldTypes::Activation* activationDesc) +{ + DML_OPERATOR_TYPE activationOperatorType = ApiTraits::StringifyHelpers::FromString(activationDesc->type()->c_str()); + const DML_OPERATOR_SCHEMA& activationSchema = SchemaHelpers::GetSchema(activationOperatorType); + std::vector activationOperatorFields(activationSchema.FieldCount); + uint32_t attributeIndex = 0; + + for (uint32_t fieldIndex = 0; fieldIndex < activationSchema.FieldCount; fieldIndex++) + { + const DML_SCHEMA_FIELD* schemaField = &activationSchema.Fields[fieldIndex]; + OperatorFieldVariant field; + switch (schemaField->Kind) + { + case DML_SCHEMA_FIELD_KIND_INPUT_TENSOR: + case DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR: + { + if (schemaField->Type == DML_SCHEMA_FIELD_TYPE_TENSOR_DESC) + { + field = OperatorFieldTypes::TensorDesc(); + } + else if (schemaField->Type == DML_SCHEMA_FIELD_TYPE_TENSOR_DESC_ARRAY) + { + field = OperatorFieldTypes::TensorDescArray(); + } + break; + } + case DML_SCHEMA_FIELD_KIND_ATTRIBUTE: + { + const dml::ir::operatorFieldTypes::AttributeDesc* attributeDesc = + attributeIndex >= activationDesc->attributes()->size() ? + nullptr : + activationDesc->attributes()->Get(attributeIndex++); + field = CreateAttribute(schemaField, attributeDesc); + break; + } + } + + activationOperatorFields[fieldIndex] = OperatorField(schemaField, std::move(field)); + } + + return AbstractOperatorDesc(&activationSchema, std::move(activationOperatorFields)); +} + +OperatorFieldVariant CreateActivations( + const dml::ir::operatorFieldTypes::ActivationArray* activationDescs) +{ + std::vector activations; + for (uint32_t index = 0; index < static_cast(activationDescs->data()->size()); index++) + { + OperatorFieldVariant activation = CreateActivation(activationDescs->data()->Get(index)); + activations.push_back(std::get(activation).value()); + } + return activations; +} + +OperatorFieldVariant CreateAttribute( + const DML_SCHEMA_FIELD* schemaField, + const dml::ir::operatorFieldTypes::AttributeDesc* attributeDesc) +{ + switch (schemaField->Type) + { + case DML_SCHEMA_FIELD_TYPE_OPERATOR_DESC: + { + return attributeDesc != nullptr && attributeDesc->val_as_Activation() != nullptr ? + CreateActivation(attributeDesc->val_as_Activation()) : + OperatorFieldTypes::FusedActivationOperatorDesc(); + } + case DML_SCHEMA_FIELD_TYPE_OPERATOR_DESC_ARRAY: + { + return attributeDesc != nullptr && attributeDesc->val_as_ActivationArray() != nullptr ? + CreateActivations(attributeDesc->val_as_ActivationArray()) : + OperatorFieldTypes::FusedActivationOperatorDescArray(); + } + case DML_SCHEMA_FIELD_TYPE_UINT: + { + OperatorFieldTypes::UInt data; + if (attributeDesc != nullptr) + { + data = attributeDesc->val_as_UInt32()->data(); + } + return data; + } + case DML_SCHEMA_FIELD_TYPE_UINT64: + { + OperatorFieldTypes::UInt64 data; + if (attributeDesc != nullptr) + { + data = attributeDesc->val_as_UInt64()->data(); + } + return data; + } + case DML_SCHEMA_FIELD_TYPE_INT: + { + OperatorFieldTypes::Int data; + if (attributeDesc != nullptr) + { + data = attributeDesc->val_as_Int32()->data(); + } + return data; + } + case DML_SCHEMA_FIELD_TYPE_FLOAT: + { + OperatorFieldTypes::Float data; + if (attributeDesc != nullptr) + { + data = attributeDesc->val_as_Float32()->data(); + } + return data; + } + case DML_SCHEMA_FIELD_TYPE_UINT_ARRAY: + { + OperatorFieldTypes::UIntArray data; + if (attributeDesc != nullptr) + { + data.assign(attributeDesc->val_as_UIntArray()->data()->begin(), attributeDesc->val_as_UIntArray()->data()->end()); + } + return data; + } + case DML_SCHEMA_FIELD_TYPE_INT_ARRAY: + { + OperatorFieldTypes::IntArray data; + if (attributeDesc != nullptr) + { + data.assign(attributeDesc->val_as_IntArray()->data()->begin(), attributeDesc->val_as_IntArray()->data()->end()); + } + return data; + } + case DML_SCHEMA_FIELD_TYPE_FLOAT_ARRAY: + { + OperatorFieldTypes::FloatArray data; + if (attributeDesc != nullptr) + { + data.assign(attributeDesc->val_as_FloatArray()->data()->begin(), attributeDesc->val_as_FloatArray()->data()->end()); + } + return data; + } + case DML_SCHEMA_FIELD_TYPE_SCALE_BIAS: + { + OperatorFieldTypes::ScaleBias scaleBias; + const dml::ir::operatorFieldTypes::ScaleBias* scaleBiasAttribute = attributeDesc->val_as_ScaleBias(); + if (scaleBiasAttribute != nullptr) + { + scaleBias = {scaleBiasAttribute->scale(), scaleBiasAttribute->bias()}; + } + return scaleBias; + } + case DML_SCHEMA_FIELD_TYPE_SIZE_2D: + { + OperatorFieldTypes::Size2D size2d = {}; + if (attributeDesc != nullptr) + { + size2d.Height = attributeDesc->val_as_Size2D()->height(); + size2d.Width = attributeDesc->val_as_Size2D()->width(); + } + return size2d; + } + case DML_SCHEMA_FIELD_TYPE_SCALAR_UNION: + { + DML_SCALAR_UNION scalarUnion; + if (attributeDesc != nullptr) + { + const dml::ir::operatorFieldTypes::ByteArray* byteArr = attributeDesc->val_as_ScalarUnionData()->data_as_ByteArray(); + std::copy(byteArr->data()->begin(), byteArr->data()->end(), scalarUnion.Bytes); + } + return scalarUnion; + } + case DML_SCHEMA_FIELD_TYPE_BOOL: + { + OperatorFieldTypes::Bool data; + if (attributeDesc != nullptr) + { + data = attributeDesc->val_as_Bool()->data(); + } + return data; + } + default: + { + throw std::invalid_argument("Invalid attribute type."); + } + } +} + +OperatorFieldTypes::TensorDesc CreateBufferTensorDesc( + const dml::ir::DmlBufferTensorDesc* tensorDesc, + const bool isConstantTensor = false) +{ + DmlBufferTensorDesc bufferTensorDesc = {}; + bufferTensorDesc.dataType = ApiTraits::StringifyHelpers::FromString(tensorDesc->dataType()->c_str()); + if (isConstantTensor) + { + bufferTensorDesc.flags = DML_TENSOR_FLAG_OWNED_BY_DML; + } + bufferTensorDesc.sizes.assign(tensorDesc->sizes()->begin(), tensorDesc->sizes()->end()); + if (flatbuffers::IsFieldPresent(tensorDesc, dml::ir::DmlBufferTensorDesc::VT_STRIDES)) + { + bufferTensorDesc.strides.emplace(tensorDesc->strides()->begin(), tensorDesc->strides()->end()); + } + bufferTensorDesc.totalTensorSizeInBytes = tensorDesc->totalTensorSizeInBytes(); + return bufferTensorDesc; +} + +AbstractOperatorDesc CreateAbstractOperatorDesc( + uint32_t nodeIndex, + const dml::ir::OperatorNodeDesc* flatbufferOperatorNodeDesc, + const ::flatbuffers::Vector<::flatbuffers::Offset<::flatbuffers::String>>* nodeInputNames, + const ::flatbuffers::Vector<::flatbuffers::Offset<::flatbuffers::String>>* nodeOutputNames, + const std::unordered_set& constantInputs) +{ + DML_OPERATOR_TYPE type = ApiTraits::StringifyHelpers::FromString(flatbufferOperatorNodeDesc->type()->c_str()); + if (type == DML_OPERATOR_INVALID) + { + throw std::invalid_argument("Graph operator node at index:" + std::to_string(nodeIndex) + + " either has empty or invalid operator type."); + } + const DML_OPERATOR_SCHEMA& schema = SchemaHelpers::GetSchema(type); + std::vector operatorFields(schema.FieldCount); + + auto inputNameItr = nodeInputNames->begin(); + uint32_t inputTensorDescIndex = 0; + + uint32_t outputTensorDescIndex = 0; + auto outputNameItr = nodeOutputNames->begin(); + + uint32_t attributeIndex = 0; + + + for (uint32_t fieldIndex = 0; fieldIndex < schema.FieldCount; fieldIndex++) + { + const DML_SCHEMA_FIELD* schemaField = &schema.Fields[fieldIndex]; + + OperatorFieldVariant field; + switch (schemaField->Kind) + { + case DML_SCHEMA_FIELD_KIND_INPUT_TENSOR: + { + if (inputNameItr == nodeInputNames->end()) + { + throw std::invalid_argument("Missing input names for node at index:" + std::to_string(nodeIndex)); + } + + if (schemaField->Type == DML_SCHEMA_FIELD_TYPE_TENSOR_DESC) + { + const flatbuffers::String* inputName = *inputNameItr; + inputNameItr++; + if (inputName->size() == 0) + { + field = OperatorFieldTypes::TensorDesc(); + break; + } + bool isConstantTensor = !constantInputs.empty() && constantInputs.find(inputName->c_str()) != constantInputs.end(); + + if (flatbufferOperatorNodeDesc->inputs()->size() <= inputTensorDescIndex) + { + throw std::invalid_argument("Expecting at least " + std::to_string(inputTensorDescIndex + 1) + + "input tensor desc for graph operator node at index:" + std::to_string(nodeIndex)); + } + const dml::ir::DmlBufferTensorDesc* tensorDesc = flatbufferOperatorNodeDesc->inputs()->Get(inputTensorDescIndex++); + field = CreateBufferTensorDesc(tensorDesc, isConstantTensor); + } + else if (schemaField->Type == DML_SCHEMA_FIELD_TYPE_TENSOR_DESC_ARRAY) + { + std::vector tensors; + while (inputTensorDescIndex < static_cast(flatbufferOperatorNodeDesc->inputs()->size())) + { + const flatbuffers::String* inputName = *inputNameItr; + inputNameItr++; + bool isConstantTensor = !constantInputs.empty() && constantInputs.find(inputName->c_str()) != constantInputs.end(); + + if (flatbufferOperatorNodeDesc->inputs()->size() <= inputTensorDescIndex) + { + throw std::invalid_argument("Expecting at least " + std::to_string(inputTensorDescIndex + 1) + + "input tensor desc for graph operator node at index:" + std::to_string(nodeIndex)); + } + const dml::ir::DmlBufferTensorDesc* tensorDesc = flatbufferOperatorNodeDesc->inputs()->Get(inputTensorDescIndex++); + tensors.push_back(CreateBufferTensorDesc(tensorDesc, isConstantTensor).value()); + } + field = tensors; + } + break; + } + case DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR: + { + if (outputNameItr == nodeOutputNames->end()) + { + throw std::invalid_argument("Missing output names for node at index:" + std::to_string(nodeIndex)); + } + + if (schemaField->Type == DML_SCHEMA_FIELD_TYPE_TENSOR_DESC) + { + const flatbuffers::String* outputName = *outputNameItr; + outputNameItr++; + + if (outputName->size() == 0) + { + field = OperatorFieldTypes::TensorDesc(); + break; + } + + if (flatbufferOperatorNodeDesc->outputs()->size() <= outputTensorDescIndex) + { + throw std::invalid_argument("Expecting at least " + std::to_string(outputTensorDescIndex + 1) + + "output tensor desc for graph operator node at index:" + std::to_string(nodeIndex)); + } + const dml::ir::DmlBufferTensorDesc* tensorDesc = flatbufferOperatorNodeDesc->outputs()->Get(outputTensorDescIndex++); + field = CreateBufferTensorDesc(tensorDesc); + } + else if (schemaField->Type == DML_SCHEMA_FIELD_TYPE_TENSOR_DESC_ARRAY) + { + std::vector tensors; + while (outputTensorDescIndex < static_cast(flatbufferOperatorNodeDesc->outputs()->size())) + { + if (flatbufferOperatorNodeDesc->outputs()->size() <= outputTensorDescIndex) + { + throw std::invalid_argument("Expecting at least " + std::to_string(outputTensorDescIndex + 1) + + "output tensor desc for graph operator node at index:" + std::to_string(nodeIndex)); + } + const dml::ir::DmlBufferTensorDesc* tensorDesc = flatbufferOperatorNodeDesc->outputs()->Get(outputTensorDescIndex++); + tensors.push_back(CreateBufferTensorDesc(tensorDesc).value()); + } + field = tensors; + } + break; + } + case DML_SCHEMA_FIELD_KIND_ATTRIBUTE: + { + if (flatbufferOperatorNodeDesc->attributes()->size() <= attributeIndex) + { + throw std::invalid_argument("Expecting at least " + std::to_string(attributeIndex + 1) + + "attributes for graph operator node at index:" + std::to_string(nodeIndex)); + } + const dml::ir::operatorFieldTypes::AttributeDesc* attributeDesc = + attributeIndex >= flatbufferOperatorNodeDesc->attributes()->size() ? + nullptr : + flatbufferOperatorNodeDesc->attributes()->Get(attributeIndex++); + field = CreateAttribute(schemaField, attributeDesc); + break; + } + } + + operatorFields[fieldIndex] = OperatorField(schemaField, std::move(field)); + } + + return AbstractOperatorDesc(&schema, std::move(operatorFields)); +} + +std::unordered_map ConvertToEdgeNameToIndexMap( + const ::flatbuffers::Vector<::flatbuffers::Offset<::flatbuffers::String>>* list) +{ + std::unordered_map nameToIndexMap; + for (uint32_t index = 0; index < list->size(); index++) + { + const flatbuffers::String* name = list->GetAsString(index); + if (name->size() == 0) + { + continue; + } + nameToIndexMap[name->string_view()] = index; + } + return nameToIndexMap; // NRVO will automatically move it. no need to use std::move +} + +template void PopulateEdges( + const uint32_t nodeIndex, + const ::flatbuffers::Vector<::flatbuffers::Offset<::flatbuffers::String>>* edgeNames, + const std::unordered_map& edgeNameToIndexMap, + /*out*/ std::vector& edges, + /*out*/ std::vector& intermediateEdges, + /*out*/ std::unordered_map& edgeToOutgoingNodeIndexMap) +{ + for (flatbuffers::uoffset_t edgeIndex = 0; edgeIndex < edgeNames->size(); edgeIndex++) + { + const flatbuffers::String* edgeName = edgeNames->Get(edgeIndex); + if (edgeName->size() == 0) + { + // This must be optional input/output + continue; + } + // edge can be graphInput or graphOutput + if (edgeNameToIndexMap.find(edgeName->string_view()) != edgeNameToIndexMap.end()) + { + EdgeType edge = {}; + edge.Name = edgeName->str(); + + if constexpr (std::is_same_v) + { + edge.GraphInputIndex = edgeNameToIndexMap.at(edgeName->string_view()); + edge.ToNodeIndex = nodeIndex; + edge.ToNodeInputIndex = edgeIndex; + } + else if constexpr (std::is_same_v) + { + edge.GraphOutputIndex = edgeNameToIndexMap.at(edgeName->string_view()); + edge.FromNodeIndex = nodeIndex; + edge.FromNodeOutputIndex = edgeIndex; + edgeToOutgoingNodeIndexMap[edgeName->string_view()] = {nodeIndex, edgeIndex}; + } + + edges.push_back(edge); + } + // edge is intermediate edge + else + { + if constexpr (std::is_same_v) + { + if (edgeToOutgoingNodeIndexMap.find(edgeName->string_view()) == edgeToOutgoingNodeIndexMap.end()) + { + throw std::range_error("Neither there is any graph input with name " + edgeName->str() + + "nor there is any node which has " + edgeName->str() + " as one of the output."); + } + auto& intermediateEdgeNodeIndex = edgeToOutgoingNodeIndexMap[edgeName->string_view()]; + DmlIntermediateSerializedGraphEdge intermediateEdge = {}; + intermediateEdge.Name = edgeName->str(); + intermediateEdge.FromNodeIndex = intermediateEdgeNodeIndex.nodeIndex; + intermediateEdge.FromNodeOutputIndex = intermediateEdgeNodeIndex.nodeOutputIndex; + intermediateEdge.ToNodeIndex = nodeIndex; + intermediateEdge.ToNodeInputIndex = edgeIndex; + intermediateEdges.push_back(std::move(intermediateEdge)); + } + else if constexpr (std::is_same_v) + { + edgeToOutgoingNodeIndexMap[edgeName->string_view()] = {nodeIndex, edgeIndex}; + } + } + } +} + +/* +* - Handling of empty optional input/output/attibute for non-constant node: +* input/output +* - and will have an null entry +* but the actual OperatorNodeDesc variant's +* and will not have any entry. +* attribute +* - will have null entry +*/ +DmlSerializedGraphDesc DeserializeDmlGraph( + const uint8_t* flatbufferGraphDescBlob, + /*out*/ std::vector>& rawData) +{ + if (flatbufferGraphDescBlob == nullptr) + { + throw std::invalid_argument("Given pointer to flatbuffer blob is null"); + } + const dml::ir::DmlGraphDesc* flatbufferGraphDesc = dml::ir::GetDmlGraphDesc(flatbufferGraphDescBlob); + + std::unordered_map graphInputEdgeToIndexMap = ConvertToEdgeNameToIndexMap(flatbufferGraphDesc->graphInputNames()); + std::unordered_map graphOutputEdgeToIndexMap = ConvertToEdgeNameToIndexMap(flatbufferGraphDesc->graphOutputNames()); + + std::unordered_map edgeToOutgoingNodeIndexMap; + std::unordered_set constantInputs; + + std::vector nodes(flatbufferGraphDesc->nodes()->size()); + std::vector inputEdges; + std::vector outputEdges; + std::vector intermediateEdges; + + for (uint32_t nodeIndex = 0; nodeIndex < flatbufferGraphDesc->nodes()->size(); nodeIndex++) + { + const dml::ir::DmlGraphNode* flatbufferNode = flatbufferGraphDesc->nodes()->Get(nodeIndex); + + PopulateEdges( + nodeIndex, + flatbufferNode->inputNames(), + graphInputEdgeToIndexMap, + inputEdges, + intermediateEdges, + edgeToOutgoingNodeIndexMap); + PopulateEdges( + nodeIndex, + flatbufferNode->outputNames(), + graphOutputEdgeToIndexMap, + outputEdges, + intermediateEdges, + edgeToOutgoingNodeIndexMap); + + DmlSerializedGraphNode node = {}; + if (flatbufferNode->name()->size() == 0) + { + throw std::invalid_argument("Graph node at index:" + std::to_string(nodeIndex) + " doesn't have any name"); + } + node.Name = flatbufferNode->name()->c_str(); + + if (flatbufferNode->desc_type() == dml::ir::NodeDesc_ConstantNodeDesc) + { + const dml::ir::ConstantNodeDesc* flatbufferConstantNode = flatbufferNode->desc_as_ConstantNodeDesc(); + if (flatbufferConstantNode->data_type() == dml::ir::ConstantNodeDescDetail_ConstantName) + { + if (flatbufferConstantNode->data_as_ConstantName()->name()->size() == 0) + { + throw std::invalid_argument("Constant node at index:" + std::to_string(nodeIndex) + + " doesn't have constant data name."); + } + + ConstantName constantNode = {flatbufferConstantNode->data_as_ConstantName()->name()->c_str()}; + node.Desc = constantNode; + // output of this node will part of constantInputs list + for (uint32_t outputIndex = 0; outputIndex < flatbufferNode->outputNames()->size(); outputIndex++) + { + constantInputs.insert(flatbufferNode->outputNames()->Get(outputIndex)->c_str()); + } + } + else if (flatbufferConstantNode->data_type() == dml::ir::ConstantNodeDescDetail_ConstantRawData) + { + + uint32_t rawDataSize = flatbufferConstantNode->data_as_ConstantRawData()->data()->size(); + rawData.push_back(std::make_unique(rawDataSize)); + std::transform( + flatbufferConstantNode->data_as_ConstantRawData()->data()->begin(), + flatbufferConstantNode->data_as_ConstantRawData()->data()->end(), + rawData.back().get(), + [](uint8_t b) {return static_cast(b);}); + + ConstantData constantData = {}; + constantData.dataSize = rawDataSize; + constantData.data = rawData.back().get(); + node.Desc = constantData; + } + + + } + else if (flatbufferNode->desc_type() == dml::ir::NodeDesc::NodeDesc_OperatorNodeDesc) + { + // convert dml::ir::OperatorNodeDesc to AbstractOperatorDesc + const dml::ir::OperatorNodeDesc* flatbufferOperatorNodeDesc = flatbufferNode->desc_as_OperatorNodeDesc(); + node.Desc = CreateAbstractOperatorDesc( + nodeIndex, + flatbufferOperatorNodeDesc, + flatbufferNode->inputNames(), + flatbufferNode->outputNames(), + constantInputs); + } + + nodes[nodeIndex] = node; + } + + DmlSerializedGraphDesc graphDesc; + graphDesc.InputCount = flatbufferGraphDesc->graphInputNames()->size(); + graphDesc.OutputCount = flatbufferGraphDesc->graphOutputNames()->size(); + graphDesc.InputEdges = std::move(inputEdges); + graphDesc.IntermediateEdges = std::move(intermediateEdges); + graphDesc.OutputEdges = std::move(outputEdges); + graphDesc.Nodes = std::move(nodes); + return graphDesc; +} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp index 642d9aa03eeef..202b762d99e01 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp @@ -135,8 +135,10 @@ namespace DmlGraphFusionHelper void ProcessInputData( const ExecutionProviderImpl* providerImpl, + const bool graphSerializationEnabled, const std::vector& isInputsUploadedByDmlEP, - const std::vector& inputEdges, + const std::unordered_map* serializedGraphInputIndexToSubgraphInputIndex, + const std::unordered_map* serializedGraphLargeConstantNameToSubgraphInputIndex, const gsl::span subGraphInputArgNames, const std::unordered_map>& initializerNameToInitializerMap, onnxruntime::Graph& graph, @@ -162,8 +164,17 @@ namespace DmlGraphFusionHelper // Walk through each graph edge and mark used inputs inputsUsed.assign(fusedNodeInputCount, false); - for (const DML_INPUT_GRAPH_EDGE_DESC& edge : inputEdges) { - inputsUsed[edge.GraphInputIndex] = true; + for (auto it = serializedGraphInputIndexToSubgraphInputIndex->begin(); it != serializedGraphInputIndexToSubgraphInputIndex->end(); it++) { + inputsUsed[it->second] = true; + } + for (auto it = serializedGraphLargeConstantNameToSubgraphInputIndex->begin(); it != serializedGraphLargeConstantNameToSubgraphInputIndex->end(); it++) { + inputsUsed[it->second] = true; + } + + std::wstring modelName; + if (graphSerializationEnabled) + { + modelName = GetModelName(graph.ModelPath()); } for (uint32_t i = 0; i < initInputBindings.size(); i++) @@ -209,6 +220,10 @@ namespace DmlGraphFusionHelper // Tensor sizes in DML must be a multiple of 4 bytes large. tensorByteSize = AlignToPow2(tensorByteSize, 4); + if(graphSerializationEnabled) + { + WriteToFile(modelName, ConvertToWString(iter->first) + L".bin", reinterpret_cast(tensorPtr), tensorByteSize); + } if (inputRawData) { @@ -287,55 +302,158 @@ namespace DmlGraphFusionHelper return initializerPartitionMap; } + inline uint32_t GetConstantNodeGraphInputIndex( + const std::string& constantName, + const std::unordered_map* serializedGraphConstantNameToMainGraphInputIndex, + uint32_t& graphMaxInputIndex, + std::unordered_map& localConstantNameToIndexMap) + { + if (serializedGraphConstantNameToMainGraphInputIndex == nullptr) + { + if (localConstantNameToIndexMap.find(constantName) == localConstantNameToIndexMap.end()) + { + localConstantNameToIndexMap[constantName] = ++graphMaxInputIndex; + } + return localConstantNameToIndexMap[constantName]; + } + else + { + graphMaxInputIndex = std::max(graphMaxInputIndex, serializedGraphConstantNameToMainGraphInputIndex->at(constantName)); + return serializedGraphConstantNameToMainGraphInputIndex->at(constantName); + } + } + + template void ConvertGraphDesc( const Dml::GraphDescBuilder::GraphDesc& graphDesc, - _Out_ DML_GRAPH_DESC& dmlGraphDesc, const uint32_t inputCount, const uint32_t outputCount, - _Inout_ std::vector& dmlOperatorGraphNodes, - _Inout_ std::vector& dmlConstantGraphNodes, + IDMLDevice* device, + StackAllocator& allocator, + const std::unordered_map* serializedGraphInputIndexToSubgraphInputIndex, + const std::unordered_map* serializedGraphLargeConstantNameToSubgraphInputIndex, + _Out_ DML_GRAPH_DESC& dmlGraphDesc, + _Inout_ std::vector>& dmlOperators, _Inout_ std::vector& dmlGraphNodes, _Inout_ std::vector& dmlInputEdges, _Inout_ std::vector& dmlOutputEdges, _Inout_ std::vector& dmlIntermediateEdges) { - for (size_t i = 0; i < graphDesc.nodes.size(); ++i) + std::unordered_map oldNodeIndexToNewNodeIndexMap; + for (uint32_t index = 0; index < static_cast(graphDesc.Nodes.size()); index++) { - auto& nodeInfo = graphDesc.nodes[i]; - - if (std::holds_alternative>(nodeInfo.nodeDef)) + const DmlSerializedGraphNode& node = graphDesc.Nodes[index]; + if (std::holds_alternative(node.Desc)) { - dmlOperatorGraphNodes[i] = DML_OPERATOR_GRAPH_NODE_DESC{std::get>(nodeInfo.nodeDef).Get(), nodeInfo.name.data()}; - dmlGraphNodes[i] = DML_GRAPH_NODE_DESC{DML_GRAPH_NODE_TYPE_OPERATOR, &dmlOperatorGraphNodes[i]}; + oldNodeIndexToNewNodeIndexMap[index] = static_cast(dmlGraphNodes.size()); + DML_OPERATOR_DESC dmlDesc = SchemaHelpers::ConvertOperatorDesc(std::get(node.Desc), &allocator); + ComPtr op; + ORT_THROW_IF_FAILED(device->CreateOperator(&dmlDesc, IID_PPV_ARGS(&op))); + dmlOperators.push_back(op); + DML_OPERATOR_GRAPH_NODE_DESC* dmlOperatorGraphNode = allocator.template Allocate(); + dmlOperatorGraphNode->Name = node.Name.data(); + dmlOperatorGraphNode->Operator = op.Get(); + dmlGraphNodes.push_back(DML_GRAPH_NODE_DESC{DML_GRAPH_NODE_TYPE_OPERATOR, dmlOperatorGraphNode}); } else { - auto& nodeDefinitionData = std::get>(nodeInfo.nodeDef); - dmlConstantGraphNodes[i] = DML_CONSTANT_DATA_GRAPH_NODE_DESC{ - nodeDefinitionData.data(), - nodeDefinitionData.size(), - nodeInfo.name.data() - }; - - // TODO: Change as new header is ingested - dmlGraphNodes[i] = DML_GRAPH_NODE_DESC{static_cast(2), &dmlConstantGraphNodes[i]}; + auto& constantNodeVariant = std::get(node.Desc); + if (std::holds_alternative(constantNodeVariant)) + { + oldNodeIndexToNewNodeIndexMap[index] = static_cast(dmlGraphNodes.size()); + + auto& constantData = std::get(constantNodeVariant); + + DML_CONSTANT_DATA_GRAPH_NODE_DESC* constantNode = allocator.template Allocate(); + constantNode->Name = node.Name.data(); + constantNode->DataSize = constantData.dataSize; + constantNode->Data = constantData.data; + dmlGraphNodes.push_back(DML_GRAPH_NODE_DESC{DML_GRAPH_NODE_TYPE_CONSTANT, constantNode}); + } } } - for (size_t i = 0; i < graphDesc.inputEdges.size(); ++i) + uint32_t graphMaxInputIndex = 0; + + for (size_t i = 0; i < graphDesc.InputEdges.size(); ++i) { - dmlInputEdges[i] = DML_GRAPH_EDGE_DESC{DML_GRAPH_EDGE_TYPE_INPUT, &graphDesc.inputEdges[i]}; + DML_INPUT_GRAPH_EDGE_DESC* edge = allocator.template Allocate(); + // 1. If serializedGraphInputIndexToMainGraphInputIndex is not null: + // then use the corresponding main graph input index, because the caller will use corresponding + // main graph input index for extracting the actual input tensor from the main graph and + // the caller does not own the creation of dml bindings directly. + // Use Case: When the caller is ORT (DML EP) or DmlEngine. + // + // 2. If serializedGraphInputIndexToMainGraphInputIndex is null: + // then assign the sequential graph input index, because it owns the creation of dml bindings + // directly. + edge->GraphInputIndex = serializedGraphInputIndexToSubgraphInputIndex == nullptr ? + graphDesc.InputEdges[i].GraphInputIndex : + serializedGraphInputIndexToSubgraphInputIndex->at(graphDesc.InputEdges[i].GraphInputIndex); + edge->ToNodeIndex = oldNodeIndexToNewNodeIndexMap[graphDesc.InputEdges[i].ToNodeIndex]; + edge->ToNodeInputIndex = graphDesc.InputEdges[i].ToNodeInputIndex; + edge->Name = graphDesc.InputEdges[i].Name.data(); + + graphMaxInputIndex = std::max(graphMaxInputIndex, edge->GraphInputIndex); + dmlInputEdges.push_back(DML_GRAPH_EDGE_DESC{DML_GRAPH_EDGE_TYPE_INPUT, edge}); } - for (size_t i = 0; i < graphDesc.outputEdges.size(); ++i) + for (size_t i = 0; i < graphDesc.OutputEdges.size(); ++i) { - dmlOutputEdges[i] = DML_GRAPH_EDGE_DESC{DML_GRAPH_EDGE_TYPE_OUTPUT, &graphDesc.outputEdges[i]}; + DML_OUTPUT_GRAPH_EDGE_DESC* edge = allocator.template Allocate(); + edge->GraphOutputIndex = graphDesc.OutputEdges[i].GraphOutputIndex; + edge->FromNodeIndex = oldNodeIndexToNewNodeIndexMap[graphDesc.OutputEdges[i].FromNodeIndex]; + edge->FromNodeOutputIndex = graphDesc.OutputEdges[i].FromNodeOutputIndex; + edge->Name = graphDesc.OutputEdges[i].Name.data(); + + dmlOutputEdges.push_back(DML_GRAPH_EDGE_DESC{DML_GRAPH_EDGE_TYPE_OUTPUT, edge}); } - for (size_t i = 0; i < graphDesc.intermediateEdges.size(); ++i) + std::unordered_map localConstantNameToIndexMap; + for (uint32_t i = 0; i < static_cast(graphDesc.IntermediateEdges.size()); ++i) { - dmlIntermediateEdges[i] = - DML_GRAPH_EDGE_DESC{DML_GRAPH_EDGE_TYPE_INTERMEDIATE, &graphDesc.intermediateEdges[i]}; + DmlSerializedGraphNodeDescVariant descVariant = graphDesc.Nodes[graphDesc.IntermediateEdges[i].FromNodeIndex].Desc; + bool isConstantEdge = std::holds_alternative(descVariant); + if (isConstantEdge) + { + auto& constantNodeVariant = std::get(descVariant); + if (std::holds_alternative(constantNodeVariant)) + { + DML_INTERMEDIATE_GRAPH_EDGE_DESC* edge = allocator.template Allocate(); + edge->FromNodeIndex = oldNodeIndexToNewNodeIndexMap[graphDesc.IntermediateEdges[i].FromNodeIndex]; + edge->FromNodeOutputIndex = graphDesc.IntermediateEdges[i].FromNodeOutputIndex; + edge->ToNodeIndex = oldNodeIndexToNewNodeIndexMap[graphDesc.IntermediateEdges[i].ToNodeIndex]; + edge->ToNodeInputIndex = graphDesc.IntermediateEdges[i].ToNodeInputIndex; + edge->Name = graphDesc.IntermediateEdges[i].Name.data(); + dmlIntermediateEdges.push_back(DML_GRAPH_EDGE_DESC{DML_GRAPH_EDGE_TYPE_INTERMEDIATE, edge}); + } + else + { + const std::string& constantName = graphDesc.Nodes[graphDesc.IntermediateEdges[i].FromNodeIndex].Name; + + DML_INPUT_GRAPH_EDGE_DESC* edge = allocator.template Allocate(); + edge->GraphInputIndex = GetConstantNodeGraphInputIndex( + constantName, + serializedGraphLargeConstantNameToSubgraphInputIndex, + graphMaxInputIndex, + localConstantNameToIndexMap); + edge->ToNodeIndex = oldNodeIndexToNewNodeIndexMap[graphDesc.IntermediateEdges[i].ToNodeIndex]; + edge->ToNodeInputIndex = graphDesc.IntermediateEdges[i].ToNodeInputIndex; + edge->Name = graphDesc.IntermediateEdges[i].Name.data(); + + dmlInputEdges.push_back({DML_GRAPH_EDGE_TYPE_INPUT, edge}); + } + } + else + { + DML_INTERMEDIATE_GRAPH_EDGE_DESC* edge = allocator.template Allocate(); + edge->FromNodeIndex = oldNodeIndexToNewNodeIndexMap[graphDesc.IntermediateEdges[i].FromNodeIndex]; + edge->FromNodeOutputIndex = graphDesc.IntermediateEdges[i].FromNodeOutputIndex; + edge->ToNodeIndex = oldNodeIndexToNewNodeIndexMap[graphDesc.IntermediateEdges[i].ToNodeIndex]; + edge->ToNodeInputIndex = graphDesc.IntermediateEdges[i].ToNodeInputIndex; + edge->Name = graphDesc.IntermediateEdges[i].Name.data(); + dmlIntermediateEdges.push_back(DML_GRAPH_EDGE_DESC{DML_GRAPH_EDGE_TYPE_INTERMEDIATE, edge}); + } } dmlGraphDesc.InputCount = inputCount; @@ -400,27 +518,34 @@ namespace DmlGraphFusionHelper Microsoft::WRL::ComPtr TryCreateCompiledOperator( const GraphDescBuilder::GraphDesc& graphDesc, const onnxruntime::IndexedSubGraph& indexedSubGraph, - const ExecutionProviderImpl* providerImpl) + const ExecutionProviderImpl* providerImpl, + const std::unordered_map* serializedGraphInputIndexToSubgraphInputIndex, + const std::unordered_map* serializedGraphLargeConstantNameToSubgraphInputIndex) { const uint32_t fusedNodeInputCount = gsl::narrow_cast(indexedSubGraph.GetMetaDef()->inputs.size()); const uint32_t fusedNodeOutputCount = gsl::narrow_cast(indexedSubGraph.GetMetaDef()->outputs.size()); // convert DML EP GraphDesc into DML_GRAPH_DESC and create IDMLCompiledOperator - DML_GRAPH_DESC dmlGraphDesc = {}; - std::vector dmlOperatorGraphNodes(graphDesc.nodes.size()); - std::vector dmlConstantGraphNodes(graphDesc.nodes.size()); + ComPtr device; + ORT_THROW_IF_FAILED(providerImpl->GetDmlDevice(device.GetAddressOf())); - std::vector dmlGraphNodes(graphDesc.nodes.size()); - std::vector dmlInputEdges(graphDesc.inputEdges.size()); - std::vector dmlOutputEdges(graphDesc.outputEdges.size()); - std::vector dmlIntermediateEdges(graphDesc.intermediateEdges.size()); + StackAllocator<1024> allocator; + DML_GRAPH_DESC dmlGraphDesc = {}; + std::vector> dmlOperators; + std::vector dmlGraphNodes; + std::vector dmlInputEdges; + std::vector dmlOutputEdges; + std::vector dmlIntermediateEdges; ConvertGraphDesc( graphDesc, - dmlGraphDesc, fusedNodeInputCount, fusedNodeOutputCount, - dmlOperatorGraphNodes, - dmlConstantGraphNodes, + device.Get(), + allocator, + serializedGraphInputIndexToSubgraphInputIndex, + serializedGraphLargeConstantNameToSubgraphInputIndex, + dmlGraphDesc, + dmlOperators, dmlGraphNodes, dmlInputEdges, dmlOutputEdges, @@ -438,8 +563,6 @@ namespace DmlGraphFusionHelper executionFlags |= DML_EXECUTION_FLAG_DISABLE_META_COMMANDS; } - ComPtr device; - ORT_THROW_IF_FAILED(providerImpl->GetDmlDevice(device.GetAddressOf())); ComPtr device1; ORT_THROW_IF_FAILED(device.As(&device1)); @@ -460,6 +583,7 @@ namespace DmlGraphFusionHelper } void FusePartitionAndRegisterKernel( + const uint32_t partitionIndex, onnxruntime::Graph& graph, onnxruntime::KernelRegistry* registryForPartitionKernels, const std::unordered_map>& initializerNameToInitializerMap, @@ -467,8 +591,43 @@ namespace DmlGraphFusionHelper const onnxruntime::IndexedSubGraph& indexedSubGraph, std::vector&& isInputsUploadedByDmlEP, const GraphDescBuilder::GraphDesc& graphDesc, - Microsoft::WRL::ComPtr compiledExecutionPlanOperator) + Microsoft::WRL::ComPtr compiledExecutionPlanOperator, + const bool graphSerializationEnabled, + const std::unordered_map* serializedGraphInputIndexToSubgraphInputIndex, + const std::unordered_map* serializedGraphLargeConstantNameToSubgraphInputIndex) { + if (graphSerializationEnabled) + { + + const std::wstring modelName = GetModelName(graph.ModelPath()); + auto buffer = SerializeDmlGraph(graphDesc); + + const std::wstring partitionName = + L"Partition_" + + std::to_wstring(partitionIndex) + + L".bin"; + WriteToFile(modelName, partitionName, buffer.data(), buffer.size()); + + std::vector> rawData; + DmlSerializedGraphDesc deserializedGraphDesc = DeserializeDmlGraph(buffer.data(), rawData); + GraphDescBuilder::GraphDesc deserializedDmlGraphDesc = {}; + deserializedDmlGraphDesc.InputCount = deserializedGraphDesc.InputCount; + deserializedDmlGraphDesc.InputEdges = std::move(deserializedGraphDesc.InputEdges); + deserializedDmlGraphDesc.IntermediateEdges = std::move(deserializedGraphDesc.IntermediateEdges); + deserializedDmlGraphDesc.Nodes = std::move(deserializedGraphDesc.Nodes); + deserializedDmlGraphDesc.OutputCount = deserializedGraphDesc.OutputCount; + deserializedDmlGraphDesc.OutputEdges = std::move(deserializedGraphDesc.OutputEdges); + deserializedDmlGraphDesc.reuseCommandList = graphDesc.reuseCommandList; + deserializedDmlGraphDesc.outputShapes = graphDesc.outputShapes; + + compiledExecutionPlanOperator = DmlGraphFusionHelper::TryCreateCompiledOperator( + deserializedDmlGraphDesc, + indexedSubGraph, + providerImpl, + serializedGraphInputIndexToSubgraphInputIndex, + serializedGraphLargeConstantNameToSubgraphInputIndex); + } + auto& fusedNode = graph.BeginFuseSubGraph(indexedSubGraph, indexedSubGraph.GetMetaDef()->name); fusedNode.SetExecutionProviderType(onnxruntime::kDmlExecutionProvider); @@ -482,8 +641,10 @@ namespace DmlGraphFusionHelper std::vector inputsUsed; ProcessInputData( providerImpl, + graphSerializationEnabled, isInputsUploadedByDmlEP, - graphDesc.inputEdges, + serializedGraphInputIndexToSubgraphInputIndex, + serializedGraphLargeConstantNameToSubgraphInputIndex, indexedSubGraph.GetMetaDef()->inputs, initializerNameToInitializerMap, graph, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.h index f8f6162aaa1e0..f1e9654021196 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.h @@ -45,12 +45,17 @@ namespace DmlGraphFusionHelper gsl::span> partitions ); + template void ConvertGraphDesc( const Dml::GraphDescBuilder::GraphDesc& graphDesc, - _Out_ DML_GRAPH_DESC& dmlGraphDesc, const uint32_t inputCount, const uint32_t outputCount, - _Inout_ std::vector& dmlOperatorGraphNodes, + IDMLDevice* device, + StackAllocator& allocator, + const std::unordered_map* serializedGraphInputIndexToSubgraphInputIndex, + const std::unordered_map* serializedGraphLargeConstantNameToSubgraphInputIndex, + _Out_ DML_GRAPH_DESC& dmlGraphDesc, + _Inout_ std::vector>& dmlOperators, _Inout_ std::vector& dmlGraphNodes, _Inout_ std::vector& dmlInputEdges, _Inout_ std::vector& dmlOutputEdges, @@ -69,9 +74,12 @@ namespace DmlGraphFusionHelper Microsoft::WRL::ComPtr TryCreateCompiledOperator( const GraphDescBuilder::GraphDesc& graphDesc, const onnxruntime::IndexedSubGraph& indexedSubGraph, - const ExecutionProviderImpl* providerImpl); + const ExecutionProviderImpl* providerImpl, + const std::unordered_map* serializedGraphInputIndexToSubgraphInputIndex, + const std::unordered_map* serializedGraphLargeConstantNameToSubgraphInputIndex); void FusePartitionAndRegisterKernel( + const uint32_t partitionIndex, onnxruntime::Graph& graph, onnxruntime::KernelRegistry* registryForPartitionKernels, const std::unordered_map>& initializerNameToInitializerMap, @@ -79,7 +87,10 @@ namespace DmlGraphFusionHelper const onnxruntime::IndexedSubGraph& indexedSubGraph, std::vector&& isInputsUploadedByDmlEP, const GraphDescBuilder::GraphDesc& graphDesc, - Microsoft::WRL::ComPtr compiledExecutionPlanOperator); + Microsoft::WRL::ComPtr compiledExecutionPlanOperator, + const bool graphSerializationEnabled, + const std::unordered_map* serializedGraphInputIndexToSubgraphInputIndex = nullptr, + const std::unordered_map* serializedGraphLargeConstantNameToSubgraphInputIndex = nullptr); void RegisterDynamicKernel( onnxruntime::Graph& graph, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.cpp index 679738b639ec9..35a2c451a49a5 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.cpp @@ -24,15 +24,20 @@ namespace Dml std::vector isInputsUploadedByDmlEP; GraphDescBuilder::GraphDesc graphDesc; std::unordered_map> isInitializerTransferable; + std::vector> smallConstantData; // Need to keep it alive for maintaining lifetime + std::unordered_map serializedGraphInputIndexToSubgraphInputIndex; + std::unordered_map serializedGraphLargeConstantNameToSubgraphInputIndex; }; } DmlGraphFusionTransformer::DmlGraphFusionTransformer( const std::string& name, - const onnxruntime::IExecutionProvider* provider + const onnxruntime::IExecutionProvider* provider, + const bool graphSerializationEnabled ) :onnxruntime::GraphTransformer(name), - m_providerImpl(static_cast(provider)->GetImpl()) + m_providerImpl(static_cast(provider)->GetImpl()), + graphSerializationEnabled(graphSerializationEnabled) { } @@ -227,23 +232,39 @@ namespace Dml ComPtr device; ORT_THROW_IF_FAILED(m_providerImpl->GetDmlDevice(device.GetAddressOf())); + // This map will be used to transfer the initializer to D3D12 system heap memory. + // 'serializedDmlGraphDesc' will have constant input as intermediate edges, that's why + // we need a mapping between intermediateEdgeIndex and indexedSubGraph's (a given partition) + // input arg index. + // For ex: Let's say intermediate edge index = idx, then + // indexedSubGraphInputArgIdx = constantEdgeIdxToSubgraphInputArgIdxMap[idx]; + // corresponding constant tensor = initializerNameToInitializerMap[indexedSubGraph.GetMetaDef()->inputs[indexedSubGraphInputArgIdx]] + // We are using intermediate edge index as a key because same constant tensor can be used by + // multiple nodes. + std::unordered_map serializedGraphInputIndexToSubgraphInputIndex; + std::unordered_map serializedGraphLargeConstantNameToSubgraphInputIndex; + std::vector> smallConstantData; GraphDescBuilder::GraphDesc graphDesc = GraphDescBuilder::BuildGraphDesc( isInputsUploadedByDmlEP.data(), isInputsUploadedByDmlEP.size(), isInitializerTransferable, partitionNodePropsMap, - device.Get(), m_providerImpl, modelPath, subgraphNodes, subgraphInputs, - subgraphOutputs); + subgraphOutputs, + serializedGraphInputIndexToSubgraphInputIndex, + serializedGraphLargeConstantNameToSubgraphInputIndex, + smallConstantData); // Compile the operator auto compiledPartition = DmlGraphFusionHelper::TryCreateCompiledOperator( graphDesc, indexedSubGraph, - m_providerImpl); + m_providerImpl, + &serializedGraphInputIndexToSubgraphInputIndex, + &serializedGraphLargeConstantNameToSubgraphInputIndex); if (!compiledPartition) { @@ -264,6 +285,9 @@ namespace Dml compiledPartitionInfo->isInputsUploadedByDmlEP = std::move(isInputsUploadedByDmlEP); compiledPartitionInfo->graphDesc = std::move(graphDesc); compiledPartitionInfo->isInitializerTransferable = std::move(isInitializerTransferable); + compiledPartitionInfo->smallConstantData = std::move(smallConstantData); + compiledPartitionInfo->serializedGraphInputIndexToSubgraphInputIndex = std::move(serializedGraphInputIndexToSubgraphInputIndex); + compiledPartitionInfo->serializedGraphLargeConstantNameToSubgraphInputIndex = std::move(serializedGraphLargeConstantNameToSubgraphInputIndex); compiledPartitionInfos[partitionIndex] = std::move(compiledPartitionInfo); } } @@ -271,12 +295,14 @@ namespace Dml } while (!additionalSplittingNodes.empty()); + uint32_t partitionIndex = 0; for (auto&& compiledPartitionInfo : compiledPartitionInfos) { // Null compiled operators were not DML partitions if (compiledPartitionInfo) { DmlGraphFusionHelper::FusePartitionAndRegisterKernel( + partitionIndex++, graph, m_providerImpl->GetKernelRegistry().get(), compiledPartitionInfo->isInitializerTransferable, @@ -284,7 +310,10 @@ namespace Dml compiledPartitionInfo->indexedSubGraph, std::move(compiledPartitionInfo->isInputsUploadedByDmlEP), compiledPartitionInfo->graphDesc, - compiledPartitionInfo->compiledOperator); + compiledPartitionInfo->compiledOperator, + graphSerializationEnabled, + &compiledPartitionInfo->serializedGraphInputIndexToSubgraphInputIndex, + &compiledPartitionInfo->serializedGraphLargeConstantNameToSubgraphInputIndex); } } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.h index 19dab0c89943c..b370f3ef9043c 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.h @@ -16,7 +16,8 @@ class DmlGraphFusionTransformer : public onnxruntime::GraphTransformer public: DmlGraphFusionTransformer( const std::string& name, - const onnxruntime::IExecutionProvider* provider + const onnxruntime::IExecutionProvider* provider, + const bool graphSerializationEnabled ); public: @@ -38,5 +39,6 @@ class DmlGraphFusionTransformer : public onnxruntime::GraphTransformer private: const ExecutionProviderImpl* m_providerImpl = nullptr; + const bool graphSerializationEnabled = false; }; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphSerialization.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphSerialization.cpp new file mode 100644 index 0000000000000..5355964e8db74 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphSerialization.cpp @@ -0,0 +1,580 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. + +#pragma once +#include "precomp.h" + +template +T* ReadAs(uint8_t* base, size_t byteOffset) +{ + return reinterpret_cast(base + byteOffset); +} + +void SerializeAttributeDescs( + flatbuffers::FlatBufferBuilder& builder, + const AbstractOperatorDesc& operatorDesc, + /*out*/ std::vector>& attributeDescs); + +flatbuffers::Offset serializeActivation( + flatbuffers::FlatBufferBuilder& builder, + const AbstractOperatorDesc& activationOperatorDesc) +{ + std::vector> attributeDescs; + SerializeAttributeDescs(builder, activationOperatorDesc, attributeDescs); + + flatbuffers::Offset offset = dml::ir::operatorFieldTypes::CreateActivationDirect( + builder, + activationOperatorDesc.schema->OperatorName, + &attributeDescs); + return offset; +} + +void SerializeAttributeDescs( + flatbuffers::FlatBufferBuilder& builder, + const AbstractOperatorDesc& operatorDesc, + /*out*/ std::vector>& attributeDescs) +{ + for (const OperatorField& field : operatorDesc.fields) + { + if (field.GetSchema()->Kind == DML_SCHEMA_FIELD_KIND_INPUT_TENSOR || + field.GetSchema()->Kind == DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR) + { + continue; + } + + flatbuffers::Offset offset; + + if (std::holds_alternative(field.GetData())) + { + const OperatorFieldTypes::FusedActivationOperatorDesc& fusedActivation = field.AsFusedActivationOperatorDesc(); + if (!fusedActivation.has_value()) + { + offset = dml::ir::operatorFieldTypes::CreateAttributeDescDirect( + builder, + nullptr, + dml::ir::operatorFieldTypes::AttributeFieldVariant_Activation); + } + else + { + offset = dml::ir::operatorFieldTypes::CreateAttributeDescDirect( + builder, + field.GetSchema()->Name, + dml::ir::operatorFieldTypes::AttributeFieldVariant_Activation, + serializeActivation(builder, fusedActivation.value()).Union()); + } + } + else if (std::holds_alternative(field.GetData())) + { + const OperatorFieldTypes::FusedActivationOperatorDescArray& fusedActivations = + field.AsFusedActivationOperatorDescArray(); + if (!fusedActivations.has_value()) + { + offset = dml::ir::operatorFieldTypes::CreateAttributeDescDirect( + builder, + nullptr, + dml::ir::operatorFieldTypes::AttributeFieldVariant_ActivationArray); + } + else + { + std::vector> fbActivations; + + for (AbstractOperatorDesc activationOpDesc : fusedActivations.value()) + { + flatbuffers::Offset fbActivation = + serializeActivation(builder, activationOpDesc); + fbActivations.push_back(fbActivation); + } + + flatbuffers::Offset activationOffset = + dml::ir::operatorFieldTypes::CreateActivationArrayDirect(builder, &fbActivations); + + offset = dml::ir::operatorFieldTypes::CreateAttributeDescDirect( + builder, + field.GetSchema()->Name, + dml::ir::operatorFieldTypes::AttributeFieldVariant_ActivationArray, + activationOffset.Union()); + } + } + else if (std::holds_alternative(field.GetData())) + { + offset = dml::ir::operatorFieldTypes::CreateAttributeDescDirect( + builder, + field.GetSchema()->Name, + dml::ir::operatorFieldTypes::AttributeFieldVariant_UInt32, + builder.CreateStruct(dml::ir::operatorFieldTypes::UInt32(field.AsUInt())).Union()); + } + else if (std::holds_alternative(field.GetData())) + { + offset = dml::ir::operatorFieldTypes::CreateAttributeDescDirect( + builder, + field.GetSchema()->Name, + dml::ir::operatorFieldTypes::AttributeFieldVariant_UInt64, + builder.CreateStruct(dml::ir::operatorFieldTypes::UInt64(field.AsUInt64())).Union()); + } + else if (std::holds_alternative(field.GetData())) + { + offset = dml::ir::operatorFieldTypes::CreateAttributeDescDirect( + builder, + field.GetSchema()->Name, + dml::ir::operatorFieldTypes::AttributeFieldVariant_Int32, + builder.CreateStruct(dml::ir::operatorFieldTypes::Int32(field.AsInt())).Union()); + } + else if (std::holds_alternative(field.GetData())) + { + offset = dml::ir::operatorFieldTypes::CreateAttributeDescDirect( + builder, + field.GetSchema()->Name, + dml::ir::operatorFieldTypes::AttributeFieldVariant_Float32, + builder.CreateStruct(dml::ir::operatorFieldTypes::Float32(field.AsFloat())).Union()); + } + else if (std::holds_alternative(field.GetData())) + { + offset = dml::ir::operatorFieldTypes::CreateAttributeDescDirect( + builder, + field.GetSchema()->Name, + dml::ir::operatorFieldTypes::AttributeFieldVariant_UIntArray, + dml::ir::operatorFieldTypes::CreateUIntArray(builder, builder.CreateVector(field.AsUIntArray())).Union()); + } + else if (std::holds_alternative(field.GetData())) + { + offset = dml::ir::operatorFieldTypes::CreateAttributeDescDirect( + builder, + field.GetSchema()->Name, + dml::ir::operatorFieldTypes::AttributeFieldVariant_IntArray, + dml::ir::operatorFieldTypes::CreateIntArray(builder, builder.CreateVector(field.AsIntArray())).Union()); + } + else if (std::holds_alternative(field.GetData())) + { + offset = dml::ir::operatorFieldTypes::CreateAttributeDescDirect( + builder, + field.GetSchema()->Name, + dml::ir::operatorFieldTypes::AttributeFieldVariant_FloatArray, + dml::ir::operatorFieldTypes::CreateFloatArray(builder, builder.CreateVector(field.AsFloatArray())).Union()); + } + else if (std::holds_alternative(field.GetData())) + { + const OperatorFieldTypes::ScaleBias& scaleBias = field.AsScaleBias(); + if (!scaleBias.has_value()) + { + offset = dml::ir::operatorFieldTypes::CreateAttributeDescDirect( + builder, + nullptr, + dml::ir::operatorFieldTypes::AttributeFieldVariant_ScaleBias); + } + else + { + dml::ir::operatorFieldTypes::ScaleBias fbScaleBias(scaleBias.value().Scale, scaleBias.value().Bias); + offset = dml::ir::operatorFieldTypes::CreateAttributeDescDirect( + builder, + field.GetSchema()->Name, + dml::ir::operatorFieldTypes::AttributeFieldVariant_ScaleBias, + builder.CreateStruct(fbScaleBias).Union()); + } + } + else if (std::holds_alternative(field.GetData())) + { + const DML_SIZE_2D size2d = field.AsSize2D(); + offset = dml::ir::operatorFieldTypes::CreateAttributeDescDirect( + builder, + field.GetSchema()->Name, + dml::ir::operatorFieldTypes::AttributeFieldVariant_Size2D, + builder.CreateStruct(dml::ir::operatorFieldTypes::Size2D(size2d.Width, size2d.Height)).Union()); + } + else if (std::holds_alternative(field.GetData())) + { + OperatorFieldTypes::ScalarUnion scalarUnion = field.AsScalarUnion(); + dml::ir::operatorFieldTypes::ByteArray byteArr; + for (uint32_t index = 0; index < static_cast(sizeof(scalarUnion.Bytes)); index++) + { + byteArr.mutable_data()->Mutate(index, scalarUnion.Bytes[index]); + } + + flatbuffers::Offset scalarUnionOffset = + dml::ir::operatorFieldTypes::CreateScalarUnionData( + builder, + dml::ir::operatorFieldTypes::ScalarVariant_ByteArray, + builder.CreateStruct(byteArr).Union()); + + offset = dml::ir::operatorFieldTypes::CreateAttributeDescDirect( + builder, + field.GetSchema()->Name, + dml::ir::operatorFieldTypes::AttributeFieldVariant_ScalarUnionData, + scalarUnionOffset.Union()); + } + else if (std::holds_alternative(field.GetData())) + { + offset = dml::ir::operatorFieldTypes::CreateAttributeDescDirect( + builder, + field.GetSchema()->Name, + dml::ir::operatorFieldTypes::AttributeFieldVariant_Bool, + builder.CreateStruct(dml::ir::operatorFieldTypes::Bool(field.AsBool())).Union()); + } + else + { + continue; + } + + attributeDescs.push_back(offset); + } +} + +flatbuffers::Offset SerializeDmlTensorDesc( + flatbuffers::FlatBufferBuilder& builder, + const DmlBufferTensorDesc* tensorDesc) +{ + const std::vector *strides = nullptr; + if (tensorDesc->strides.has_value()) + { + strides = &tensorDesc->strides.value(); + } + + flatbuffers::Offset offset = dml::ir::CreateDmlBufferTensorDescDirect( + builder, + ApiTraits::StringifyHelpers::ToString(tensorDesc->dataType), + &tensorDesc->sizes, + strides, + tensorDesc->totalTensorSizeInBytes); + return offset; +} + +flatbuffers::Offset SerializeOperatorNodeDesc( + flatbuffers::FlatBufferBuilder& builder, + const AbstractOperatorDesc& operatorDesc) +{ + const DML_OPERATOR_SCHEMA* operatorSchema = operatorDesc.schema; + + std::vector> inputTensorDescs; + std::vector> outputTensorDescs; + + for (const DmlBufferTensorDesc* tensorDesc : operatorDesc.GetInputTensors()) + { + if (tensorDesc == nullptr) + { + continue; + } + flatbuffers::Offset serializedDmlTensorDesc = SerializeDmlTensorDesc(builder, tensorDesc); + inputTensorDescs.push_back(serializedDmlTensorDesc); + } + + for (const DmlBufferTensorDesc* tensorDesc : operatorDesc.GetOutputTensors()) + { + if (tensorDesc == nullptr) + { + continue; + } + flatbuffers::Offset serializedDmlTensorDesc = SerializeDmlTensorDesc(builder, tensorDesc); + outputTensorDescs.push_back(serializedDmlTensorDesc); + } + + std::vector> attributeDescs; + SerializeAttributeDescs(builder, operatorDesc, attributeDescs); + + flatbuffers::Offset offset = dml::ir::CreateOperatorNodeDesc( + builder, + builder.CreateString(operatorSchema->OperatorName), + builder.CreateVector(inputTensorDescs), + builder.CreateVector(outputTensorDescs), + builder.CreateVector(attributeDescs)); + return offset.Union(); +} + +flatbuffers::Offset SerializeConstantNodeDesc( + flatbuffers::FlatBufferBuilder& builder, + uint32_t nodeIndex, + const DmlSerializedGraphNodeConstantVariant& constantNodeDesc) +{ + flatbuffers::Offset offset; + + if (std::holds_alternative(constantNodeDesc)) + { + auto& constantName = std::get(constantNodeDesc); + if (constantName.name.empty()) + { + throw std::invalid_argument("Graph constant node at index:" + std::to_string(nodeIndex) + + " doesn't have the constant data name."); + } + + flatbuffers::Offset constantNameOffset = dml::ir::CreateConstantName( + builder, + builder.CreateString(constantName.name)); + + offset = dml::ir::CreateConstantNodeDesc( + builder, + dml::ir::ConstantNodeDescDetail_ConstantName, + constantNameOffset.Union()); + } + else + { + auto& constantData = std::get(constantNodeDesc); + std::vector rawBytes; + std::transform(constantData.data, constantData.data + constantData.dataSize, + std::back_inserter(rawBytes), [](std::byte b) {return static_cast(b); }); + flatbuffers::Offset constantDataOffset = dml::ir::CreateConstantRawDataDirect( + builder, + &rawBytes); + + offset = dml::ir::CreateConstantNodeDesc( + builder, + dml::ir::ConstantNodeDescDetail_ConstantRawData, + constantDataOffset.Union()); + } + + return offset.Union(); +} + +flatbuffers::Offset SerializeNode( + flatbuffers::FlatBufferBuilder& builder, + const uint32_t nodeIndex, + const DmlSerializedGraphNode& graphNode, + const std::vector>& nodeInputNames, + const std::vector>& nodeOutputNames) +{ + if (graphNode.Name.empty()) + { + throw std::invalid_argument("Graph node at index:" + std::to_string(nodeIndex) + + " does not have any name."); + } + + flatbuffers::Offset offset; + if (std::holds_alternative(graphNode.Desc)) + { + auto& operatorNode = std::get(graphNode.Desc); + offset = dml::ir::CreateDmlGraphNode( + builder, + dml::ir::NodeDesc_OperatorNodeDesc, + SerializeOperatorNodeDesc(builder, operatorNode), + builder.CreateString(graphNode.Name), + builder.CreateVector(nodeInputNames), + builder.CreateVector(nodeOutputNames)); + } + else + { + auto& constantNodeVariant = std::get(graphNode.Desc); + offset = dml::ir::CreateDmlGraphNode( + builder, + dml::ir::NodeDesc_ConstantNodeDesc, + SerializeConstantNodeDesc(builder, nodeIndex, constantNodeVariant), + builder.CreateString(graphNode.Name), + builder.CreateVector(nodeInputNames), + builder.CreateVector(nodeOutputNames)); + } + return offset; +} + +/* +* validates input/output edges and throws exception if an edge +* does not have a name or if an edge has more than 1 names. +*/ +template +std::unordered_map> ConvertToEdgeIndexToNameMap( + const std::vector& edges, + flatbuffers::FlatBufferBuilder& builder) +{ + std::unordered_map> edgeIndexToNameMap; + for (auto& edge : edges) + { + uint32_t index; + if constexpr (std::is_same_v) + { + index = edge.GraphInputIndex; + } + else if constexpr (std::is_same_v) + { + index = edge.GraphOutputIndex; + } + + if (edge.Name.empty()) + { + throw std::invalid_argument("Graph input or output edge at index " + std::to_string(index) + " does not have name."); + } + + if (edgeIndexToNameMap.find(index) != edgeIndexToNameMap.end()) + { + flatbuffers::String* edgeName = ReadAs( + builder.GetCurrentBufferPointer(), + builder.GetSize() - edgeIndexToNameMap[index].o); + if (edge.Name != edgeName->str()) + { + throw std::invalid_argument("Graph input or output edge at index " + std::to_string(index) + " has more than 1 names."); + } + } + + edgeIndexToNameMap[index] = builder.CreateString(edge.Name); + } + return edgeIndexToNameMap; // NRVO will automatically move it. no need to use std::move +} + +void PopulateNonConstantNodeInputOutputCount( + const std::vector& nodes, + /*out*/ std::vector& nodeInputCounts, + /*out*/ std::vector& nodeOutputCounts) +{ + for (uint32_t nodeIndex = 0; nodeIndex < static_cast(nodes.size()); nodeIndex++) + { + auto& node = nodes[nodeIndex]; + if (std::holds_alternative(node.Desc)) + { + auto& operatorNode = std::get(node.Desc); + nodeInputCounts[nodeIndex] = std::max( + nodeInputCounts[nodeIndex], + static_cast(operatorNode.GetInputTensors().size())); + + nodeOutputCounts[nodeIndex] = std::max( + nodeOutputCounts[nodeIndex], + static_cast(operatorNode.GetOutputTensors().size())); + } + } +} + +void PopulateConstantNodeInputOutputCount( + const std::vector& edges, + /*out*/std::vector& maxInputIndexForNodes, + /*out*/std::vector& maxOutputIndexForNodes) +{ + for (auto& edge : edges) + { + maxInputIndexForNodes[edge.ToNodeIndex] = std::max(maxInputIndexForNodes[edge.ToNodeIndex], edge.ToNodeInputIndex + 1); + maxOutputIndexForNodes[edge.FromNodeIndex] = std::max(maxOutputIndexForNodes[edge.FromNodeIndex], edge.FromNodeOutputIndex + 1); + } +} + +/* +* validates intermediate edge and throws exception if an edge +* does not have a name or if an edge has more than 1 names. +*/ +void PopulateNodeInputOutputNames( + flatbuffers::FlatBufferBuilder& builder, + const DmlSerializedGraphDesc& graphDesc, + const std::unordered_map>& graphInputIndexToNameMap, + const std::unordered_map>& graphOutputIndexToNameMap, + /*out*/std::vector>>& nodeToInputNames, + /*out*/std::vector>>& nodeToOutputNames) +{ + for (auto& edge : graphDesc.InputEdges) + { + nodeToInputNames[edge.ToNodeIndex][edge.ToNodeInputIndex] = graphInputIndexToNameMap.at(edge.GraphInputIndex); + } + + for (auto& edge : graphDesc.OutputEdges) + { + nodeToOutputNames[edge.FromNodeIndex][edge.FromNodeOutputIndex] = graphOutputIndexToNameMap.at(edge.GraphOutputIndex); + } + + std::unordered_map>> intermediateEdgeNames; + for (uint32_t edgeIndex = 0; edgeIndex < static_cast(graphDesc.IntermediateEdges.size()); edgeIndex++) + { + auto& edge = graphDesc.IntermediateEdges[edgeIndex]; + if (edge.Name.empty()) + { + throw std::invalid_argument( + "Graph intermediate edge from nodeIndex:" + std::to_string(edge.FromNodeIndex) + + " & nodeOutputIndex:" + std::to_string(edge.FromNodeOutputIndex) + " doesn't have name."); + } + + if (intermediateEdgeNames.find(edge.FromNodeIndex) != intermediateEdgeNames.end() && + intermediateEdgeNames[edge.FromNodeIndex].find(edge.FromNodeOutputIndex) != intermediateEdgeNames[edge.FromNodeIndex].end()) + { + flatbuffers::Offset edgeNameOffset = intermediateEdgeNames[edge.FromNodeIndex][edge.FromNodeOutputIndex]; + flatbuffers::String* edgeName = ReadAs( + builder.GetCurrentBufferPointer(), + builder.GetSize() - edgeNameOffset.o); + + if (edgeName->str() != edge.Name) + { + throw std::invalid_argument( + "Graph intermediate edge from nodeIndex:" + std::to_string(edge.FromNodeIndex) + + " & nodeOutputIndex:" + std::to_string(edge.FromNodeOutputIndex) + " has more than 1 names."); + } + } + else + { + intermediateEdgeNames[edge.FromNodeIndex][edge.FromNodeOutputIndex] = builder.CreateString(edge.Name.c_str()); + } + nodeToInputNames[edge.ToNodeIndex][edge.ToNodeInputIndex] = intermediateEdgeNames[edge.FromNodeIndex][edge.FromNodeOutputIndex]; + nodeToOutputNames[edge.FromNodeIndex][edge.FromNodeOutputIndex] = intermediateEdgeNames[edge.FromNodeIndex][edge.FromNodeOutputIndex]; + } +} + + +/* +* - If an edge is connected to multiple nodes, then there will be multiple instances +* of input or intermediate edges, all with the same name. +* - The input will be validated incrementally throughout the execution +* of the method. +* - Handling of empty optional input/output/attibute for non-constant node: +* input/output +* - and will have an null entry +* but the actual OperatorNodeDesc variant's +* and will not have any entry. +* attribute +* - will have null entry +*/ +flatbuffers::DetachedBuffer SerializeDmlGraph(const DmlSerializedGraphDesc& graphDesc) +{ + + flatbuffers::FlatBufferBuilder builder(1024); + if (graphDesc.Nodes.empty()) + { + return builder.Release(); + } + + // create input/output edge index to name map + std::unordered_map> graphInputIndexToNameMap = + ConvertToEdgeIndexToNameMap(graphDesc.InputEdges, builder); + std::unordered_map> graphOutputIndexToNameMap = + ConvertToEdgeIndexToNameMap(graphDesc.OutputEdges, builder); + + /* + * - Calculate number of input/output for each operator to allocate + * appropriate amount of memory for each node to store input/output names. + * - Non-constant node's input/output count can be determined by the + * AbstractOperatorDesc. + * - Constant node will only have outgoing edges and those outgoing edges + * will be intermediate edges. + */ + std::vector nodeInputCounts(graphDesc.Nodes.size(), 0); + std::vector nodeOutputCounts(graphDesc.Nodes.size(), 0); + PopulateNonConstantNodeInputOutputCount(graphDesc.Nodes, nodeInputCounts, nodeOutputCounts); + PopulateConstantNodeInputOutputCount(graphDesc.IntermediateEdges, nodeInputCounts, nodeOutputCounts); + + // populate node input/output names. + std::vector>> nodeToInputNames(graphDesc.Nodes.size()); + std::vector>> nodeToOutputNames(graphDesc.Nodes.size()); + for (uint32_t nodeIndex = 0; nodeIndex < static_cast(graphDesc.Nodes.size()); nodeIndex++) + { + nodeToInputNames[nodeIndex].assign(nodeInputCounts[nodeIndex], builder.CreateString(nullptr, 0)); + nodeToOutputNames[nodeIndex].assign(nodeOutputCounts[nodeIndex], builder.CreateString(nullptr, 0)); + } + PopulateNodeInputOutputNames(builder, graphDesc, graphInputIndexToNameMap, graphOutputIndexToNameMap, nodeToInputNames, nodeToOutputNames); + + // Create flatbuffer node objects + std::vector> nodes(graphDesc.Nodes.size()); + for (uint32_t nodeIndex = 0; nodeIndex < static_cast(graphDesc.Nodes.size()); nodeIndex++) + { + nodes[nodeIndex] = SerializeNode( + builder, + nodeIndex, + graphDesc.Nodes[nodeIndex], + nodeToInputNames[nodeIndex], + nodeToOutputNames[nodeIndex]); + } + + // Convert to std::vector to create the object. + std::vector> graphInputNames(graphDesc.InputCount, builder.CreateString(nullptr, 0)); + std::vector> graphOutputNames(graphDesc.OutputCount, builder.CreateString(nullptr, 0)); + for (const auto& [key, value] : graphInputIndexToNameMap) + { + graphInputNames[key] = value; + } + for (const auto& [key, value] : graphOutputIndexToNameMap) + { + graphOutputNames[key] = value; + } + + flatbuffers::Offset dmlGraphDescOffset = dml::ir::CreateDmlGraphDescDirect( + builder, + &nodes, + &graphInputNames, + &graphOutputNames); + builder.Finish(dmlGraphDescOffset); + return builder.Release(); +} diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlRuntimeFusedGraphKernel.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlRuntimeFusedGraphKernel.cpp index 5c7b7bff1e370..0f0d445a95bae 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlRuntimeFusedGraphKernel.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlRuntimeFusedGraphKernel.cpp @@ -180,32 +180,50 @@ namespace Dml // Convert partitionONNXGraph into DML EP GraphDesc ComPtr device; ORT_THROW_IF_FAILED(providerImpl->GetDmlDevice(device.GetAddressOf())); + // This map will be used to transfer the initializer to D3D12 system heap memory. + // 'serializedDmlGraphDesc' will have constant input as intermediate edges, that's why + // we need a mapping between intermediateEdgeIndex and indexedSubGraph's (a given partition) + // input arg index. + // For ex: Let's say intermediate edge index = idx, then + // indexedSubGraphInputArgIdx = constantEdgeIdxToSubgraphInputArgIdxMap[idx]; + // corresponding constant tensor = initializerNameToInitializerMap[indexedSubGraph.GetMetaDef()->inputs[indexedSubGraphInputArgIdx]] + // We are using intermediate edge index as a key because same constant tensor can be used by + // multiple nodes. + std::unordered_map serializedGraphInputIndexToSubgraphInputIndex; + std::unordered_map serializedGraphLargeConstantNameToSubgraphInputIndex; + std::vector> smallConstantData; GraphDescBuilder::GraphDesc graphDesc = GraphDescBuilder::BuildGraphDesc( isInputsUploadedByDmlEP.data(), isInputsUploadedByDmlEP.size(), m_isInitializerTransferable, m_partitionNodePropsMap, - device.Get(), providerImpl, m_modelPath, m_subgraphNodePointers, m_subgraphInputs, - m_subgraphOutputs); + m_subgraphOutputs, + serializedGraphInputIndexToSubgraphInputIndex, + serializedGraphLargeConstantNameToSubgraphInputIndex, + smallConstantData); m_outputShapes = graphDesc.outputShapes; // Walk through each graph edge and mark used inputs m_inputsUsed.resize(fusedNodeInputCount, false); - for (const DML_INPUT_GRAPH_EDGE_DESC& edge : graphDesc.inputEdges) - { - m_inputsUsed[edge.GraphInputIndex] = true; + for (auto it = serializedGraphInputIndexToSubgraphInputIndex.begin(); it != serializedGraphInputIndexToSubgraphInputIndex.end(); it++) { + m_inputsUsed[it->second] = true; + } + for (auto it = serializedGraphLargeConstantNameToSubgraphInputIndex.begin(); it != serializedGraphLargeConstantNameToSubgraphInputIndex.end(); it++) { + m_inputsUsed[it->second] = true; } // Compile the operator m_compiledExecutionPlanOperator = DmlGraphFusionHelper::TryCreateCompiledOperator( graphDesc, *m_indexedSubGraph, - providerImpl); + providerImpl, + &serializedGraphInputIndexToSubgraphInputIndex, + &serializedGraphLargeConstantNameToSubgraphInputIndex); // Queue references to objects which must be kept alive until resulting GPU work completes m_winmlProvider->QueueReference(m_compiledExecutionPlanOperator.Get()); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h index a5415ba85f3d3..e1e7eacfbd85d 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/ApiTraits.h @@ -24,8 +24,8 @@ struct EnumTraits template <> struct EnumTraits { - static constexpr auto ValueCount = 161; - static constexpr size_t ActivationFunctionCount = 24; + static constexpr auto ValueCount = 168; + static constexpr size_t ActivationFunctionCount = 26; }; template <> @@ -62,7 +62,7 @@ struct EnumTraits template <> struct EnumTraits { - static constexpr auto ValueCount = 4; + static constexpr auto ValueCount = 5; }; template <> @@ -86,7 +86,7 @@ struct EnumTraits template <> struct EnumTraits { - static constexpr auto ValueCount = 8; + static constexpr auto ValueCount = 13; }; template <> @@ -119,6 +119,12 @@ struct EnumTraits static constexpr auto ValueCount = 1; }; +template <> +struct EnumTraits +{ + static constexpr auto ValueCount = 5; +}; + template constexpr auto EnumValueCount = EnumTraits::ValueCount; @@ -495,12 +501,6 @@ struct OperatorDescTraits static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_ROI_POOLING; }; -template <> -struct OperatorDescTraits -{ - static constexpr DML_OPERATOR_TYPE Type = (DML_OPERATOR_TYPE) DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING; -}; - template <> struct OperatorDescTraits { @@ -1029,6 +1029,24 @@ struct OperatorDescTraits static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_DIAGONAL_MATRIX1; }; +template <> +struct OperatorDescTraits +{ + static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_MULTIHEAD_ATTENTION; +}; + +template <> +struct OperatorDescTraits +{ + static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING; +}; + +template <> +struct OperatorDescTraits +{ + static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT; +}; + template <> struct OperatorDescTraits { @@ -1174,9 +1192,15 @@ struct OperatorDescTraits }; template <> -struct OperatorDescTraits +struct OperatorDescTraits { - static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_MULTIHEAD_ATTENTION; + static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_ACTIVATION_SWISH; +}; + +template <> +struct OperatorDescTraits +{ + static constexpr DML_OPERATOR_TYPE Type = DML_OPERATOR_ACTIVATION_HARD_SWISH; }; template @@ -1502,12 +1526,6 @@ struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_ROI_POOLING> using DescType = DML_ROI_POOLING_OPERATOR_DESC; }; -template <> -struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING> -{ - using DescType = DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_DESC; -}; - template <> struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_SLICE> { @@ -2036,6 +2054,24 @@ struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_DIAGONAL_MATRIX1> using DescType = DML_DIAGONAL_MATRIX1_OPERATOR_DESC; }; +template <> +struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_MULTIHEAD_ATTENTION> +{ + using DescType = DML_MULTIHEAD_ATTENTION_OPERATOR_DESC; +}; + +template <> +struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING> +{ + using DescType = DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_DESC; +}; + +template <> +struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT> +{ + using DescType = DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_DESC; +}; + template <> struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_ACTIVATION_ELU> { @@ -2181,14 +2217,20 @@ struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_ACTIVATION_GELU> }; template <> -struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_MULTIHEAD_ATTENTION> +struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_ACTIVATION_SWISH> { - using DescType = DML_MULTIHEAD_ATTENTION_OPERATOR_DESC; + using DescType = DML_ACTIVATION_SWISH_OPERATOR_DESC; +}; + +template <> +struct OperatorTypeTraits<(DML_OPERATOR_TYPE)DML_OPERATOR_ACTIVATION_HARD_SWISH> +{ + using DescType = DML_ACTIVATION_HARD_SWISH_OPERATOR_DESC; }; // Calls a visitor functor, supplying an empty operator desc corresponding to the given DML_OPERATOR_TYPE as // the first argument. -// +// // For example: // Visit(DML_OPERATOR_ELEMENT_WISE_IDENTITY, [](auto tag) { // using T = decltype(tag); // T is one of the DML_*_OPERATOR_DESC structs @@ -2485,6 +2527,10 @@ auto OperatorTypeVisitor(DML_OPERATOR_TYPE type, Visitor&& visitor, Ts&&... args return std::invoke(std::forward(visitor), DML_DIAGONAL_MATRIX1_OPERATOR_DESC{}, std::forward(args)...); case DML_OPERATOR_MULTIHEAD_ATTENTION: return std::invoke(std::forward(visitor), DML_MULTIHEAD_ATTENTION_OPERATOR_DESC{}, std::forward(args)...); + case DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING: + return std::invoke(std::forward(visitor), DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_DESC{}, std::forward(args)...); + case DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT: + return std::invoke(std::forward(visitor), DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_DESC{}, std::forward(args)...); case DML_OPERATOR_ACTIVATION_ELU: return std::invoke(std::forward(visitor), DML_ACTIVATION_ELU_OPERATOR_DESC{}, std::forward(args)...); case DML_OPERATOR_ACTIVATION_CELU: @@ -2533,13 +2579,10 @@ auto OperatorTypeVisitor(DML_OPERATOR_TYPE type, Visitor&& visitor, Ts&&... args return std::invoke(std::forward(visitor), DML_ACTIVATION_SHRINK_OPERATOR_DESC{}, std::forward(args)...); case DML_OPERATOR_ACTIVATION_GELU: return std::invoke(std::forward(visitor), DML_ACTIVATION_GELU_OPERATOR_DESC{}, std::forward(args)...); - -#pragma warning(push) -#pragma warning(disable: 4063) - case DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING: - return std::invoke(std::forward(visitor), DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_DESC{}, std::forward(args)...); -#pragma warning(pop) - + case DML_OPERATOR_ACTIVATION_SWISH: + return std::invoke(std::forward(visitor), DML_ACTIVATION_SWISH_OPERATOR_DESC{}, std::forward(args)...); + case DML_OPERATOR_ACTIVATION_HARD_SWISH: + return std::invoke(std::forward(visitor), DML_ACTIVATION_HARD_SWISH_OPERATOR_DESC{}, std::forward(args)...); default: ORT_THROW_HR(E_INVALIDARG); return std::invoke(std::forward(visitor), DML_ACTIVATION_RELU_OPERATOR_DESC{}, std::forward(args)...); @@ -2547,7 +2590,55 @@ auto OperatorTypeVisitor(DML_OPERATOR_TYPE type, Visitor&& visitor, Ts&&... args } #pragma warning(pop) +namespace StringifyHelpers +{ +template +inline gsl::czstring ToString(T value) +{ +#ifndef WAI_BUILD_LINUX + // Clang will instantiate this template even if it isn't used, + // so this static_assert will always fire and break the build. + static_assert(false, "Not implemented for this type"); +#endif +} + +template <> +inline gsl::czstring ToString(DML_TENSOR_DATA_TYPE value) +{ + switch (value) + { + case DML_TENSOR_DATA_TYPE_UNKNOWN: return "DML_TENSOR_DATA_TYPE_UNKNOWN"; + case DML_TENSOR_DATA_TYPE_FLOAT32: return "DML_TENSOR_DATA_TYPE_FLOAT32"; + case DML_TENSOR_DATA_TYPE_FLOAT16: return "DML_TENSOR_DATA_TYPE_FLOAT16"; + case DML_TENSOR_DATA_TYPE_UINT32: return "DML_TENSOR_DATA_TYPE_UINT32"; + case DML_TENSOR_DATA_TYPE_UINT16: return "DML_TENSOR_DATA_TYPE_UINT16"; + case DML_TENSOR_DATA_TYPE_UINT8: return "DML_TENSOR_DATA_TYPE_UINT8"; + case DML_TENSOR_DATA_TYPE_INT32: return "DML_TENSOR_DATA_TYPE_INT32"; + case DML_TENSOR_DATA_TYPE_INT16: return "DML_TENSOR_DATA_TYPE_INT16"; + case DML_TENSOR_DATA_TYPE_INT8: return "DML_TENSOR_DATA_TYPE_INT8"; + case DML_TENSOR_DATA_TYPE_FLOAT64: return "DML_TENSOR_DATA_TYPE_FLOAT64"; + case DML_TENSOR_DATA_TYPE_UINT64: return "DML_TENSOR_DATA_TYPE_UINT64"; + case DML_TENSOR_DATA_TYPE_INT64: return "DML_TENSOR_DATA_TYPE_INT64"; + default: + assert(false); + return ""; + } +} + +template <> +inline gsl::czstring ToString(DML_TENSOR_TYPE value) +{ + switch (value) + { + case DML_TENSOR_TYPE_INVALID: return "DML_TENSOR_TYPE_INVALID"; + case DML_TENSOR_TYPE_BUFFER: return "DML_TENSOR_TYPE_BUFFER"; + default: + assert(false); + return ""; + } +} +template <> inline gsl::czstring ToString(DML_OPERATOR_TYPE value) { switch (value) @@ -2561,9 +2652,6 @@ inline gsl::czstring ToString(DML_OPERATOR_TYPE value) case DML_OPERATOR_ELEMENT_WISE_ATAN: return "DML_OPERATOR_ELEMENT_WISE_ATAN"; case DML_OPERATOR_ELEMENT_WISE_CEIL: return "DML_OPERATOR_ELEMENT_WISE_CEIL"; case DML_OPERATOR_ELEMENT_WISE_CLIP: return "DML_OPERATOR_ELEMENT_WISE_CLIP"; - case DML_OPERATOR_ELEMENT_WISE_CLIP1: return "DML_OPERATOR_ELEMENT_WISE_CLIP1"; - case DML_OPERATOR_ELEMENT_WISE_CLIP_GRAD: return "DML_OPERATOR_ELEMENT_WISE_CLIP_GRAD"; - case DML_OPERATOR_ELEMENT_WISE_CLIP_GRAD1: return "DML_OPERATOR_ELEMENT_WISE_CLIP_GRAD1"; case DML_OPERATOR_ELEMENT_WISE_COS: return "DML_OPERATOR_ELEMENT_WISE_COS"; case DML_OPERATOR_ELEMENT_WISE_DIVIDE: return "DML_OPERATOR_ELEMENT_WISE_DIVIDE"; case DML_OPERATOR_ELEMENT_WISE_EXP: return "DML_OPERATOR_ELEMENT_WISE_EXP"; @@ -2587,24 +2675,41 @@ inline gsl::czstring ToString(DML_OPERATOR_TYPE value) case DML_OPERATOR_ELEMENT_WISE_RECIP: return "DML_OPERATOR_ELEMENT_WISE_RECIP"; case DML_OPERATOR_ELEMENT_WISE_SIN: return "DML_OPERATOR_ELEMENT_WISE_SIN"; case DML_OPERATOR_ELEMENT_WISE_SQRT: return "DML_OPERATOR_ELEMENT_WISE_SQRT"; - case DML_OPERATOR_ELEMENT_WISE_DIFFERENCE_SQUARE: return "DML_OPERATOR_ELEMENT_WISE_DIFFERENCE_SQUARE"; - case DML_OPERATOR_ELEMENT_WISE_ATAN_YX: return "DML_OPERATOR_ELEMENT_WISE_ATAN_YX"; case DML_OPERATOR_ELEMENT_WISE_SUBTRACT: return "DML_OPERATOR_ELEMENT_WISE_SUBTRACT"; case DML_OPERATOR_ELEMENT_WISE_TAN: return "DML_OPERATOR_ELEMENT_WISE_TAN"; case DML_OPERATOR_ELEMENT_WISE_THRESHOLD: return "DML_OPERATOR_ELEMENT_WISE_THRESHOLD"; case DML_OPERATOR_ELEMENT_WISE_QUANTIZE_LINEAR: return "DML_OPERATOR_ELEMENT_WISE_QUANTIZE_LINEAR"; case DML_OPERATOR_ELEMENT_WISE_DEQUANTIZE_LINEAR: return "DML_OPERATOR_ELEMENT_WISE_DEQUANTIZE_LINEAR"; + case DML_OPERATOR_ACTIVATION_ELU: return "DML_OPERATOR_ACTIVATION_ELU"; + case DML_OPERATOR_ACTIVATION_CELU: return "DML_OPERATOR_ACTIVATION_CELU"; + case DML_OPERATOR_ACTIVATION_HARDMAX: return "DML_OPERATOR_ACTIVATION_HARDMAX"; + case DML_OPERATOR_ACTIVATION_HARDMAX1: return "DML_OPERATOR_ACTIVATION_HARDMAX1"; + case DML_OPERATOR_ACTIVATION_HARD_SIGMOID: return "DML_OPERATOR_ACTIVATION_HARD_SIGMOID"; + case DML_OPERATOR_ACTIVATION_IDENTITY: return "DML_OPERATOR_ACTIVATION_IDENTITY"; + case DML_OPERATOR_ACTIVATION_LEAKY_RELU: return "DML_OPERATOR_ACTIVATION_LEAKY_RELU"; + case DML_OPERATOR_ACTIVATION_LINEAR: return "DML_OPERATOR_ACTIVATION_LINEAR"; + case DML_OPERATOR_ACTIVATION_LOG_SOFTMAX: return "DML_OPERATOR_ACTIVATION_LOG_SOFTMAX"; + case DML_OPERATOR_ACTIVATION_LOG_SOFTMAX1: return "DML_OPERATOR_ACTIVATION_LOG_SOFTMAX1"; + case DML_OPERATOR_ACTIVATION_PARAMETERIZED_RELU: return "DML_OPERATOR_ACTIVATION_PARAMETERIZED_RELU"; + case DML_OPERATOR_ACTIVATION_PARAMETRIC_SOFTPLUS: return "DML_OPERATOR_ACTIVATION_PARAMETRIC_SOFTPLUS"; + case DML_OPERATOR_ACTIVATION_RELU: return "DML_OPERATOR_ACTIVATION_RELU"; + case DML_OPERATOR_ACTIVATION_SCALED_ELU: return "DML_OPERATOR_ACTIVATION_SCALED_ELU"; + case DML_OPERATOR_ACTIVATION_SCALED_TANH: return "DML_OPERATOR_ACTIVATION_SCALED_TANH"; + case DML_OPERATOR_ACTIVATION_SIGMOID: return "DML_OPERATOR_ACTIVATION_SIGMOID"; + case DML_OPERATOR_ACTIVATION_SOFTMAX: return "DML_OPERATOR_ACTIVATION_SOFTMAX"; + case DML_OPERATOR_ACTIVATION_SOFTMAX1: return "DML_OPERATOR_ACTIVATION_SOFTMAX1"; + case DML_OPERATOR_ACTIVATION_SOFTPLUS: return "DML_OPERATOR_ACTIVATION_SOFTPLUS"; + case DML_OPERATOR_ACTIVATION_SOFTSIGN: return "DML_OPERATOR_ACTIVATION_SOFTSIGN"; + case DML_OPERATOR_ACTIVATION_TANH: return "DML_OPERATOR_ACTIVATION_TANH"; + case DML_OPERATOR_ACTIVATION_THRESHOLDED_RELU: return "DML_OPERATOR_ACTIVATION_THRESHOLDED_RELU"; case DML_OPERATOR_CONVOLUTION: return "DML_OPERATOR_CONVOLUTION"; case DML_OPERATOR_GEMM: return "DML_OPERATOR_GEMM"; case DML_OPERATOR_REDUCE: return "DML_OPERATOR_REDUCE"; - case DML_OPERATOR_ARGMIN: return "DML_OPERATOR_ARGMIN"; - case DML_OPERATOR_ARGMAX: return "DML_OPERATOR_ARGMAX"; case DML_OPERATOR_AVERAGE_POOLING: return "DML_OPERATOR_AVERAGE_POOLING"; case DML_OPERATOR_AVERAGE_POOLING1: return "DML_OPERATOR_AVERAGE_POOLING1"; case DML_OPERATOR_LP_POOLING: return "DML_OPERATOR_LP_POOLING"; case DML_OPERATOR_LP_POOLING1: return "DML_OPERATOR_LP_POOLING1"; case DML_OPERATOR_MAX_POOLING: return "DML_OPERATOR_MAX_POOLING"; - case DML_OPERATOR_MAX_POOLING1: return "DML_OPERATOR_MAX_POOLING1"; case DML_OPERATOR_ROI_POOLING: return "DML_OPERATOR_ROI_POOLING"; case DML_OPERATOR_SLICE: return "DML_OPERATOR_SLICE"; case DML_OPERATOR_CAST: return "DML_OPERATOR_CAST"; @@ -2620,18 +2725,15 @@ inline gsl::czstring ToString(DML_OPERATOR_TYPE value) case DML_OPERATOR_TILE: return "DML_OPERATOR_TILE"; case DML_OPERATOR_TOP_K: return "DML_OPERATOR_TOP_K"; case DML_OPERATOR_BATCH_NORMALIZATION: return "DML_OPERATOR_BATCH_NORMALIZATION"; - case DML_OPERATOR_BATCH_NORMALIZATION_GRAD: return "DML_OPERATOR_BATCH_NORMALIZATION_GRAD"; - case DML_OPERATOR_BATCH_NORMALIZATION_TRAINING_GRAD: return "DML_OPERATOR_BATCH_NORMALIZATION_TRAINING_GRAD"; + case DML_OPERATOR_BATCH_NORMALIZATION_TRAINING: return "DML_OPERATOR_BATCH_NORMALIZATION_TRAINING"; case DML_OPERATOR_MEAN_VARIANCE_NORMALIZATION: return "DML_OPERATOR_MEAN_VARIANCE_NORMALIZATION"; case DML_OPERATOR_LOCAL_RESPONSE_NORMALIZATION: return "DML_OPERATOR_LOCAL_RESPONSE_NORMALIZATION"; - case DML_OPERATOR_LOCAL_RESPONSE_NORMALIZATION_GRAD: return "DML_OPERATOR_LOCAL_RESPONSE_NORMALIZATION_GRAD"; case DML_OPERATOR_LP_NORMALIZATION: return "DML_OPERATOR_LP_NORMALIZATION"; case DML_OPERATOR_RNN: return "DML_OPERATOR_RNN"; case DML_OPERATOR_LSTM: return "DML_OPERATOR_LSTM"; case DML_OPERATOR_GRU: return "DML_OPERATOR_GRU"; case DML_OPERATOR_ELEMENT_WISE_SIGN: return "DML_OPERATOR_ELEMENT_WISE_SIGN"; case DML_OPERATOR_ELEMENT_WISE_IS_NAN: return "DML_OPERATOR_ELEMENT_WISE_IS_NAN"; - case DML_OPERATOR_ELEMENT_WISE_NEGATE: return "DML_OPERATOR_ELEMENT_WISE_NEGATE"; case DML_OPERATOR_ELEMENT_WISE_ERF: return "DML_OPERATOR_ELEMENT_WISE_ERF"; case DML_OPERATOR_ELEMENT_WISE_SINH: return "DML_OPERATOR_ELEMENT_WISE_SINH"; case DML_OPERATOR_ELEMENT_WISE_COSH: return "DML_OPERATOR_ELEMENT_WISE_COSH"; @@ -2641,6 +2743,8 @@ inline gsl::czstring ToString(DML_OPERATOR_TYPE value) case DML_OPERATOR_ELEMENT_WISE_ATANH: return "DML_OPERATOR_ELEMENT_WISE_ATANH"; case DML_OPERATOR_ELEMENT_WISE_IF: return "DML_OPERATOR_ELEMENT_WISE_IF"; case DML_OPERATOR_ELEMENT_WISE_ADD1: return "DML_OPERATOR_ELEMENT_WISE_ADD1"; + case DML_OPERATOR_ACTIVATION_SHRINK: return "DML_OPERATOR_ACTIVATION_SHRINK"; + case DML_OPERATOR_MAX_POOLING1: return "DML_OPERATOR_MAX_POOLING1"; case DML_OPERATOR_MAX_UNPOOLING: return "DML_OPERATOR_MAX_UNPOOLING"; case DML_OPERATOR_DIAGONAL_MATRIX: return "DML_OPERATOR_DIAGONAL_MATRIX"; case DML_OPERATOR_SCATTER: return "DML_OPERATOR_SCATTER"; @@ -2652,10 +2756,9 @@ inline gsl::czstring ToString(DML_OPERATOR_TYPE value) case DML_OPERATOR_ELEMENT_WISE_IS_INFINITY: return "DML_OPERATOR_ELEMENT_WISE_IS_INFINITY"; case DML_OPERATOR_ELEMENT_WISE_MODULUS_TRUNCATE: return "DML_OPERATOR_ELEMENT_WISE_MODULUS_TRUNCATE"; case DML_OPERATOR_ELEMENT_WISE_MODULUS_FLOOR: return "DML_OPERATOR_ELEMENT_WISE_MODULUS_FLOOR"; - case DML_OPERATOR_FILL_VALUE_CONSTANT: return "DML_OPERATOR_FILL_VALUE_CONSTANT"; case DML_OPERATOR_FILL_VALUE_SEQUENCE: return "DML_OPERATOR_FILL_VALUE_SEQUENCE"; + case DML_OPERATOR_FILL_VALUE_CONSTANT: return "DML_OPERATOR_FILL_VALUE_CONSTANT"; case DML_OPERATOR_CUMULATIVE_SUMMATION: return "DML_OPERATOR_CUMULATIVE_SUMMATION"; - case DML_OPERATOR_CUMULATIVE_PRODUCT: return "DML_OPERATOR_CUMULATIVE_PRODUCT"; case DML_OPERATOR_REVERSE_SUBSEQUENCES: return "DML_OPERATOR_REVERSE_SUBSEQUENCES"; case DML_OPERATOR_GATHER_ELEMENTS: return "DML_OPERATOR_GATHER_ELEMENTS"; case DML_OPERATOR_GATHER_ND: return "DML_OPERATOR_GATHER_ND"; @@ -2684,20 +2787,278 @@ inline gsl::czstring ToString(DML_OPERATOR_TYPE value) case DML_OPERATOR_RESAMPLE_GRAD: return "DML_OPERATOR_RESAMPLE_GRAD"; case DML_OPERATOR_SLICE_GRAD: return "DML_OPERATOR_SLICE_GRAD"; case DML_OPERATOR_ADAM_OPTIMIZER: return "DML_OPERATOR_ADAM_OPTIMIZER"; + case DML_OPERATOR_ARGMIN: return "DML_OPERATOR_ARGMIN"; + case DML_OPERATOR_ARGMAX: return "DML_OPERATOR_ARGMAX"; case DML_OPERATOR_ROI_ALIGN: return "DML_OPERATOR_ROI_ALIGN"; - case DML_OPERATOR_ROI_ALIGN1: return "DML_OPERATOR_ROI_ALIGN1"; case DML_OPERATOR_GATHER_ND1: return "DML_OPERATOR_GATHER_ND1"; - case DML_OPERATOR_DYNAMIC_QUANTIZE_LINEAR: return "DML_OPERATOR_DYNAMIC_QUANTIZE_LINEAR"; + case DML_OPERATOR_ELEMENT_WISE_ATAN_YX: return "DML_OPERATOR_ELEMENT_WISE_ATAN_YX"; + case DML_OPERATOR_ELEMENT_WISE_CLIP_GRAD: return "DML_OPERATOR_ELEMENT_WISE_CLIP_GRAD"; + case DML_OPERATOR_ELEMENT_WISE_DIFFERENCE_SQUARE: return "DML_OPERATOR_ELEMENT_WISE_DIFFERENCE_SQUARE"; + case DML_OPERATOR_LOCAL_RESPONSE_NORMALIZATION_GRAD: return "DML_OPERATOR_LOCAL_RESPONSE_NORMALIZATION_GRAD"; + case DML_OPERATOR_CUMULATIVE_PRODUCT: return "DML_OPERATOR_CUMULATIVE_PRODUCT"; + case DML_OPERATOR_BATCH_NORMALIZATION_GRAD: return "DML_OPERATOR_BATCH_NORMALIZATION_GRAD"; + case DML_OPERATOR_BATCH_NORMALIZATION_TRAINING_GRAD: return "DML_OPERATOR_BATCH_NORMALIZATION_TRAINING_GRAD"; case DML_OPERATOR_ELEMENT_WISE_QUANTIZED_LINEAR_ADD: return "DML_OPERATOR_ELEMENT_WISE_QUANTIZED_LINEAR_ADD"; - case DML_OPERATOR_ROI_ALIGN_GRAD: return "DML_OPERATOR_ROI_ALIGN_GRAD"; - case DML_OPERATOR_BATCH_NORMALIZATION_TRAINING: return "DML_OPERATOR_BATCH_NORMALIZATION_TRAINING"; + case DML_OPERATOR_DYNAMIC_QUANTIZE_LINEAR: return "DML_OPERATOR_DYNAMIC_QUANTIZE_LINEAR"; + case DML_OPERATOR_ROI_ALIGN1: return "DML_OPERATOR_ROI_ALIGN1"; + case DML_OPERATOR_ELEMENT_WISE_CLIP1: return "DML_OPERATOR_ELEMENT_WISE_CLIP1"; + case DML_OPERATOR_ELEMENT_WISE_CLIP_GRAD1: return "DML_OPERATOR_ELEMENT_WISE_CLIP_GRAD1"; + case DML_OPERATOR_ELEMENT_WISE_NEGATE: return "DML_OPERATOR_ELEMENT_WISE_NEGATE"; + case DML_OPERATOR_ACTIVATION_GELU: return "DML_OPERATOR_ACTIVATION_GELU"; + case DML_OPERATOR_ACTIVATION_SWISH: return "DML_OPERATOR_ACTIVATION_SWISH"; + case DML_OPERATOR_ACTIVATION_HARD_SWISH: return "DML_OPERATOR_ACTIVATION_HARD_SWISH"; case DML_OPERATOR_RESAMPLE2: return "DML_OPERATOR_RESAMPLE2"; case DML_OPERATOR_RESAMPLE_GRAD1: return "DML_OPERATOR_RESAMPLE_GRAD1"; case DML_OPERATOR_DIAGONAL_MATRIX1: return "DML_OPERATOR_DIAGONAL_MATRIX1"; case DML_OPERATOR_MULTIHEAD_ATTENTION: return "DML_OPERATOR_MULTIHEAD_ATTENTION"; + case DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING: return "DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING"; + case DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT: return "DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT"; + default: + assert(false); + return ""; + } +} + +template <> +inline gsl::czstring ToString(DML_BINDING_TYPE value) +{ + switch (value) + { + case DML_BINDING_TYPE_NONE: return "DML_BINDING_TYPE_NONE"; + case DML_BINDING_TYPE_BUFFER: return "DML_BINDING_TYPE_BUFFER"; + case DML_BINDING_TYPE_BUFFER_ARRAY: return "DML_BINDING_TYPE_BUFFER_ARRAY"; + default: + assert(false); + return ""; + } +} + +template <> +inline gsl::czstring ToString(DML_REDUCE_FUNCTION value) +{ + switch (value) + { + case DML_REDUCE_FUNCTION_ARGMAX: return "DML_REDUCE_FUNCTION_ARGMAX"; + case DML_REDUCE_FUNCTION_ARGMIN: return "DML_REDUCE_FUNCTION_ARGMIN"; + case DML_REDUCE_FUNCTION_AVERAGE: return "DML_REDUCE_FUNCTION_AVERAGE"; + case DML_REDUCE_FUNCTION_L1: return "DML_REDUCE_FUNCTION_L1"; + case DML_REDUCE_FUNCTION_L2: return "DML_REDUCE_FUNCTION_L2"; + case DML_REDUCE_FUNCTION_LOG_SUM: return "DML_REDUCE_FUNCTION_LOG_SUM"; + case DML_REDUCE_FUNCTION_LOG_SUM_EXP: return "DML_REDUCE_FUNCTION_LOG_SUM_EXP"; + case DML_REDUCE_FUNCTION_MAX: return "DML_REDUCE_FUNCTION_MAX"; + case DML_REDUCE_FUNCTION_MIN: return "DML_REDUCE_FUNCTION_MIN"; + case DML_REDUCE_FUNCTION_MULTIPLY: return "DML_REDUCE_FUNCTION_MULTIPLY"; + case DML_REDUCE_FUNCTION_SUM: return "DML_REDUCE_FUNCTION_SUM"; + case DML_REDUCE_FUNCTION_SUM_SQUARE: return "DML_REDUCE_FUNCTION_SUM_SQUARE"; default: assert(false); return ""; } } + +template <> +inline gsl::czstring ToString(DML_MATRIX_TRANSFORM value) +{ + switch (value) + { + case DML_MATRIX_TRANSFORM_NONE: return "DML_MATRIX_TRANSFORM_NONE"; + case DML_MATRIX_TRANSFORM_TRANSPOSE: return "DML_MATRIX_TRANSFORM_TRANSPOSE"; + default: + assert(false); + return ""; + } +} + +template <> +inline gsl::czstring ToString(DML_CONVOLUTION_MODE value) +{ + switch (value) + { + case DML_CONVOLUTION_MODE_CONVOLUTION: return "DML_CONVOLUTION_MODE_CONVOLUTION"; + case DML_CONVOLUTION_MODE_CROSS_CORRELATION: return "DML_CONVOLUTION_MODE_CROSS_CORRELATION"; + default: + assert(false); + return ""; + } +} + +template <> +inline gsl::czstring ToString(DML_CONVOLUTION_DIRECTION value) +{ + switch (value) + { + case DML_CONVOLUTION_DIRECTION_FORWARD: return "DML_CONVOLUTION_DIRECTION_FORWARD"; + case DML_CONVOLUTION_DIRECTION_BACKWARD: return "DML_CONVOLUTION_DIRECTION_BACKWARD"; + default: + assert(false); + return ""; + } +} + +template <> +inline gsl::czstring ToString(DML_PADDING_MODE value) +{ + switch (value) + { + case DML_PADDING_MODE_CONSTANT: return "DML_PADDING_MODE_CONSTANT"; + case DML_PADDING_MODE_EDGE: return "DML_PADDING_MODE_EDGE"; + case DML_PADDING_MODE_REFLECTION: return "DML_PADDING_MODE_REFLECTION"; + case DML_PADDING_MODE_SYMMETRIC: return "DML_PADDING_MODE_SYMMETRIC"; + default: + assert(false); + return ""; + } +} + +template <> +inline gsl::czstring ToString(DML_INTERPOLATION_MODE value) +{ + switch (value) + { + case DML_INTERPOLATION_MODE_NEAREST_NEIGHBOR: return "DML_INTERPOLATION_MODE_NEAREST_NEIGHBOR"; + case DML_INTERPOLATION_MODE_LINEAR: return "DML_INTERPOLATION_MODE_LINEAR"; + default: + assert(false); + return ""; + } +} + +template <> +inline gsl::czstring ToString(DML_RECURRENT_NETWORK_DIRECTION value) +{ + switch (value) + { + case DML_RECURRENT_NETWORK_DIRECTION_FORWARD: return "DML_RECURRENT_NETWORK_DIRECTION_FORWARD"; + case DML_RECURRENT_NETWORK_DIRECTION_BACKWARD: return "DML_RECURRENT_NETWORK_DIRECTION_BACKWARD"; + case DML_RECURRENT_NETWORK_DIRECTION_BIDIRECTIONAL: return "DML_RECURRENT_NETWORK_DIRECTION_BIDIRECTIONAL"; + default: + assert(false); + return ""; + } +} + +template <> +inline gsl::czstring ToString(DML_FEATURE value) +{ + switch (value) + { + case DML_FEATURE_TENSOR_DATA_TYPE_SUPPORT: return "DML_FEATURE_TENSOR_DATA_TYPE_SUPPORT"; + case DML_FEATURE_FEATURE_LEVELS: return "DML_FEATURE_FEATURE_LEVELS"; + default: + assert(false); + return ""; + } +} + +template <> +inline gsl::czstring ToString(DML_FEATURE_LEVEL value) +{ + switch (value) + { + case DML_FEATURE_LEVEL_1_0: return "DML_FEATURE_LEVEL_1_0"; + case DML_FEATURE_LEVEL_2_0: return "DML_FEATURE_LEVEL_2_0"; + case DML_FEATURE_LEVEL_2_1: return "DML_FEATURE_LEVEL_2_1"; + case DML_FEATURE_LEVEL_3_0: return "DML_FEATURE_LEVEL_3_0"; + case DML_FEATURE_LEVEL_3_1: return "DML_FEATURE_LEVEL_3_1"; + case DML_FEATURE_LEVEL_4_0: return "DML_FEATURE_LEVEL_4_0"; + case DML_FEATURE_LEVEL_4_1: return "DML_FEATURE_LEVEL_4_1"; + case DML_FEATURE_LEVEL_5_0: return "DML_FEATURE_LEVEL_5_0"; + case DML_FEATURE_LEVEL_5_1: return "DML_FEATURE_LEVEL_5_1"; + case DML_FEATURE_LEVEL_5_2: return "DML_FEATURE_LEVEL_5_2"; + case DML_FEATURE_LEVEL_6_0: return "DML_FEATURE_LEVEL_6_0"; + case DML_FEATURE_LEVEL_6_1: return "DML_FEATURE_LEVEL_6_1"; + case DML_FEATURE_LEVEL_6_2: return "DML_FEATURE_LEVEL_6_2"; + default: + assert(false); + return ""; + } +} + +template <> +inline gsl::czstring ToString(DML_IS_INFINITY_MODE value) +{ + switch (value) + { + case DML_IS_INFINITY_MODE_EITHER: return "DML_IS_INFINITY_MODE_EITHER"; + case DML_IS_INFINITY_MODE_POSITIVE: return "DML_IS_INFINITY_MODE_POSITIVE"; + case DML_IS_INFINITY_MODE_NEGATIVE: return "DML_IS_INFINITY_MODE_NEGATIVE"; + default: + assert(false); + return ""; + } +} + +template <> +inline gsl::czstring ToString(DML_DEPTH_SPACE_ORDER value) +{ + switch (value) + { + case DML_DEPTH_SPACE_ORDER_DEPTH_COLUMN_ROW: return "DML_DEPTH_SPACE_ORDER_DEPTH_COLUMN_ROW"; + case DML_DEPTH_SPACE_ORDER_COLUMN_ROW_DEPTH: return "DML_DEPTH_SPACE_ORDER_COLUMN_ROW_DEPTH"; + default: + assert(false); + return ""; + } +} + +template <> +inline gsl::czstring ToString(DML_AXIS_DIRECTION value) +{ + switch (value) + { + case DML_AXIS_DIRECTION_INCREASING: return "DML_AXIS_DIRECTION_INCREASING"; + case DML_AXIS_DIRECTION_DECREASING: return "DML_AXIS_DIRECTION_DECREASING"; + default: + assert(false); + return ""; + } +} + +template <> +inline gsl::czstring ToString(DML_ROUNDING_MODE value) +{ + switch (value) + { + case DML_ROUNDING_MODE_HALVES_TO_NEAREST_EVEN: return "DML_ROUNDING_MODE_HALVES_TO_NEAREST_EVEN"; + case DML_ROUNDING_MODE_TOWARD_ZERO: return "DML_ROUNDING_MODE_TOWARD_ZERO"; + case DML_ROUNDING_MODE_TOWARD_INFINITY: return "DML_ROUNDING_MODE_TOWARD_INFINITY"; + default: + assert(false); + return ""; + } +} + +template <> +inline gsl::czstring ToString(DML_RANDOM_GENERATOR_TYPE value) +{ + switch (value) + { + case DML_RANDOM_GENERATOR_TYPE_PHILOX_4X32_10: return "DML_RANDOM_GENERATOR_TYPE_PHILOX_4X32_10"; + default: + assert(false); + return ""; + } +} + +template <> +inline gsl::czstring ToString(DML_MULTIHEAD_ATTENTION_MASK_TYPE value) +{ + switch (value) + { + case DML_MULTIHEAD_ATTENTION_MASK_TYPE_NONE: return "DML_MULTIHEAD_ATTENTION_MASK_TYPE_NONE"; + case DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_SEQUENCE_LENGTH: return "DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_SEQUENCE_LENGTH"; + case DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_SEQUENCE_END_START: return "DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_SEQUENCE_END_START"; + case DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_QUERY_SEQUENCE_LENGTH_START_END: return "DML_MULTIHEAD_ATTENTION_MASK_TYPE_KEY_QUERY_SEQUENCE_LENGTH_START_END"; + case DML_MULTIHEAD_ATTENTION_MASK_TYPE_BOOLEAN: return "DML_MULTIHEAD_ATTENTION_MASK_TYPE_BOOLEAN"; + default: + assert(false); + return ""; + } +} + + +template +T FromString(std::string_view value); + +} } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h index 2a82c12872a72..5fe6603c2a0bf 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h @@ -618,7 +618,7 @@ constexpr DML_OPERATOR_SCHEMA DML_ELEMENT_WISE_THRESHOLD_OPERATOR_SCHEMA { constexpr DML_SCHEMA_FIELD DML_ELEMENT_WISE_QUANTIZE_LINEAR_OPERATOR_SCHEMA_FIELDS[4] { DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "ScaleTensor", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "ZeroPointTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "ZeroPointTensor", true }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, }; @@ -633,7 +633,7 @@ constexpr DML_OPERATOR_SCHEMA DML_ELEMENT_WISE_QUANTIZE_LINEAR_OPERATOR_SCHEMA { constexpr DML_SCHEMA_FIELD DML_ELEMENT_WISE_DEQUANTIZE_LINEAR_OPERATOR_SCHEMA_FIELDS[4] { DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "ScaleTensor", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "ZeroPointTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "ZeroPointTensor", true }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, }; @@ -869,31 +869,6 @@ constexpr DML_OPERATOR_SCHEMA DML_ROI_POOLING_OPERATOR_SCHEMA { DML_ROI_POOLING_OPERATOR_SCHEMA_FIELDS, }; - -constexpr DML_SCHEMA_FIELD DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA_FIELDS[13] { - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputScaleTensor", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputZeroPointTensor", true }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputScaleTensor", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputZeroPointTensor", true }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT, "DimensionCount", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "Strides", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "WindowSize", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "StartPadding", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "EndPadding", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "Dilations", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT, "IncludePadding", false }, -}; - -constexpr DML_OPERATOR_SCHEMA DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA { - "DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING", - static_cast(DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING), - DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE, - 13, - DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA_FIELDS, -}; - constexpr DML_SCHEMA_FIELD DML_SLICE_OPERATOR_SCHEMA_FIELDS[6] { DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, @@ -1146,7 +1121,7 @@ constexpr DML_SCHEMA_FIELD DML_BATCH_NORMALIZATION_TRAINING_GRAD_OPERATOR_SCHEMA DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputGradientTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputScaleGradientTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputBiasGradientTensor", false }, - DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_FLOAT, "Epsilon", true }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_FLOAT, "Epsilon", false }, }; constexpr DML_OPERATOR_SCHEMA DML_BATCH_NORMALIZATION_TRAINING_GRAD_OPERATOR_SCHEMA { @@ -2312,7 +2287,7 @@ constexpr DML_OPERATOR_SCHEMA DML_BATCH_NORMALIZATION_TRAINING_OPERATOR_SCHEMA { DML_BATCH_NORMALIZATION_TRAINING_OPERATOR_SCHEMA_FIELDS, }; -constexpr DML_SCHEMA_FIELD DML_RESAMPLE2_OPERATOR_SCHEMA_FIELDS[8]{ +constexpr DML_SCHEMA_FIELD DML_RESAMPLE2_OPERATOR_SCHEMA_FIELDS[8] { DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT, "InterpolationMode", false }, @@ -2323,7 +2298,7 @@ constexpr DML_SCHEMA_FIELD DML_RESAMPLE2_OPERATOR_SCHEMA_FIELDS[8]{ DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_FLOAT_ARRAY, "OutputPixelOffsets", false }, }; -constexpr DML_OPERATOR_SCHEMA DML_RESAMPLE2_OPERATOR_SCHEMA{ +constexpr DML_OPERATOR_SCHEMA DML_RESAMPLE2_OPERATOR_SCHEMA { "DML_OPERATOR_RESAMPLE2", DML_OPERATOR_RESAMPLE2, DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE, @@ -2342,7 +2317,7 @@ constexpr DML_SCHEMA_FIELD DML_RESAMPLE_GRAD1_OPERATOR_SCHEMA_FIELDS[8]{ DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_FLOAT_ARRAY, "OutputPixelOffsets", false }, }; -constexpr DML_OPERATOR_SCHEMA DML_RESAMPLE_GRAD1_OPERATOR_SCHEMA{ +constexpr DML_OPERATOR_SCHEMA DML_RESAMPLE_GRAD1_OPERATOR_SCHEMA { "DML_OPERATOR_RESAMPLE_GRAD1", DML_OPERATOR_RESAMPLE_GRAD1, DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE, @@ -2350,7 +2325,7 @@ constexpr DML_OPERATOR_SCHEMA DML_RESAMPLE_GRAD1_OPERATOR_SCHEMA{ DML_RESAMPLE_GRAD1_OPERATOR_SCHEMA_FIELDS, }; -constexpr DML_SCHEMA_FIELD DML_DIAGONAL_MATRIX1_OPERATOR_SCHEMA_FIELDS[6]{ +constexpr DML_SCHEMA_FIELD DML_DIAGONAL_MATRIX1_OPERATOR_SCHEMA_FIELDS[6] { DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", true }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT, "ValueDataType", false }, @@ -2359,7 +2334,7 @@ constexpr DML_SCHEMA_FIELD DML_DIAGONAL_MATRIX1_OPERATOR_SCHEMA_FIELDS[6]{ DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_INT, "DiagonalFillEnd", false }, }; -constexpr DML_OPERATOR_SCHEMA DML_DIAGONAL_MATRIX1_OPERATOR_SCHEMA{ +constexpr DML_OPERATOR_SCHEMA DML_DIAGONAL_MATRIX1_OPERATOR_SCHEMA { "DML_OPERATOR_DIAGONAL_MATRIX1", DML_OPERATOR_DIAGONAL_MATRIX1, DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE, @@ -2396,6 +2371,48 @@ constexpr DML_OPERATOR_SCHEMA DML_MULTIHEAD_ATTENTION_OPERATOR_SCHEMA { DML_MULTIHEAD_ATTENTION_OPERATOR_SCHEMA_FIELDS, }; +constexpr DML_SCHEMA_FIELD DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA_FIELDS[13] { + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputScaleTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputZeroPointTensor", true }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputScaleTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputZeroPointTensor", true }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT, "DimensionCount", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "Strides", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "WindowSize", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "StartPadding", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "EndPadding", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT_ARRAY, "Dilations", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_UINT, "IncludePadding", false }, +}; + +constexpr DML_OPERATOR_SCHEMA DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA { + "DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING", + DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING, + DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE, + 13, + DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA_FIELDS, +}; + +constexpr DML_SCHEMA_FIELD DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA_FIELDS[8] { + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "ATensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "AScaleTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "AZeroPointTensor", true }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BScaleTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BZeroPointTensor", true }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BiasTensor", true }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, +}; + +constexpr DML_OPERATOR_SCHEMA DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA { + "DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT", + DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT, + DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE, + 8, + DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA_FIELDS, +}; constexpr DML_SCHEMA_FIELD DML_ACTIVATION_ELU_OPERATOR_SCHEMA_FIELDS[3] { DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, @@ -2732,6 +2749,35 @@ constexpr DML_OPERATOR_SCHEMA DML_ACTIVATION_GELU_OPERATOR_SCHEMA { DML_ACTIVATION_GELU_OPERATOR_SCHEMA_FIELDS, }; +constexpr DML_SCHEMA_FIELD DML_ACTIVATION_SWISH_OPERATOR_SCHEMA_FIELDS[3] { + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_FLOAT, "SigmoidInputScale", false }, +}; + +constexpr DML_OPERATOR_SCHEMA DML_ACTIVATION_SWISH_OPERATOR_SCHEMA { + "DML_OPERATOR_ACTIVATION_SWISH", + DML_OPERATOR_ACTIVATION_SWISH, + DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE, + 3, + DML_ACTIVATION_SWISH_OPERATOR_SCHEMA_FIELDS, +}; + +constexpr DML_SCHEMA_FIELD DML_ACTIVATION_HARD_SWISH_OPERATOR_SCHEMA_FIELDS[4] { + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_FLOAT, "Alpha", false }, + DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_ATTRIBUTE, DML_SCHEMA_FIELD_TYPE_FLOAT, "Beta", false }, +}; + +constexpr DML_OPERATOR_SCHEMA DML_ACTIVATION_HARD_SWISH_OPERATOR_SCHEMA { + "DML_OPERATOR_ACTIVATION_HARD_SWISH", + DML_OPERATOR_ACTIVATION_HARD_SWISH, + DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE, + 4, + DML_ACTIVATION_HARD_SWISH_OPERATOR_SCHEMA_FIELDS, +}; + constexpr DML_SCHEMA_FIELD DML_RNN_ZERO_OPERATOR_SCHEMA_FIELDS[3] { DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false }, DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "SequenceLengthsTensor", false }, diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphDesc_generated.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphDesc_generated.h new file mode 100644 index 0000000000000..72059b9a3f911 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphDesc_generated.h @@ -0,0 +1,788 @@ +// automatically generated by the FlatBuffers compiler, do not modify + + +#ifndef FLATBUFFERS_GENERATED_DMLGRAPHDESC_DML_IR_H_ +#define FLATBUFFERS_GENERATED_DMLGRAPHDESC_DML_IR_H_ + +#include "flatbuffers/flatbuffers.h" + +#include "OperatorFieldTypes_generated.h" + +namespace dml { +namespace ir { + +struct ConstantRawData; +struct ConstantRawDataBuilder; + +struct ConstantName; +struct ConstantNameBuilder; + +struct ConstantNodeDesc; +struct ConstantNodeDescBuilder; + +struct DmlBufferTensorDesc; +struct DmlBufferTensorDescBuilder; + +struct OperatorNodeDesc; +struct OperatorNodeDescBuilder; + +struct DmlGraphNode; +struct DmlGraphNodeBuilder; + +struct DmlGraphDesc; +struct DmlGraphDescBuilder; + +enum ConstantNodeDescDetail { + ConstantNodeDescDetail_NONE = 0, + ConstantNodeDescDetail_ConstantName = 1, + ConstantNodeDescDetail_ConstantRawData = 2, + ConstantNodeDescDetail_MIN = ConstantNodeDescDetail_NONE, + ConstantNodeDescDetail_MAX = ConstantNodeDescDetail_ConstantRawData +}; + +inline const ConstantNodeDescDetail (&EnumValuesConstantNodeDescDetail())[3] { + static const ConstantNodeDescDetail values[] = { + ConstantNodeDescDetail_NONE, + ConstantNodeDescDetail_ConstantName, + ConstantNodeDescDetail_ConstantRawData + }; + return values; +} + +inline const char * const *EnumNamesConstantNodeDescDetail() { + static const char * const names[4] = { + "NONE", + "ConstantName", + "ConstantRawData", + nullptr + }; + return names; +} + +inline const char *EnumNameConstantNodeDescDetail(ConstantNodeDescDetail e) { + if (flatbuffers::IsOutRange(e, ConstantNodeDescDetail_NONE, ConstantNodeDescDetail_ConstantRawData)) return ""; + const size_t index = static_cast(e); + return EnumNamesConstantNodeDescDetail()[index]; +} + +template struct ConstantNodeDescDetailTraits { + static const ConstantNodeDescDetail enum_value = ConstantNodeDescDetail_NONE; +}; + +template<> struct ConstantNodeDescDetailTraits { + static const ConstantNodeDescDetail enum_value = ConstantNodeDescDetail_ConstantName; +}; + +template<> struct ConstantNodeDescDetailTraits { + static const ConstantNodeDescDetail enum_value = ConstantNodeDescDetail_ConstantRawData; +}; + +bool VerifyConstantNodeDescDetail(flatbuffers::Verifier &verifier, const void *obj, ConstantNodeDescDetail type); +bool VerifyConstantNodeDescDetailVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector> *values, const flatbuffers::Vector *types); + +enum NodeDesc { + NodeDesc_NONE = 0, + NodeDesc_OperatorNodeDesc = 1, + NodeDesc_ConstantNodeDesc = 2, + NodeDesc_MIN = NodeDesc_NONE, + NodeDesc_MAX = NodeDesc_ConstantNodeDesc +}; + +inline const NodeDesc (&EnumValuesNodeDesc())[3] { + static const NodeDesc values[] = { + NodeDesc_NONE, + NodeDesc_OperatorNodeDesc, + NodeDesc_ConstantNodeDesc + }; + return values; +} + +inline const char * const *EnumNamesNodeDesc() { + static const char * const names[4] = { + "NONE", + "OperatorNodeDesc", + "ConstantNodeDesc", + nullptr + }; + return names; +} + +inline const char *EnumNameNodeDesc(NodeDesc e) { + if (flatbuffers::IsOutRange(e, NodeDesc_NONE, NodeDesc_ConstantNodeDesc)) return ""; + const size_t index = static_cast(e); + return EnumNamesNodeDesc()[index]; +} + +template struct NodeDescTraits { + static const NodeDesc enum_value = NodeDesc_NONE; +}; + +template<> struct NodeDescTraits { + static const NodeDesc enum_value = NodeDesc_OperatorNodeDesc; +}; + +template<> struct NodeDescTraits { + static const NodeDesc enum_value = NodeDesc_ConstantNodeDesc; +}; + +bool VerifyNodeDesc(flatbuffers::Verifier &verifier, const void *obj, NodeDesc type); +bool VerifyNodeDescVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector> *values, const flatbuffers::Vector *types); + +struct ConstantRawData FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef ConstantRawDataBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_DATA = 4 + }; + const flatbuffers::Vector *data() const { + return GetPointer *>(VT_DATA); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_DATA) && + verifier.VerifyVector(data()) && + verifier.EndTable(); + } +}; + +struct ConstantRawDataBuilder { + typedef ConstantRawData Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_data(flatbuffers::Offset> data) { + fbb_.AddOffset(ConstantRawData::VT_DATA, data); + } + explicit ConstantRawDataBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ConstantRawDataBuilder &operator=(const ConstantRawDataBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateConstantRawData( + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset> data = 0) { + ConstantRawDataBuilder builder_(_fbb); + builder_.add_data(data); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateConstantRawDataDirect( + flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *data = nullptr) { + auto data__ = data ? _fbb.CreateVector(*data) : 0; + return dml::ir::CreateConstantRawData( + _fbb, + data__); +} + +struct ConstantName FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef ConstantNameBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_NAME = 4 + }; + const flatbuffers::String *name() const { + return GetPointer(VT_NAME); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_NAME) && + verifier.VerifyString(name()) && + verifier.EndTable(); + } +}; + +struct ConstantNameBuilder { + typedef ConstantName Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_name(flatbuffers::Offset name) { + fbb_.AddOffset(ConstantName::VT_NAME, name); + } + explicit ConstantNameBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ConstantNameBuilder &operator=(const ConstantNameBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateConstantName( + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset name = 0) { + ConstantNameBuilder builder_(_fbb); + builder_.add_name(name); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateConstantNameDirect( + flatbuffers::FlatBufferBuilder &_fbb, + const char *name = nullptr) { + auto name__ = name ? _fbb.CreateString(name) : 0; + return dml::ir::CreateConstantName( + _fbb, + name__); +} + +struct ConstantNodeDesc FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef ConstantNodeDescBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_DATA_TYPE = 4, + VT_DATA = 6 + }; + dml::ir::ConstantNodeDescDetail data_type() const { + return static_cast(GetField(VT_DATA_TYPE, 0)); + } + const void *data() const { + return GetPointer(VT_DATA); + } + template const T *data_as() const; + const dml::ir::ConstantName *data_as_ConstantName() const { + return data_type() == dml::ir::ConstantNodeDescDetail_ConstantName ? static_cast(data()) : nullptr; + } + const dml::ir::ConstantRawData *data_as_ConstantRawData() const { + return data_type() == dml::ir::ConstantNodeDescDetail_ConstantRawData ? static_cast(data()) : nullptr; + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_DATA_TYPE) && + VerifyOffset(verifier, VT_DATA) && + VerifyConstantNodeDescDetail(verifier, data(), data_type()) && + verifier.EndTable(); + } +}; + +template<> inline const dml::ir::ConstantName *ConstantNodeDesc::data_as() const { + return data_as_ConstantName(); +} + +template<> inline const dml::ir::ConstantRawData *ConstantNodeDesc::data_as() const { + return data_as_ConstantRawData(); +} + +struct ConstantNodeDescBuilder { + typedef ConstantNodeDesc Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_data_type(dml::ir::ConstantNodeDescDetail data_type) { + fbb_.AddElement(ConstantNodeDesc::VT_DATA_TYPE, static_cast(data_type), 0); + } + void add_data(flatbuffers::Offset data) { + fbb_.AddOffset(ConstantNodeDesc::VT_DATA, data); + } + explicit ConstantNodeDescBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ConstantNodeDescBuilder &operator=(const ConstantNodeDescBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateConstantNodeDesc( + flatbuffers::FlatBufferBuilder &_fbb, + dml::ir::ConstantNodeDescDetail data_type = dml::ir::ConstantNodeDescDetail_NONE, + flatbuffers::Offset data = 0) { + ConstantNodeDescBuilder builder_(_fbb); + builder_.add_data(data); + builder_.add_data_type(data_type); + return builder_.Finish(); +} + +struct DmlBufferTensorDesc FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef DmlBufferTensorDescBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_DATATYPE = 4, + VT_SIZES = 6, + VT_STRIDES = 8, + VT_TOTALTENSORSIZEINBYTES = 10 + }; + const flatbuffers::String *dataType() const { + return GetPointer(VT_DATATYPE); + } + const flatbuffers::Vector *sizes() const { + return GetPointer *>(VT_SIZES); + } + const flatbuffers::Vector *strides() const { + return GetPointer *>(VT_STRIDES); + } + uint64_t totalTensorSizeInBytes() const { + return GetField(VT_TOTALTENSORSIZEINBYTES, 0); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_DATATYPE) && + verifier.VerifyString(dataType()) && + VerifyOffset(verifier, VT_SIZES) && + verifier.VerifyVector(sizes()) && + VerifyOffset(verifier, VT_STRIDES) && + verifier.VerifyVector(strides()) && + VerifyField(verifier, VT_TOTALTENSORSIZEINBYTES) && + verifier.EndTable(); + } +}; + +struct DmlBufferTensorDescBuilder { + typedef DmlBufferTensorDesc Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_dataType(flatbuffers::Offset dataType) { + fbb_.AddOffset(DmlBufferTensorDesc::VT_DATATYPE, dataType); + } + void add_sizes(flatbuffers::Offset> sizes) { + fbb_.AddOffset(DmlBufferTensorDesc::VT_SIZES, sizes); + } + void add_strides(flatbuffers::Offset> strides) { + fbb_.AddOffset(DmlBufferTensorDesc::VT_STRIDES, strides); + } + void add_totalTensorSizeInBytes(uint64_t totalTensorSizeInBytes) { + fbb_.AddElement(DmlBufferTensorDesc::VT_TOTALTENSORSIZEINBYTES, totalTensorSizeInBytes, 0); + } + explicit DmlBufferTensorDescBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + DmlBufferTensorDescBuilder &operator=(const DmlBufferTensorDescBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateDmlBufferTensorDesc( + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset dataType = 0, + flatbuffers::Offset> sizes = 0, + flatbuffers::Offset> strides = 0, + uint64_t totalTensorSizeInBytes = 0) { + DmlBufferTensorDescBuilder builder_(_fbb); + builder_.add_totalTensorSizeInBytes(totalTensorSizeInBytes); + builder_.add_strides(strides); + builder_.add_sizes(sizes); + builder_.add_dataType(dataType); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateDmlBufferTensorDescDirect( + flatbuffers::FlatBufferBuilder &_fbb, + const char *dataType = nullptr, + const std::vector *sizes = nullptr, + const std::vector *strides = nullptr, + uint64_t totalTensorSizeInBytes = 0) { + auto dataType__ = dataType ? _fbb.CreateString(dataType) : 0; + auto sizes__ = sizes ? _fbb.CreateVector(*sizes) : 0; + auto strides__ = strides ? _fbb.CreateVector(*strides) : 0; + return dml::ir::CreateDmlBufferTensorDesc( + _fbb, + dataType__, + sizes__, + strides__, + totalTensorSizeInBytes); +} + +struct OperatorNodeDesc FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef OperatorNodeDescBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_TYPE = 4, + VT_INPUTS = 6, + VT_OUTPUTS = 8, + VT_ATTRIBUTES = 10 + }; + const flatbuffers::String *type() const { + return GetPointer(VT_TYPE); + } + const flatbuffers::Vector> *inputs() const { + return GetPointer> *>(VT_INPUTS); + } + const flatbuffers::Vector> *outputs() const { + return GetPointer> *>(VT_OUTPUTS); + } + const flatbuffers::Vector> *attributes() const { + return GetPointer> *>(VT_ATTRIBUTES); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_TYPE) && + verifier.VerifyString(type()) && + VerifyOffset(verifier, VT_INPUTS) && + verifier.VerifyVector(inputs()) && + verifier.VerifyVectorOfTables(inputs()) && + VerifyOffset(verifier, VT_OUTPUTS) && + verifier.VerifyVector(outputs()) && + verifier.VerifyVectorOfTables(outputs()) && + VerifyOffset(verifier, VT_ATTRIBUTES) && + verifier.VerifyVector(attributes()) && + verifier.VerifyVectorOfTables(attributes()) && + verifier.EndTable(); + } +}; + +struct OperatorNodeDescBuilder { + typedef OperatorNodeDesc Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_type(flatbuffers::Offset type) { + fbb_.AddOffset(OperatorNodeDesc::VT_TYPE, type); + } + void add_inputs(flatbuffers::Offset>> inputs) { + fbb_.AddOffset(OperatorNodeDesc::VT_INPUTS, inputs); + } + void add_outputs(flatbuffers::Offset>> outputs) { + fbb_.AddOffset(OperatorNodeDesc::VT_OUTPUTS, outputs); + } + void add_attributes(flatbuffers::Offset>> attributes) { + fbb_.AddOffset(OperatorNodeDesc::VT_ATTRIBUTES, attributes); + } + explicit OperatorNodeDescBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + OperatorNodeDescBuilder &operator=(const OperatorNodeDescBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateOperatorNodeDesc( + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset type = 0, + flatbuffers::Offset>> inputs = 0, + flatbuffers::Offset>> outputs = 0, + flatbuffers::Offset>> attributes = 0) { + OperatorNodeDescBuilder builder_(_fbb); + builder_.add_attributes(attributes); + builder_.add_outputs(outputs); + builder_.add_inputs(inputs); + builder_.add_type(type); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateOperatorNodeDescDirect( + flatbuffers::FlatBufferBuilder &_fbb, + const char *type = nullptr, + const std::vector> *inputs = nullptr, + const std::vector> *outputs = nullptr, + const std::vector> *attributes = nullptr) { + auto type__ = type ? _fbb.CreateString(type) : 0; + auto inputs__ = inputs ? _fbb.CreateVector>(*inputs) : 0; + auto outputs__ = outputs ? _fbb.CreateVector>(*outputs) : 0; + auto attributes__ = attributes ? _fbb.CreateVector>(*attributes) : 0; + return dml::ir::CreateOperatorNodeDesc( + _fbb, + type__, + inputs__, + outputs__, + attributes__); +} + +struct DmlGraphNode FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef DmlGraphNodeBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_DESC_TYPE = 4, + VT_DESC = 6, + VT_NAME = 8, + VT_INPUTNAMES = 10, + VT_OUTPUTNAMES = 12 + }; + dml::ir::NodeDesc desc_type() const { + return static_cast(GetField(VT_DESC_TYPE, 0)); + } + const void *desc() const { + return GetPointer(VT_DESC); + } + template const T *desc_as() const; + const dml::ir::OperatorNodeDesc *desc_as_OperatorNodeDesc() const { + return desc_type() == dml::ir::NodeDesc_OperatorNodeDesc ? static_cast(desc()) : nullptr; + } + const dml::ir::ConstantNodeDesc *desc_as_ConstantNodeDesc() const { + return desc_type() == dml::ir::NodeDesc_ConstantNodeDesc ? static_cast(desc()) : nullptr; + } + const flatbuffers::String *name() const { + return GetPointer(VT_NAME); + } + const flatbuffers::Vector> *inputNames() const { + return GetPointer> *>(VT_INPUTNAMES); + } + const flatbuffers::Vector> *outputNames() const { + return GetPointer> *>(VT_OUTPUTNAMES); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_DESC_TYPE) && + VerifyOffset(verifier, VT_DESC) && + VerifyNodeDesc(verifier, desc(), desc_type()) && + VerifyOffset(verifier, VT_NAME) && + verifier.VerifyString(name()) && + VerifyOffset(verifier, VT_INPUTNAMES) && + verifier.VerifyVector(inputNames()) && + verifier.VerifyVectorOfStrings(inputNames()) && + VerifyOffset(verifier, VT_OUTPUTNAMES) && + verifier.VerifyVector(outputNames()) && + verifier.VerifyVectorOfStrings(outputNames()) && + verifier.EndTable(); + } +}; + +template<> inline const dml::ir::OperatorNodeDesc *DmlGraphNode::desc_as() const { + return desc_as_OperatorNodeDesc(); +} + +template<> inline const dml::ir::ConstantNodeDesc *DmlGraphNode::desc_as() const { + return desc_as_ConstantNodeDesc(); +} + +struct DmlGraphNodeBuilder { + typedef DmlGraphNode Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_desc_type(dml::ir::NodeDesc desc_type) { + fbb_.AddElement(DmlGraphNode::VT_DESC_TYPE, static_cast(desc_type), 0); + } + void add_desc(flatbuffers::Offset desc) { + fbb_.AddOffset(DmlGraphNode::VT_DESC, desc); + } + void add_name(flatbuffers::Offset name) { + fbb_.AddOffset(DmlGraphNode::VT_NAME, name); + } + void add_inputNames(flatbuffers::Offset>> inputNames) { + fbb_.AddOffset(DmlGraphNode::VT_INPUTNAMES, inputNames); + } + void add_outputNames(flatbuffers::Offset>> outputNames) { + fbb_.AddOffset(DmlGraphNode::VT_OUTPUTNAMES, outputNames); + } + explicit DmlGraphNodeBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + DmlGraphNodeBuilder &operator=(const DmlGraphNodeBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateDmlGraphNode( + flatbuffers::FlatBufferBuilder &_fbb, + dml::ir::NodeDesc desc_type = dml::ir::NodeDesc_NONE, + flatbuffers::Offset desc = 0, + flatbuffers::Offset name = 0, + flatbuffers::Offset>> inputNames = 0, + flatbuffers::Offset>> outputNames = 0) { + DmlGraphNodeBuilder builder_(_fbb); + builder_.add_outputNames(outputNames); + builder_.add_inputNames(inputNames); + builder_.add_name(name); + builder_.add_desc(desc); + builder_.add_desc_type(desc_type); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateDmlGraphNodeDirect( + flatbuffers::FlatBufferBuilder &_fbb, + dml::ir::NodeDesc desc_type = dml::ir::NodeDesc_NONE, + flatbuffers::Offset desc = 0, + const char *name = nullptr, + const std::vector> *inputNames = nullptr, + const std::vector> *outputNames = nullptr) { + auto name__ = name ? _fbb.CreateString(name) : 0; + auto inputNames__ = inputNames ? _fbb.CreateVector>(*inputNames) : 0; + auto outputNames__ = outputNames ? _fbb.CreateVector>(*outputNames) : 0; + return dml::ir::CreateDmlGraphNode( + _fbb, + desc_type, + desc, + name__, + inputNames__, + outputNames__); +} + +struct DmlGraphDesc FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef DmlGraphDescBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_NODES = 4, + VT_GRAPHINPUTNAMES = 6, + VT_GRAPHOUTPUTNAMES = 8 + }; + const flatbuffers::Vector> *nodes() const { + return GetPointer> *>(VT_NODES); + } + const flatbuffers::Vector> *graphInputNames() const { + return GetPointer> *>(VT_GRAPHINPUTNAMES); + } + const flatbuffers::Vector> *graphOutputNames() const { + return GetPointer> *>(VT_GRAPHOUTPUTNAMES); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_NODES) && + verifier.VerifyVector(nodes()) && + verifier.VerifyVectorOfTables(nodes()) && + VerifyOffset(verifier, VT_GRAPHINPUTNAMES) && + verifier.VerifyVector(graphInputNames()) && + verifier.VerifyVectorOfStrings(graphInputNames()) && + VerifyOffset(verifier, VT_GRAPHOUTPUTNAMES) && + verifier.VerifyVector(graphOutputNames()) && + verifier.VerifyVectorOfStrings(graphOutputNames()) && + verifier.EndTable(); + } +}; + +struct DmlGraphDescBuilder { + typedef DmlGraphDesc Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_nodes(flatbuffers::Offset>> nodes) { + fbb_.AddOffset(DmlGraphDesc::VT_NODES, nodes); + } + void add_graphInputNames(flatbuffers::Offset>> graphInputNames) { + fbb_.AddOffset(DmlGraphDesc::VT_GRAPHINPUTNAMES, graphInputNames); + } + void add_graphOutputNames(flatbuffers::Offset>> graphOutputNames) { + fbb_.AddOffset(DmlGraphDesc::VT_GRAPHOUTPUTNAMES, graphOutputNames); + } + explicit DmlGraphDescBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + DmlGraphDescBuilder &operator=(const DmlGraphDescBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateDmlGraphDesc( + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset>> nodes = 0, + flatbuffers::Offset>> graphInputNames = 0, + flatbuffers::Offset>> graphOutputNames = 0) { + DmlGraphDescBuilder builder_(_fbb); + builder_.add_graphOutputNames(graphOutputNames); + builder_.add_graphInputNames(graphInputNames); + builder_.add_nodes(nodes); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateDmlGraphDescDirect( + flatbuffers::FlatBufferBuilder &_fbb, + const std::vector> *nodes = nullptr, + const std::vector> *graphInputNames = nullptr, + const std::vector> *graphOutputNames = nullptr) { + auto nodes__ = nodes ? _fbb.CreateVector>(*nodes) : 0; + auto graphInputNames__ = graphInputNames ? _fbb.CreateVector>(*graphInputNames) : 0; + auto graphOutputNames__ = graphOutputNames ? _fbb.CreateVector>(*graphOutputNames) : 0; + return dml::ir::CreateDmlGraphDesc( + _fbb, + nodes__, + graphInputNames__, + graphOutputNames__); +} + +inline bool VerifyConstantNodeDescDetail(flatbuffers::Verifier &verifier, const void *obj, ConstantNodeDescDetail type) { + switch (type) { + case ConstantNodeDescDetail_NONE: { + return true; + } + case ConstantNodeDescDetail_ConstantName: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case ConstantNodeDescDetail_ConstantRawData: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + default: return true; + } +} + +inline bool VerifyConstantNodeDescDetailVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector> *values, const flatbuffers::Vector *types) { + if (!values || !types) return !values && !types; + if (values->size() != types->size()) return false; + for (flatbuffers::uoffset_t i = 0; i < values->size(); ++i) { + if (!VerifyConstantNodeDescDetail( + verifier, values->Get(i), types->GetEnum(i))) { + return false; + } + } + return true; +} + +inline bool VerifyNodeDesc(flatbuffers::Verifier &verifier, const void *obj, NodeDesc type) { + switch (type) { + case NodeDesc_NONE: { + return true; + } + case NodeDesc_OperatorNodeDesc: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case NodeDesc_ConstantNodeDesc: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + default: return true; + } +} + +inline bool VerifyNodeDescVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector> *values, const flatbuffers::Vector *types) { + if (!values || !types) return !values && !types; + if (values->size() != types->size()) return false; + for (flatbuffers::uoffset_t i = 0; i < values->size(); ++i) { + if (!VerifyNodeDesc( + verifier, values->Get(i), types->GetEnum(i))) { + return false; + } + } + return true; +} + +inline const dml::ir::DmlGraphDesc *GetDmlGraphDesc(const void *buf) { + return flatbuffers::GetRoot(buf); +} + +inline const dml::ir::DmlGraphDesc *GetSizePrefixedDmlGraphDesc(const void *buf) { + return flatbuffers::GetSizePrefixedRoot(buf); +} + +inline bool VerifyDmlGraphDescBuffer( + flatbuffers::Verifier &verifier) { + return verifier.VerifyBuffer(nullptr); +} + +inline bool VerifySizePrefixedDmlGraphDescBuffer( + flatbuffers::Verifier &verifier) { + return verifier.VerifySizePrefixedBuffer(nullptr); +} + +inline void FinishDmlGraphDescBuffer( + flatbuffers::FlatBufferBuilder &fbb, + flatbuffers::Offset root) { + fbb.Finish(root); +} + +inline void FinishSizePrefixedDmlGraphDescBuffer( + flatbuffers::FlatBufferBuilder &fbb, + flatbuffers::Offset root) { + fbb.FinishSizePrefixed(root); +} + +} // namespace ir +} // namespace dml + +#endif // FLATBUFFERS_GENERATED_DMLGRAPHDESC_DML_IR_H_ diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphDeserialization.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphDeserialization.h new file mode 100644 index 0000000000000..9decf0dce1bb2 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphDeserialization.h @@ -0,0 +1,14 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. + +#pragma once +#include "DmlSerializedGraphDesc.h" + +struct NodeIndex +{ + uint32_t nodeIndex; + uint32_t nodeOutputIndex; +}; + +DmlSerializedGraphDesc DeserializeDmlGraph( + const uint8_t* flatbufferGraphDescBlob, + /*out*/ std::vector>& rawData); \ No newline at end of file diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphSerialization.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphSerialization.h new file mode 100644 index 0000000000000..d8d069da906b7 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphSerialization.h @@ -0,0 +1,8 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. + +#pragma once +#include "DmlGraphDesc_generated.h" + +struct DmlSerializedGraphDesc; + +flatbuffers::DetachedBuffer SerializeDmlGraph(const DmlSerializedGraphDesc& graphDesc); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlSerializedGraphDesc.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlSerializedGraphDesc.h new file mode 100644 index 0000000000000..51c3d6c81244b --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlSerializedGraphDesc.h @@ -0,0 +1,73 @@ +//----------------------------------------------------------------------------- +// +// Copyright (c) Microsoft Corporation. All rights reserved. +// +//----------------------------------------------------------------------------- + +#pragma once + +struct ConstantName +{ + std::string name; +}; + +struct ConstantData +{ + std::byte* data; + uint64_t dataSize; +}; + +using DmlSerializedGraphNodeConstantVariant = std::variant< + ConstantName, + ConstantData +>; + +using DmlSerializedGraphNodeDescVariant = std::variant< + AbstractOperatorDesc, + DmlSerializedGraphNodeConstantVariant +>; + +struct DmlSerializedGraphNode +{ + DmlSerializedGraphNodeDescVariant Desc; + std::string Name; +}; + +struct DmlInputSerializedGraphEdge +{ + uint32_t GraphInputIndex; + uint32_t ToNodeIndex; + uint32_t ToNodeInputIndex; + std::string Name; +}; + +struct DmlOutputSerializedGraphEdge +{ + uint32_t FromNodeIndex; + uint32_t FromNodeOutputIndex; + uint32_t GraphOutputIndex; + std::string Name; +}; + +struct DmlIntermediateSerializedGraphEdge +{ + uint32_t FromNodeIndex; + uint32_t FromNodeOutputIndex; + uint32_t ToNodeIndex; + uint32_t ToNodeInputIndex; + std::string Name; +}; + +struct DmlSerializedGraphDesc +{ + uint32_t InputCount; + uint32_t OutputCount; + // nodes must be present in topological order for deserialization to work + // because while creating a intermediate edge during deserialization, node (from + // which given intermediate edge is outputting) must be visited before than the node + // (to which given intermediate edge is inputting) + std::vector Nodes; + std::vector InputEdges; + std::vector OutputEdges; + std::vector IntermediateEdges; +}; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h index 99218c135f058..4be41ad3924a2 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaHelpers.h @@ -425,7 +425,6 @@ inline std::vector GetFields(const DML_AVERAGE_POOLING_OPERATOR_D OperatorField(&DML_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[7], ToOperatorFieldType(static_cast(desc.IncludePadding))), }; } - inline std::vector GetFields(const DML_AVERAGE_POOLING1_OPERATOR_DESC& desc) { return { @@ -502,24 +501,6 @@ inline std::vector GetFields(const DML_ROI_POOLING_OPERATOR_DESC& OperatorField(&DML_ROI_POOLING_OPERATOR_SCHEMA.Fields[4], ToOperatorFieldType(static_cast(desc.PooledSize))), }; } -inline std::vector GetFields(const DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_DESC& desc) -{ - return { - OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[0], ToOperatorFieldType(static_cast(desc.InputTensor))), - OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[1], ToOperatorFieldType(static_cast(desc.InputScaleTensor))), - OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[2], ToOperatorFieldType(static_cast(desc.InputZeroPointTensor))), - OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[3], ToOperatorFieldType(static_cast(desc.OutputScaleTensor))), - OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[4], ToOperatorFieldType(static_cast(desc.OutputZeroPointTensor))), - OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[5], ToOperatorFieldType(static_cast(desc.OutputTensor))), - OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[6], ToOperatorFieldType(static_cast(desc.DimensionCount))), - OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[7], ToOperatorFieldType(static_cast(desc.Strides), desc.DimensionCount)), - OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[8], ToOperatorFieldType(static_cast(desc.WindowSize), desc.DimensionCount)), - OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[9], ToOperatorFieldType(static_cast(desc.StartPadding), desc.DimensionCount)), - OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[10], ToOperatorFieldType(static_cast(desc.EndPadding), desc.DimensionCount)), - OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[11], ToOperatorFieldType(static_cast(desc.Dilations), desc.DimensionCount)), - OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[12], ToOperatorFieldType(static_cast(desc.IncludePadding))), - }; -} inline std::vector GetFields(const DML_SLICE_OPERATOR_DESC& desc) { return { @@ -1488,6 +1469,37 @@ inline std::vector GetFields(const DML_MULTIHEAD_ATTENTION_OPERAT OperatorField(&DML_MULTIHEAD_ATTENTION_OPERATOR_SCHEMA.Fields[17], ToOperatorFieldType(static_cast(desc.MaskType))), }; } +inline std::vector GetFields(const DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_DESC& desc) +{ + return { + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[0], ToOperatorFieldType(static_cast(desc.InputTensor))), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[1], ToOperatorFieldType(static_cast(desc.InputScaleTensor))), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[2], ToOperatorFieldType(static_cast(desc.InputZeroPointTensor))), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[3], ToOperatorFieldType(static_cast(desc.OutputScaleTensor))), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[4], ToOperatorFieldType(static_cast(desc.OutputZeroPointTensor))), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[5], ToOperatorFieldType(static_cast(desc.OutputTensor))), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[6], ToOperatorFieldType(static_cast(desc.DimensionCount))), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[7], ToOperatorFieldType(static_cast(desc.Strides), desc.DimensionCount)), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[8], ToOperatorFieldType(static_cast(desc.WindowSize), desc.DimensionCount)), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[9], ToOperatorFieldType(static_cast(desc.StartPadding), desc.DimensionCount)), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[10], ToOperatorFieldType(static_cast(desc.EndPadding), desc.DimensionCount)), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[11], ToOperatorFieldType(static_cast(desc.Dilations), desc.DimensionCount)), + OperatorField(&DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA.Fields[12], ToOperatorFieldType(static_cast(desc.IncludePadding))), + }; +} +inline std::vector GetFields(const DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_DESC& desc) +{ + return { + OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[0], ToOperatorFieldType(static_cast(desc.ATensor))), + OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[1], ToOperatorFieldType(static_cast(desc.AScaleTensor))), + OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[2], ToOperatorFieldType(static_cast(desc.AZeroPointTensor))), + OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[3], ToOperatorFieldType(static_cast(desc.BTensor))), + OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[4], ToOperatorFieldType(static_cast(desc.BScaleTensor))), + OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[5], ToOperatorFieldType(static_cast(desc.BZeroPointTensor))), + OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[6], ToOperatorFieldType(static_cast(desc.BiasTensor))), + OperatorField(&DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA.Fields[7], ToOperatorFieldType(static_cast(desc.OutputTensor))), + }; +} inline std::vector GetFields(const DML_ACTIVATION_ELU_OPERATOR_DESC& desc) { return { @@ -1680,6 +1692,23 @@ inline std::vector GetFields(const DML_ACTIVATION_GELU_OPERATOR_D OperatorField(&DML_ACTIVATION_GELU_OPERATOR_SCHEMA.Fields[1], ToOperatorFieldType(static_cast(desc.OutputTensor))), }; } +inline std::vector GetFields(const DML_ACTIVATION_SWISH_OPERATOR_DESC& desc) +{ + return { + OperatorField(&DML_ACTIVATION_SWISH_OPERATOR_SCHEMA.Fields[0], ToOperatorFieldType(static_cast(desc.InputTensor))), + OperatorField(&DML_ACTIVATION_SWISH_OPERATOR_SCHEMA.Fields[1], ToOperatorFieldType(static_cast(desc.OutputTensor))), + OperatorField(&DML_ACTIVATION_SWISH_OPERATOR_SCHEMA.Fields[2], ToOperatorFieldType(static_cast(desc.SigmoidInputScale))), + }; +} +inline std::vector GetFields(const DML_ACTIVATION_HARD_SWISH_OPERATOR_DESC& desc) +{ + return { + OperatorField(&DML_ACTIVATION_HARD_SWISH_OPERATOR_SCHEMA.Fields[0], ToOperatorFieldType(static_cast(desc.InputTensor))), + OperatorField(&DML_ACTIVATION_HARD_SWISH_OPERATOR_SCHEMA.Fields[1], ToOperatorFieldType(static_cast(desc.OutputTensor))), + OperatorField(&DML_ACTIVATION_HARD_SWISH_OPERATOR_SCHEMA.Fields[2], ToOperatorFieldType(static_cast(desc.Alpha))), + OperatorField(&DML_ACTIVATION_HARD_SWISH_OPERATOR_SCHEMA.Fields[3], ToOperatorFieldType(static_cast(desc.Beta))), + }; +} inline const DML_OPERATOR_SCHEMA& GetSchema(DML_OPERATOR_TYPE operatorType) { switch (operatorType) @@ -1826,6 +1855,8 @@ inline const DML_OPERATOR_SCHEMA& GetSchema(DML_OPERATOR_TYPE operatorType) case DML_OPERATOR_RESAMPLE_GRAD1: return DML_RESAMPLE_GRAD1_OPERATOR_SCHEMA; case DML_OPERATOR_DIAGONAL_MATRIX1: return DML_DIAGONAL_MATRIX1_OPERATOR_SCHEMA; case DML_OPERATOR_MULTIHEAD_ATTENTION: return DML_MULTIHEAD_ATTENTION_OPERATOR_SCHEMA; + case DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING: return DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA; + case DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT: return DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA; case DML_OPERATOR_ACTIVATION_ELU: return DML_ACTIVATION_ELU_OPERATOR_SCHEMA; case DML_OPERATOR_ACTIVATION_CELU: return DML_ACTIVATION_CELU_OPERATOR_SCHEMA; case DML_OPERATOR_ACTIVATION_HARDMAX: return DML_ACTIVATION_HARDMAX_OPERATOR_SCHEMA; @@ -1850,6 +1881,8 @@ inline const DML_OPERATOR_SCHEMA& GetSchema(DML_OPERATOR_TYPE operatorType) case DML_OPERATOR_ACTIVATION_THRESHOLDED_RELU: return DML_ACTIVATION_THRESHOLDED_RELU_OPERATOR_SCHEMA; case DML_OPERATOR_ACTIVATION_SHRINK: return DML_ACTIVATION_SHRINK_OPERATOR_SCHEMA; case DML_OPERATOR_ACTIVATION_GELU: return DML_ACTIVATION_GELU_OPERATOR_SCHEMA; + case DML_OPERATOR_ACTIVATION_SWISH: return DML_ACTIVATION_SWISH_OPERATOR_SCHEMA; + case DML_OPERATOR_ACTIVATION_HARD_SWISH: return DML_ACTIVATION_HARD_SWISH_OPERATOR_SCHEMA; default: ORT_THROW_HR(E_INVALIDARG); @@ -2431,6 +2464,14 @@ inline AbstractOperatorDesc ConvertOperatorDesc(const DML_OPERATOR_DESC& opDesc) return AbstractOperatorDesc( &DML_MULTIHEAD_ATTENTION_OPERATOR_SCHEMA, GetFields(*static_cast(opDesc.Desc))); + case DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING: + return AbstractOperatorDesc( + &DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA, + GetFields(*static_cast(opDesc.Desc))); + case DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT: + return AbstractOperatorDesc( + &DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA, + GetFields(*static_cast(opDesc.Desc))); case DML_OPERATOR_ACTIVATION_ELU: return AbstractOperatorDesc( &DML_ACTIVATION_ELU_OPERATOR_SCHEMA, @@ -2527,13 +2568,14 @@ inline AbstractOperatorDesc ConvertOperatorDesc(const DML_OPERATOR_DESC& opDesc) return AbstractOperatorDesc( &DML_ACTIVATION_GELU_OPERATOR_SCHEMA, GetFields(*static_cast(opDesc.Desc))); -#pragma warning(push) -#pragma warning(disable: 4063) - case DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING: + case DML_OPERATOR_ACTIVATION_SWISH: return AbstractOperatorDesc( - &DML_QUANTIZED_LINEAR_AVERAGE_POOLING_OPERATOR_SCHEMA, - GetFields(*static_cast(opDesc.Desc))); -#pragma warning(pop) + &DML_ACTIVATION_SWISH_OPERATOR_SCHEMA, + GetFields(*static_cast(opDesc.Desc))); + case DML_OPERATOR_ACTIVATION_HARD_SWISH: + return AbstractOperatorDesc( + &DML_ACTIVATION_HARD_SWISH_OPERATOR_SCHEMA, + GetFields(*static_cast(opDesc.Desc))); default: ORT_THROW_HR(E_INVALIDARG); diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaTypes.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaTypes.h index 25f0dd26c6067..a94bb67b68d36 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaTypes.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/GeneratedSchemaTypes.h @@ -15,32 +15,34 @@ using ApiAttributeVariant = std::variant< const FLOAT*, const DML_SCALE_BIAS*, DML_SIZE_2D, - DML_SCALAR_UNION + DML_SCALAR_UNION, + BOOL >; namespace OperatorFieldTypes { using TensorDesc = std::optional; // DML_SCHEMA_FIELD_TYPE_TENSOR_DESC using TensorDescArray = std::optional>; // DML_SCHEMA_FIELD_TYPE_TENSOR_DESC_ARRAY - using OperatorDesc = std::optional; // DML_SCHEMA_FIELD_TYPE_OPERATOR_DESC - using OperatorDescArray = std::optional>; // DML_SCHEMA_FIELD_TYPE_OPERATOR_DESC_ARRAY + using FusedActivationOperatorDesc = std::optional; // DML_SCHEMA_FIELD_TYPE_OPERATOR_DESC + using FusedActivationOperatorDescArray = std::optional>; // DML_SCHEMA_FIELD_TYPE_OPERATOR_DESC_ARRAY using UInt = uint32_t; // DML_SCHEMA_FIELD_TYPE_UINT using UInt64 = uint64_t; // DML_SCHEMA_FIELD_TYPE_UINT64 using Int = int32_t; // DML_SCHEMA_FIELD_TYPE_INT using Float = float; // DML_SCHEMA_FIELD_TYPE_FLOAT - using UIntArray = std::optional>; // DML_SCHEMA_FIELD_TYPE_UINT_ARRAY - using IntArray = std::optional>; // DML_SCHEMA_FIELD_TYPE_INT_ARRAY - using FloatArray = std::optional>; // DML_SCHEMA_FIELD_TYPE_FLOAT_ARRAY + using UIntArray = std::vector; // DML_SCHEMA_FIELD_TYPE_UINT_ARRAY + using IntArray = std::vector; // DML_SCHEMA_FIELD_TYPE_INT_ARRAY + using FloatArray = std::vector; // DML_SCHEMA_FIELD_TYPE_FLOAT_ARRAY using ScaleBias = std::optional; // DML_SCHEMA_FIELD_TYPE_SCALE_BIAS using Size2D = DML_SIZE_2D; // DML_SCHEMA_FIELD_TYPE_SIZE_2D using ScalarUnion = DML_SCALAR_UNION; // DML_SCHEMA_FIELD_TYPE_SCALAR_UNION + using Bool = bool; // DML_SCHEMA_FIELD_TYPE_BOOL } using OperatorFieldVariant = std::variant< OperatorFieldTypes::TensorDesc, OperatorFieldTypes::TensorDescArray, - OperatorFieldTypes::OperatorDesc, - OperatorFieldTypes::OperatorDescArray, + OperatorFieldTypes::FusedActivationOperatorDesc, + OperatorFieldTypes::FusedActivationOperatorDescArray, OperatorFieldTypes::UInt, OperatorFieldTypes::UInt64, OperatorFieldTypes::Int, @@ -50,7 +52,8 @@ using OperatorFieldVariant = std::variant< OperatorFieldTypes::FloatArray, OperatorFieldTypes::ScaleBias, OperatorFieldTypes::Size2D, - OperatorFieldTypes::ScalarUnion + OperatorFieldTypes::ScalarUnion, + OperatorFieldTypes::Bool >; class OperatorField @@ -80,11 +83,11 @@ class OperatorField const OperatorFieldTypes::TensorDescArray& AsTensorDescArray() const { return std::get(m_data); } OperatorFieldTypes::TensorDescArray& AsTensorDescArray() { return std::get(m_data); } - const OperatorFieldTypes::OperatorDesc& AsOperatorDesc() const { return std::get(m_data); } - OperatorFieldTypes::OperatorDesc& AsOperatorDesc() { return std::get(m_data); } + const OperatorFieldTypes::FusedActivationOperatorDesc& AsFusedActivationOperatorDesc() const { return std::get(m_data); } + OperatorFieldTypes::FusedActivationOperatorDesc& AsFusedActivationOperatorDesc() { return std::get(m_data); } - const OperatorFieldTypes::OperatorDescArray& AsOperatorDescArray() const { return std::get(m_data); } - OperatorFieldTypes::OperatorDescArray& AsOperatorDescArray() { return std::get(m_data); } + const OperatorFieldTypes::FusedActivationOperatorDescArray& AsFusedActivationOperatorDescArray() const { return std::get(m_data); } + OperatorFieldTypes::FusedActivationOperatorDescArray& AsFusedActivationOperatorDescArray() { return std::get(m_data); } const OperatorFieldTypes::UInt& AsUInt() const { return std::get(m_data); } OperatorFieldTypes::UInt& AsUInt() { return std::get(m_data); } @@ -116,6 +119,9 @@ class OperatorField const OperatorFieldTypes::ScalarUnion& AsScalarUnion() const { return std::get(m_data); } OperatorFieldTypes::ScalarUnion& AsScalarUnion() { return std::get(m_data); } + const OperatorFieldTypes::Bool& AsBool() const { return std::get(m_data); } + OperatorFieldTypes::Bool& AsBool() { return std::get(m_data); } + private: const DML_SCHEMA_FIELD* m_schema; OperatorFieldVariant m_data; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/OperatorFieldTypes_generated.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/OperatorFieldTypes_generated.h new file mode 100644 index 0000000000000..167a913bb0132 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/OperatorFieldTypes_generated.h @@ -0,0 +1,1318 @@ +// automatically generated by the FlatBuffers compiler, do not modify + + +#ifndef FLATBUFFERS_GENERATED_OPERATORFIELDTYPES_DML_IR_OPERATORFIELDTYPES_H_ +#define FLATBUFFERS_GENERATED_OPERATORFIELDTYPES_DML_IR_OPERATORFIELDTYPES_H_ + +#include "flatbuffers/flatbuffers.h" + +namespace dml { +namespace ir { +namespace operatorFieldTypes { + +struct AttributeDesc; +struct AttributeDescBuilder; + +struct Activation; +struct ActivationBuilder; + +struct ActivationArray; +struct ActivationArrayBuilder; + +struct UInt8; + +struct UInt16; + +struct UInt32; + +struct UInt64; + +struct Int8; + +struct Int16; + +struct Int32; + +struct Int64; + +struct Float32; + +struct Float64; + +struct UIntArray; +struct UIntArrayBuilder; + +struct IntArray; +struct IntArrayBuilder; + +struct FloatArray; +struct FloatArrayBuilder; + +struct ScaleBias; + +struct Size2D; + +struct ByteArray; + +struct ScalarUnionData; +struct ScalarUnionDataBuilder; + +struct Bool; + +enum AttributeFieldVariant { + AttributeFieldVariant_NONE = 0, + AttributeFieldVariant_Activation = 1, + AttributeFieldVariant_ActivationArray = 2, + AttributeFieldVariant_UInt32 = 3, + AttributeFieldVariant_UInt64 = 4, + AttributeFieldVariant_Int32 = 5, + AttributeFieldVariant_Float32 = 6, + AttributeFieldVariant_UIntArray = 7, + AttributeFieldVariant_IntArray = 8, + AttributeFieldVariant_FloatArray = 9, + AttributeFieldVariant_ScaleBias = 10, + AttributeFieldVariant_Size2D = 11, + AttributeFieldVariant_ScalarUnionData = 12, + AttributeFieldVariant_Bool = 13, + AttributeFieldVariant_MIN = AttributeFieldVariant_NONE, + AttributeFieldVariant_MAX = AttributeFieldVariant_Bool +}; + +inline const AttributeFieldVariant (&EnumValuesAttributeFieldVariant())[14] { + static const AttributeFieldVariant values[] = { + AttributeFieldVariant_NONE, + AttributeFieldVariant_Activation, + AttributeFieldVariant_ActivationArray, + AttributeFieldVariant_UInt32, + AttributeFieldVariant_UInt64, + AttributeFieldVariant_Int32, + AttributeFieldVariant_Float32, + AttributeFieldVariant_UIntArray, + AttributeFieldVariant_IntArray, + AttributeFieldVariant_FloatArray, + AttributeFieldVariant_ScaleBias, + AttributeFieldVariant_Size2D, + AttributeFieldVariant_ScalarUnionData, + AttributeFieldVariant_Bool + }; + return values; +} + +inline const char * const *EnumNamesAttributeFieldVariant() { + static const char * const names[15] = { + "NONE", + "Activation", + "ActivationArray", + "UInt32", + "UInt64", + "Int32", + "Float32", + "UIntArray", + "IntArray", + "FloatArray", + "ScaleBias", + "Size2D", + "ScalarUnionData", + "Bool", + nullptr + }; + return names; +} + +inline const char *EnumNameAttributeFieldVariant(AttributeFieldVariant e) { + if (flatbuffers::IsOutRange(e, AttributeFieldVariant_NONE, AttributeFieldVariant_Bool)) return ""; + const size_t index = static_cast(e); + return EnumNamesAttributeFieldVariant()[index]; +} + +template struct AttributeFieldVariantTraits { + static const AttributeFieldVariant enum_value = AttributeFieldVariant_NONE; +}; + +template<> struct AttributeFieldVariantTraits { + static const AttributeFieldVariant enum_value = AttributeFieldVariant_Activation; +}; + +template<> struct AttributeFieldVariantTraits { + static const AttributeFieldVariant enum_value = AttributeFieldVariant_ActivationArray; +}; + +template<> struct AttributeFieldVariantTraits { + static const AttributeFieldVariant enum_value = AttributeFieldVariant_UInt32; +}; + +template<> struct AttributeFieldVariantTraits { + static const AttributeFieldVariant enum_value = AttributeFieldVariant_UInt64; +}; + +template<> struct AttributeFieldVariantTraits { + static const AttributeFieldVariant enum_value = AttributeFieldVariant_Int32; +}; + +template<> struct AttributeFieldVariantTraits { + static const AttributeFieldVariant enum_value = AttributeFieldVariant_Float32; +}; + +template<> struct AttributeFieldVariantTraits { + static const AttributeFieldVariant enum_value = AttributeFieldVariant_UIntArray; +}; + +template<> struct AttributeFieldVariantTraits { + static const AttributeFieldVariant enum_value = AttributeFieldVariant_IntArray; +}; + +template<> struct AttributeFieldVariantTraits { + static const AttributeFieldVariant enum_value = AttributeFieldVariant_FloatArray; +}; + +template<> struct AttributeFieldVariantTraits { + static const AttributeFieldVariant enum_value = AttributeFieldVariant_ScaleBias; +}; + +template<> struct AttributeFieldVariantTraits { + static const AttributeFieldVariant enum_value = AttributeFieldVariant_Size2D; +}; + +template<> struct AttributeFieldVariantTraits { + static const AttributeFieldVariant enum_value = AttributeFieldVariant_ScalarUnionData; +}; + +template<> struct AttributeFieldVariantTraits { + static const AttributeFieldVariant enum_value = AttributeFieldVariant_Bool; +}; + +bool VerifyAttributeFieldVariant(flatbuffers::Verifier &verifier, const void *obj, AttributeFieldVariant type); +bool VerifyAttributeFieldVariantVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector> *values, const flatbuffers::Vector *types); + +enum ScalarVariant { + ScalarVariant_NONE = 0, + ScalarVariant_ByteArray = 1, + ScalarVariant_Int8 = 2, + ScalarVariant_UInt8 = 3, + ScalarVariant_Int16 = 4, + ScalarVariant_UInt16 = 5, + ScalarVariant_Int32 = 6, + ScalarVariant_UInt32 = 7, + ScalarVariant_Int64 = 8, + ScalarVariant_UInt64 = 9, + ScalarVariant_Float32 = 10, + ScalarVariant_Float64 = 11, + ScalarVariant_MIN = ScalarVariant_NONE, + ScalarVariant_MAX = ScalarVariant_Float64 +}; + +inline const ScalarVariant (&EnumValuesScalarVariant())[12] { + static const ScalarVariant values[] = { + ScalarVariant_NONE, + ScalarVariant_ByteArray, + ScalarVariant_Int8, + ScalarVariant_UInt8, + ScalarVariant_Int16, + ScalarVariant_UInt16, + ScalarVariant_Int32, + ScalarVariant_UInt32, + ScalarVariant_Int64, + ScalarVariant_UInt64, + ScalarVariant_Float32, + ScalarVariant_Float64 + }; + return values; +} + +inline const char * const *EnumNamesScalarVariant() { + static const char * const names[13] = { + "NONE", + "ByteArray", + "Int8", + "UInt8", + "Int16", + "UInt16", + "Int32", + "UInt32", + "Int64", + "UInt64", + "Float32", + "Float64", + nullptr + }; + return names; +} + +inline const char *EnumNameScalarVariant(ScalarVariant e) { + if (flatbuffers::IsOutRange(e, ScalarVariant_NONE, ScalarVariant_Float64)) return ""; + const size_t index = static_cast(e); + return EnumNamesScalarVariant()[index]; +} + +template struct ScalarVariantTraits { + static const ScalarVariant enum_value = ScalarVariant_NONE; +}; + +template<> struct ScalarVariantTraits { + static const ScalarVariant enum_value = ScalarVariant_ByteArray; +}; + +template<> struct ScalarVariantTraits { + static const ScalarVariant enum_value = ScalarVariant_Int8; +}; + +template<> struct ScalarVariantTraits { + static const ScalarVariant enum_value = ScalarVariant_UInt8; +}; + +template<> struct ScalarVariantTraits { + static const ScalarVariant enum_value = ScalarVariant_Int16; +}; + +template<> struct ScalarVariantTraits { + static const ScalarVariant enum_value = ScalarVariant_UInt16; +}; + +template<> struct ScalarVariantTraits { + static const ScalarVariant enum_value = ScalarVariant_Int32; +}; + +template<> struct ScalarVariantTraits { + static const ScalarVariant enum_value = ScalarVariant_UInt32; +}; + +template<> struct ScalarVariantTraits { + static const ScalarVariant enum_value = ScalarVariant_Int64; +}; + +template<> struct ScalarVariantTraits { + static const ScalarVariant enum_value = ScalarVariant_UInt64; +}; + +template<> struct ScalarVariantTraits { + static const ScalarVariant enum_value = ScalarVariant_Float32; +}; + +template<> struct ScalarVariantTraits { + static const ScalarVariant enum_value = ScalarVariant_Float64; +}; + +bool VerifyScalarVariant(flatbuffers::Verifier &verifier, const void *obj, ScalarVariant type); +bool VerifyScalarVariantVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector> *values, const flatbuffers::Vector *types); + +FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(1) UInt8 FLATBUFFERS_FINAL_CLASS { + private: + uint8_t data_; + + public: + UInt8() { + memset(static_cast(this), 0, sizeof(UInt8)); + } + UInt8(uint8_t _data) + : data_(flatbuffers::EndianScalar(_data)) { + } + uint8_t data() const { + return flatbuffers::EndianScalar(data_); + } + void mutate_data(uint8_t _data) { + flatbuffers::WriteScalar(&data_, _data); + } +}; +FLATBUFFERS_STRUCT_END(UInt8, 1); + +FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(2) UInt16 FLATBUFFERS_FINAL_CLASS { + private: + uint16_t data_; + + public: + UInt16() { + memset(static_cast(this), 0, sizeof(UInt16)); + } + UInt16(uint16_t _data) + : data_(flatbuffers::EndianScalar(_data)) { + } + uint16_t data() const { + return flatbuffers::EndianScalar(data_); + } + void mutate_data(uint16_t _data) { + flatbuffers::WriteScalar(&data_, _data); + } +}; +FLATBUFFERS_STRUCT_END(UInt16, 2); + +FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(4) UInt32 FLATBUFFERS_FINAL_CLASS { + private: + uint32_t data_; + + public: + UInt32() { + memset(static_cast(this), 0, sizeof(UInt32)); + } + UInt32(uint32_t _data) + : data_(flatbuffers::EndianScalar(_data)) { + } + uint32_t data() const { + return flatbuffers::EndianScalar(data_); + } + void mutate_data(uint32_t _data) { + flatbuffers::WriteScalar(&data_, _data); + } +}; +FLATBUFFERS_STRUCT_END(UInt32, 4); + +FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(8) UInt64 FLATBUFFERS_FINAL_CLASS { + private: + uint64_t data_; + + public: + UInt64() { + memset(static_cast(this), 0, sizeof(UInt64)); + } + UInt64(uint64_t _data) + : data_(flatbuffers::EndianScalar(_data)) { + } + uint64_t data() const { + return flatbuffers::EndianScalar(data_); + } + void mutate_data(uint64_t _data) { + flatbuffers::WriteScalar(&data_, _data); + } +}; +FLATBUFFERS_STRUCT_END(UInt64, 8); + +FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(1) Int8 FLATBUFFERS_FINAL_CLASS { + private: + int8_t data_; + + public: + Int8() { + memset(static_cast(this), 0, sizeof(Int8)); + } + Int8(int8_t _data) + : data_(flatbuffers::EndianScalar(_data)) { + } + int8_t data() const { + return flatbuffers::EndianScalar(data_); + } + void mutate_data(int8_t _data) { + flatbuffers::WriteScalar(&data_, _data); + } +}; +FLATBUFFERS_STRUCT_END(Int8, 1); + +FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(2) Int16 FLATBUFFERS_FINAL_CLASS { + private: + int16_t data_; + + public: + Int16() { + memset(static_cast(this), 0, sizeof(Int16)); + } + Int16(int16_t _data) + : data_(flatbuffers::EndianScalar(_data)) { + } + int16_t data() const { + return flatbuffers::EndianScalar(data_); + } + void mutate_data(int16_t _data) { + flatbuffers::WriteScalar(&data_, _data); + } +}; +FLATBUFFERS_STRUCT_END(Int16, 2); + +FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(4) Int32 FLATBUFFERS_FINAL_CLASS { + private: + int32_t data_; + + public: + Int32() { + memset(static_cast(this), 0, sizeof(Int32)); + } + Int32(int32_t _data) + : data_(flatbuffers::EndianScalar(_data)) { + } + int32_t data() const { + return flatbuffers::EndianScalar(data_); + } + void mutate_data(int32_t _data) { + flatbuffers::WriteScalar(&data_, _data); + } +}; +FLATBUFFERS_STRUCT_END(Int32, 4); + +FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(8) Int64 FLATBUFFERS_FINAL_CLASS { + private: + int64_t data_; + + public: + Int64() { + memset(static_cast(this), 0, sizeof(Int64)); + } + Int64(int64_t _data) + : data_(flatbuffers::EndianScalar(_data)) { + } + int64_t data() const { + return flatbuffers::EndianScalar(data_); + } + void mutate_data(int64_t _data) { + flatbuffers::WriteScalar(&data_, _data); + } +}; +FLATBUFFERS_STRUCT_END(Int64, 8); + +FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(4) Float32 FLATBUFFERS_FINAL_CLASS { + private: + float data_; + + public: + Float32() { + memset(static_cast(this), 0, sizeof(Float32)); + } + Float32(float _data) + : data_(flatbuffers::EndianScalar(_data)) { + } + float data() const { + return flatbuffers::EndianScalar(data_); + } + void mutate_data(float _data) { + flatbuffers::WriteScalar(&data_, _data); + } +}; +FLATBUFFERS_STRUCT_END(Float32, 4); + +FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(8) Float64 FLATBUFFERS_FINAL_CLASS { + private: + double data_; + + public: + Float64() { + memset(static_cast(this), 0, sizeof(Float64)); + } + Float64(double _data) + : data_(flatbuffers::EndianScalar(_data)) { + } + double data() const { + return flatbuffers::EndianScalar(data_); + } + void mutate_data(double _data) { + flatbuffers::WriteScalar(&data_, _data); + } +}; +FLATBUFFERS_STRUCT_END(Float64, 8); + +FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(4) ScaleBias FLATBUFFERS_FINAL_CLASS { + private: + float scale_; + float bias_; + + public: + ScaleBias() { + memset(static_cast(this), 0, sizeof(ScaleBias)); + } + ScaleBias(float _scale, float _bias) + : scale_(flatbuffers::EndianScalar(_scale)), + bias_(flatbuffers::EndianScalar(_bias)) { + } + float scale() const { + return flatbuffers::EndianScalar(scale_); + } + void mutate_scale(float _scale) { + flatbuffers::WriteScalar(&scale_, _scale); + } + float bias() const { + return flatbuffers::EndianScalar(bias_); + } + void mutate_bias(float _bias) { + flatbuffers::WriteScalar(&bias_, _bias); + } +}; +FLATBUFFERS_STRUCT_END(ScaleBias, 8); + +FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(4) Size2D FLATBUFFERS_FINAL_CLASS { + private: + uint32_t width_; + uint32_t height_; + + public: + Size2D() { + memset(static_cast(this), 0, sizeof(Size2D)); + } + Size2D(uint32_t _width, uint32_t _height) + : width_(flatbuffers::EndianScalar(_width)), + height_(flatbuffers::EndianScalar(_height)) { + } + uint32_t width() const { + return flatbuffers::EndianScalar(width_); + } + void mutate_width(uint32_t _width) { + flatbuffers::WriteScalar(&width_, _width); + } + uint32_t height() const { + return flatbuffers::EndianScalar(height_); + } + void mutate_height(uint32_t _height) { + flatbuffers::WriteScalar(&height_, _height); + } +}; +FLATBUFFERS_STRUCT_END(Size2D, 8); + +FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(1) ByteArray FLATBUFFERS_FINAL_CLASS { + private: + uint8_t data_[8]; + + public: + ByteArray() { + memset(static_cast(this), 0, sizeof(ByteArray)); + } + const flatbuffers::Array *data() const { + return reinterpret_cast *>(data_); + } + flatbuffers::Array *mutable_data() { + return reinterpret_cast *>(data_); + } +}; +FLATBUFFERS_STRUCT_END(ByteArray, 8); + +FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(1) Bool FLATBUFFERS_FINAL_CLASS { + private: + uint8_t data_; + + public: + Bool() { + memset(static_cast(this), 0, sizeof(Bool)); + } + Bool(bool _data) + : data_(flatbuffers::EndianScalar(static_cast(_data))) { + } + bool data() const { + return flatbuffers::EndianScalar(data_) != 0; + } + void mutate_data(bool _data) { + flatbuffers::WriteScalar(&data_, static_cast(_data)); + } +}; +FLATBUFFERS_STRUCT_END(Bool, 1); + +struct AttributeDesc FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef AttributeDescBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_NAME = 4, + VT_VAL_TYPE = 6, + VT_VAL = 8 + }; + const flatbuffers::String *name() const { + return GetPointer(VT_NAME); + } + flatbuffers::String *mutable_name() { + return GetPointer(VT_NAME); + } + dml::ir::operatorFieldTypes::AttributeFieldVariant val_type() const { + return static_cast(GetField(VT_VAL_TYPE, 0)); + } + const void *val() const { + return GetPointer(VT_VAL); + } + template const T *val_as() const; + const dml::ir::operatorFieldTypes::Activation *val_as_Activation() const { + return val_type() == dml::ir::operatorFieldTypes::AttributeFieldVariant_Activation ? static_cast(val()) : nullptr; + } + const dml::ir::operatorFieldTypes::ActivationArray *val_as_ActivationArray() const { + return val_type() == dml::ir::operatorFieldTypes::AttributeFieldVariant_ActivationArray ? static_cast(val()) : nullptr; + } + const dml::ir::operatorFieldTypes::UInt32 *val_as_UInt32() const { + return val_type() == dml::ir::operatorFieldTypes::AttributeFieldVariant_UInt32 ? static_cast(val()) : nullptr; + } + const dml::ir::operatorFieldTypes::UInt64 *val_as_UInt64() const { + return val_type() == dml::ir::operatorFieldTypes::AttributeFieldVariant_UInt64 ? static_cast(val()) : nullptr; + } + const dml::ir::operatorFieldTypes::Int32 *val_as_Int32() const { + return val_type() == dml::ir::operatorFieldTypes::AttributeFieldVariant_Int32 ? static_cast(val()) : nullptr; + } + const dml::ir::operatorFieldTypes::Float32 *val_as_Float32() const { + return val_type() == dml::ir::operatorFieldTypes::AttributeFieldVariant_Float32 ? static_cast(val()) : nullptr; + } + const dml::ir::operatorFieldTypes::UIntArray *val_as_UIntArray() const { + return val_type() == dml::ir::operatorFieldTypes::AttributeFieldVariant_UIntArray ? static_cast(val()) : nullptr; + } + const dml::ir::operatorFieldTypes::IntArray *val_as_IntArray() const { + return val_type() == dml::ir::operatorFieldTypes::AttributeFieldVariant_IntArray ? static_cast(val()) : nullptr; + } + const dml::ir::operatorFieldTypes::FloatArray *val_as_FloatArray() const { + return val_type() == dml::ir::operatorFieldTypes::AttributeFieldVariant_FloatArray ? static_cast(val()) : nullptr; + } + const dml::ir::operatorFieldTypes::ScaleBias *val_as_ScaleBias() const { + return val_type() == dml::ir::operatorFieldTypes::AttributeFieldVariant_ScaleBias ? static_cast(val()) : nullptr; + } + const dml::ir::operatorFieldTypes::Size2D *val_as_Size2D() const { + return val_type() == dml::ir::operatorFieldTypes::AttributeFieldVariant_Size2D ? static_cast(val()) : nullptr; + } + const dml::ir::operatorFieldTypes::ScalarUnionData *val_as_ScalarUnionData() const { + return val_type() == dml::ir::operatorFieldTypes::AttributeFieldVariant_ScalarUnionData ? static_cast(val()) : nullptr; + } + const dml::ir::operatorFieldTypes::Bool *val_as_Bool() const { + return val_type() == dml::ir::operatorFieldTypes::AttributeFieldVariant_Bool ? static_cast(val()) : nullptr; + } + void *mutable_val() { + return GetPointer(VT_VAL); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_NAME) && + verifier.VerifyString(name()) && + VerifyField(verifier, VT_VAL_TYPE) && + VerifyOffset(verifier, VT_VAL) && + VerifyAttributeFieldVariant(verifier, val(), val_type()) && + verifier.EndTable(); + } +}; + +template<> inline const dml::ir::operatorFieldTypes::Activation *AttributeDesc::val_as() const { + return val_as_Activation(); +} + +template<> inline const dml::ir::operatorFieldTypes::ActivationArray *AttributeDesc::val_as() const { + return val_as_ActivationArray(); +} + +template<> inline const dml::ir::operatorFieldTypes::UInt32 *AttributeDesc::val_as() const { + return val_as_UInt32(); +} + +template<> inline const dml::ir::operatorFieldTypes::UInt64 *AttributeDesc::val_as() const { + return val_as_UInt64(); +} + +template<> inline const dml::ir::operatorFieldTypes::Int32 *AttributeDesc::val_as() const { + return val_as_Int32(); +} + +template<> inline const dml::ir::operatorFieldTypes::Float32 *AttributeDesc::val_as() const { + return val_as_Float32(); +} + +template<> inline const dml::ir::operatorFieldTypes::UIntArray *AttributeDesc::val_as() const { + return val_as_UIntArray(); +} + +template<> inline const dml::ir::operatorFieldTypes::IntArray *AttributeDesc::val_as() const { + return val_as_IntArray(); +} + +template<> inline const dml::ir::operatorFieldTypes::FloatArray *AttributeDesc::val_as() const { + return val_as_FloatArray(); +} + +template<> inline const dml::ir::operatorFieldTypes::ScaleBias *AttributeDesc::val_as() const { + return val_as_ScaleBias(); +} + +template<> inline const dml::ir::operatorFieldTypes::Size2D *AttributeDesc::val_as() const { + return val_as_Size2D(); +} + +template<> inline const dml::ir::operatorFieldTypes::ScalarUnionData *AttributeDesc::val_as() const { + return val_as_ScalarUnionData(); +} + +template<> inline const dml::ir::operatorFieldTypes::Bool *AttributeDesc::val_as() const { + return val_as_Bool(); +} + +struct AttributeDescBuilder { + typedef AttributeDesc Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_name(flatbuffers::Offset name) { + fbb_.AddOffset(AttributeDesc::VT_NAME, name); + } + void add_val_type(dml::ir::operatorFieldTypes::AttributeFieldVariant val_type) { + fbb_.AddElement(AttributeDesc::VT_VAL_TYPE, static_cast(val_type), 0); + } + void add_val(flatbuffers::Offset val) { + fbb_.AddOffset(AttributeDesc::VT_VAL, val); + } + explicit AttributeDescBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + AttributeDescBuilder &operator=(const AttributeDescBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateAttributeDesc( + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset name = 0, + dml::ir::operatorFieldTypes::AttributeFieldVariant val_type = dml::ir::operatorFieldTypes::AttributeFieldVariant_NONE, + flatbuffers::Offset val = 0) { + AttributeDescBuilder builder_(_fbb); + builder_.add_val(val); + builder_.add_name(name); + builder_.add_val_type(val_type); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateAttributeDescDirect( + flatbuffers::FlatBufferBuilder &_fbb, + const char *name = nullptr, + dml::ir::operatorFieldTypes::AttributeFieldVariant val_type = dml::ir::operatorFieldTypes::AttributeFieldVariant_NONE, + flatbuffers::Offset val = 0) { + auto name__ = name ? _fbb.CreateString(name) : 0; + return dml::ir::operatorFieldTypes::CreateAttributeDesc( + _fbb, + name__, + val_type, + val); +} + +struct Activation FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef ActivationBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_TYPE = 4, + VT_ATTRIBUTES = 6 + }; + const flatbuffers::String *type() const { + return GetPointer(VT_TYPE); + } + flatbuffers::String *mutable_type() { + return GetPointer(VT_TYPE); + } + const flatbuffers::Vector> *attributes() const { + return GetPointer> *>(VT_ATTRIBUTES); + } + flatbuffers::Vector> *mutable_attributes() { + return GetPointer> *>(VT_ATTRIBUTES); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_TYPE) && + verifier.VerifyString(type()) && + VerifyOffset(verifier, VT_ATTRIBUTES) && + verifier.VerifyVector(attributes()) && + verifier.VerifyVectorOfTables(attributes()) && + verifier.EndTable(); + } +}; + +struct ActivationBuilder { + typedef Activation Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_type(flatbuffers::Offset type) { + fbb_.AddOffset(Activation::VT_TYPE, type); + } + void add_attributes(flatbuffers::Offset>> attributes) { + fbb_.AddOffset(Activation::VT_ATTRIBUTES, attributes); + } + explicit ActivationBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ActivationBuilder &operator=(const ActivationBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateActivation( + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset type = 0, + flatbuffers::Offset>> attributes = 0) { + ActivationBuilder builder_(_fbb); + builder_.add_attributes(attributes); + builder_.add_type(type); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateActivationDirect( + flatbuffers::FlatBufferBuilder &_fbb, + const char *type = nullptr, + const std::vector> *attributes = nullptr) { + auto type__ = type ? _fbb.CreateString(type) : 0; + auto attributes__ = attributes ? _fbb.CreateVector>(*attributes) : 0; + return dml::ir::operatorFieldTypes::CreateActivation( + _fbb, + type__, + attributes__); +} + +struct ActivationArray FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef ActivationArrayBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_DATA = 4 + }; + const flatbuffers::Vector> *data() const { + return GetPointer> *>(VT_DATA); + } + flatbuffers::Vector> *mutable_data() { + return GetPointer> *>(VT_DATA); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_DATA) && + verifier.VerifyVector(data()) && + verifier.VerifyVectorOfTables(data()) && + verifier.EndTable(); + } +}; + +struct ActivationArrayBuilder { + typedef ActivationArray Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_data(flatbuffers::Offset>> data) { + fbb_.AddOffset(ActivationArray::VT_DATA, data); + } + explicit ActivationArrayBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ActivationArrayBuilder &operator=(const ActivationArrayBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateActivationArray( + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset>> data = 0) { + ActivationArrayBuilder builder_(_fbb); + builder_.add_data(data); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateActivationArrayDirect( + flatbuffers::FlatBufferBuilder &_fbb, + const std::vector> *data = nullptr) { + auto data__ = data ? _fbb.CreateVector>(*data) : 0; + return dml::ir::operatorFieldTypes::CreateActivationArray( + _fbb, + data__); +} + +struct UIntArray FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef UIntArrayBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_DATA = 4 + }; + const flatbuffers::Vector *data() const { + return GetPointer *>(VT_DATA); + } + flatbuffers::Vector *mutable_data() { + return GetPointer *>(VT_DATA); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_DATA) && + verifier.VerifyVector(data()) && + verifier.EndTable(); + } +}; + +struct UIntArrayBuilder { + typedef UIntArray Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_data(flatbuffers::Offset> data) { + fbb_.AddOffset(UIntArray::VT_DATA, data); + } + explicit UIntArrayBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + UIntArrayBuilder &operator=(const UIntArrayBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateUIntArray( + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset> data = 0) { + UIntArrayBuilder builder_(_fbb); + builder_.add_data(data); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateUIntArrayDirect( + flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *data = nullptr) { + auto data__ = data ? _fbb.CreateVector(*data) : 0; + return dml::ir::operatorFieldTypes::CreateUIntArray( + _fbb, + data__); +} + +struct IntArray FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef IntArrayBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_DATA = 4 + }; + const flatbuffers::Vector *data() const { + return GetPointer *>(VT_DATA); + } + flatbuffers::Vector *mutable_data() { + return GetPointer *>(VT_DATA); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_DATA) && + verifier.VerifyVector(data()) && + verifier.EndTable(); + } +}; + +struct IntArrayBuilder { + typedef IntArray Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_data(flatbuffers::Offset> data) { + fbb_.AddOffset(IntArray::VT_DATA, data); + } + explicit IntArrayBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + IntArrayBuilder &operator=(const IntArrayBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateIntArray( + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset> data = 0) { + IntArrayBuilder builder_(_fbb); + builder_.add_data(data); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateIntArrayDirect( + flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *data = nullptr) { + auto data__ = data ? _fbb.CreateVector(*data) : 0; + return dml::ir::operatorFieldTypes::CreateIntArray( + _fbb, + data__); +} + +struct FloatArray FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef FloatArrayBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_DATA = 4 + }; + const flatbuffers::Vector *data() const { + return GetPointer *>(VT_DATA); + } + flatbuffers::Vector *mutable_data() { + return GetPointer *>(VT_DATA); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_DATA) && + verifier.VerifyVector(data()) && + verifier.EndTable(); + } +}; + +struct FloatArrayBuilder { + typedef FloatArray Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_data(flatbuffers::Offset> data) { + fbb_.AddOffset(FloatArray::VT_DATA, data); + } + explicit FloatArrayBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + FloatArrayBuilder &operator=(const FloatArrayBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateFloatArray( + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset> data = 0) { + FloatArrayBuilder builder_(_fbb); + builder_.add_data(data); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateFloatArrayDirect( + flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *data = nullptr) { + auto data__ = data ? _fbb.CreateVector(*data) : 0; + return dml::ir::operatorFieldTypes::CreateFloatArray( + _fbb, + data__); +} + +struct ScalarUnionData FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef ScalarUnionDataBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_DATA_TYPE = 4, + VT_DATA = 6 + }; + dml::ir::operatorFieldTypes::ScalarVariant data_type() const { + return static_cast(GetField(VT_DATA_TYPE, 0)); + } + const void *data() const { + return GetPointer(VT_DATA); + } + template const T *data_as() const; + const dml::ir::operatorFieldTypes::ByteArray *data_as_ByteArray() const { + return data_type() == dml::ir::operatorFieldTypes::ScalarVariant_ByteArray ? static_cast(data()) : nullptr; + } + const dml::ir::operatorFieldTypes::Int8 *data_as_Int8() const { + return data_type() == dml::ir::operatorFieldTypes::ScalarVariant_Int8 ? static_cast(data()) : nullptr; + } + const dml::ir::operatorFieldTypes::UInt8 *data_as_UInt8() const { + return data_type() == dml::ir::operatorFieldTypes::ScalarVariant_UInt8 ? static_cast(data()) : nullptr; + } + const dml::ir::operatorFieldTypes::Int16 *data_as_Int16() const { + return data_type() == dml::ir::operatorFieldTypes::ScalarVariant_Int16 ? static_cast(data()) : nullptr; + } + const dml::ir::operatorFieldTypes::UInt16 *data_as_UInt16() const { + return data_type() == dml::ir::operatorFieldTypes::ScalarVariant_UInt16 ? static_cast(data()) : nullptr; + } + const dml::ir::operatorFieldTypes::Int32 *data_as_Int32() const { + return data_type() == dml::ir::operatorFieldTypes::ScalarVariant_Int32 ? static_cast(data()) : nullptr; + } + const dml::ir::operatorFieldTypes::UInt32 *data_as_UInt32() const { + return data_type() == dml::ir::operatorFieldTypes::ScalarVariant_UInt32 ? static_cast(data()) : nullptr; + } + const dml::ir::operatorFieldTypes::Int64 *data_as_Int64() const { + return data_type() == dml::ir::operatorFieldTypes::ScalarVariant_Int64 ? static_cast(data()) : nullptr; + } + const dml::ir::operatorFieldTypes::UInt64 *data_as_UInt64() const { + return data_type() == dml::ir::operatorFieldTypes::ScalarVariant_UInt64 ? static_cast(data()) : nullptr; + } + const dml::ir::operatorFieldTypes::Float32 *data_as_Float32() const { + return data_type() == dml::ir::operatorFieldTypes::ScalarVariant_Float32 ? static_cast(data()) : nullptr; + } + const dml::ir::operatorFieldTypes::Float64 *data_as_Float64() const { + return data_type() == dml::ir::operatorFieldTypes::ScalarVariant_Float64 ? static_cast(data()) : nullptr; + } + void *mutable_data() { + return GetPointer(VT_DATA); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_DATA_TYPE) && + VerifyOffset(verifier, VT_DATA) && + VerifyScalarVariant(verifier, data(), data_type()) && + verifier.EndTable(); + } +}; + +template<> inline const dml::ir::operatorFieldTypes::ByteArray *ScalarUnionData::data_as() const { + return data_as_ByteArray(); +} + +template<> inline const dml::ir::operatorFieldTypes::Int8 *ScalarUnionData::data_as() const { + return data_as_Int8(); +} + +template<> inline const dml::ir::operatorFieldTypes::UInt8 *ScalarUnionData::data_as() const { + return data_as_UInt8(); +} + +template<> inline const dml::ir::operatorFieldTypes::Int16 *ScalarUnionData::data_as() const { + return data_as_Int16(); +} + +template<> inline const dml::ir::operatorFieldTypes::UInt16 *ScalarUnionData::data_as() const { + return data_as_UInt16(); +} + +template<> inline const dml::ir::operatorFieldTypes::Int32 *ScalarUnionData::data_as() const { + return data_as_Int32(); +} + +template<> inline const dml::ir::operatorFieldTypes::UInt32 *ScalarUnionData::data_as() const { + return data_as_UInt32(); +} + +template<> inline const dml::ir::operatorFieldTypes::Int64 *ScalarUnionData::data_as() const { + return data_as_Int64(); +} + +template<> inline const dml::ir::operatorFieldTypes::UInt64 *ScalarUnionData::data_as() const { + return data_as_UInt64(); +} + +template<> inline const dml::ir::operatorFieldTypes::Float32 *ScalarUnionData::data_as() const { + return data_as_Float32(); +} + +template<> inline const dml::ir::operatorFieldTypes::Float64 *ScalarUnionData::data_as() const { + return data_as_Float64(); +} + +struct ScalarUnionDataBuilder { + typedef ScalarUnionData Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_data_type(dml::ir::operatorFieldTypes::ScalarVariant data_type) { + fbb_.AddElement(ScalarUnionData::VT_DATA_TYPE, static_cast(data_type), 0); + } + void add_data(flatbuffers::Offset data) { + fbb_.AddOffset(ScalarUnionData::VT_DATA, data); + } + explicit ScalarUnionDataBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ScalarUnionDataBuilder &operator=(const ScalarUnionDataBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateScalarUnionData( + flatbuffers::FlatBufferBuilder &_fbb, + dml::ir::operatorFieldTypes::ScalarVariant data_type = dml::ir::operatorFieldTypes::ScalarVariant_NONE, + flatbuffers::Offset data = 0) { + ScalarUnionDataBuilder builder_(_fbb); + builder_.add_data(data); + builder_.add_data_type(data_type); + return builder_.Finish(); +} + +inline bool VerifyAttributeFieldVariant(flatbuffers::Verifier &verifier, const void *obj, AttributeFieldVariant type) { + switch (type) { + case AttributeFieldVariant_NONE: { + return true; + } + case AttributeFieldVariant_Activation: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case AttributeFieldVariant_ActivationArray: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case AttributeFieldVariant_UInt32: { + return verifier.Verify(static_cast(obj), 0); + } + case AttributeFieldVariant_UInt64: { + return verifier.Verify(static_cast(obj), 0); + } + case AttributeFieldVariant_Int32: { + return verifier.Verify(static_cast(obj), 0); + } + case AttributeFieldVariant_Float32: { + return verifier.Verify(static_cast(obj), 0); + } + case AttributeFieldVariant_UIntArray: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case AttributeFieldVariant_IntArray: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case AttributeFieldVariant_FloatArray: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case AttributeFieldVariant_ScaleBias: { + return verifier.Verify(static_cast(obj), 0); + } + case AttributeFieldVariant_Size2D: { + return verifier.Verify(static_cast(obj), 0); + } + case AttributeFieldVariant_ScalarUnionData: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case AttributeFieldVariant_Bool: { + return verifier.Verify(static_cast(obj), 0); + } + default: return true; + } +} + +inline bool VerifyAttributeFieldVariantVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector> *values, const flatbuffers::Vector *types) { + if (!values || !types) return !values && !types; + if (values->size() != types->size()) return false; + for (flatbuffers::uoffset_t i = 0; i < values->size(); ++i) { + if (!VerifyAttributeFieldVariant( + verifier, values->Get(i), types->GetEnum(i))) { + return false; + } + } + return true; +} + +inline bool VerifyScalarVariant(flatbuffers::Verifier &verifier, const void *obj, ScalarVariant type) { + switch (type) { + case ScalarVariant_NONE: { + return true; + } + case ScalarVariant_ByteArray: { + return verifier.Verify(static_cast(obj), 0); + } + case ScalarVariant_Int8: { + return verifier.Verify(static_cast(obj), 0); + } + case ScalarVariant_UInt8: { + return verifier.Verify(static_cast(obj), 0); + } + case ScalarVariant_Int16: { + return verifier.Verify(static_cast(obj), 0); + } + case ScalarVariant_UInt16: { + return verifier.Verify(static_cast(obj), 0); + } + case ScalarVariant_Int32: { + return verifier.Verify(static_cast(obj), 0); + } + case ScalarVariant_UInt32: { + return verifier.Verify(static_cast(obj), 0); + } + case ScalarVariant_Int64: { + return verifier.Verify(static_cast(obj), 0); + } + case ScalarVariant_UInt64: { + return verifier.Verify(static_cast(obj), 0); + } + case ScalarVariant_Float32: { + return verifier.Verify(static_cast(obj), 0); + } + case ScalarVariant_Float64: { + return verifier.Verify(static_cast(obj), 0); + } + default: return true; + } +} + +inline bool VerifyScalarVariantVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector> *values, const flatbuffers::Vector *types) { + if (!values || !types) return !values && !types; + if (values->size() != types->size()) return false; + for (flatbuffers::uoffset_t i = 0; i < values->size(); ++i) { + if (!VerifyScalarVariant( + verifier, values->Get(i), types->GetEnum(i))) { + return false; + } + } + return true; +} + +} // namespace operatorFieldTypes +} // namespace ir +} // namespace dml + +#endif // FLATBUFFERS_GENERATED_OPERATORFIELDTYPES_DML_IR_OPERATORFIELDTYPES_H_ diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/SchemaHelpers.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/SchemaHelpers.h index 5285481485184..1bc694dfe90c2 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/SchemaHelpers.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/SchemaHelpers.h @@ -26,14 +26,14 @@ namespace SchemaHelpers return field; } - inline OperatorFieldTypes::OperatorDesc ToOperatorFieldType(const DML_OPERATOR_DESC* value) + inline OperatorFieldTypes::FusedActivationOperatorDesc ToOperatorFieldType(const DML_OPERATOR_DESC* value) { - return value ? OperatorFieldTypes::OperatorDesc(ConvertOperatorDesc(*value)) : std::nullopt; + return value ? OperatorFieldTypes::FusedActivationOperatorDesc(ConvertOperatorDesc(*value)) : std::nullopt; } - inline OperatorFieldTypes::OperatorDescArray ToOperatorFieldType(const DML_OPERATOR_DESC* values, uint32_t count) + inline OperatorFieldTypes::FusedActivationOperatorDescArray ToOperatorFieldType(const DML_OPERATOR_DESC* values, uint32_t count) { - OperatorFieldTypes::OperatorDescArray field; + OperatorFieldTypes::FusedActivationOperatorDescArray field; if (values && count != 0) { field.emplace(count); @@ -65,13 +65,17 @@ namespace SchemaHelpers return value; } + inline OperatorFieldTypes::Bool ToOperatorFieldType(bool value) + { + return value; + } + inline OperatorFieldTypes::UIntArray ToOperatorFieldType(const uint32_t* values, uint32_t count) { OperatorFieldTypes::UIntArray field; if (values && count != 0) { - field.emplace(count); - std::copy_n(values, count, field->begin()); + field.assign(values, values + count); } return field; } @@ -81,8 +85,7 @@ namespace SchemaHelpers OperatorFieldTypes::IntArray field; if (values && count != 0) { - field.emplace(count); - std::copy_n(values, count, field->begin()); + field.assign(values, values + count); } return field; } @@ -92,8 +95,7 @@ namespace SchemaHelpers OperatorFieldTypes::FloatArray field; if (values && count != 0) { - field.emplace(count); - std::copy_n(values, count, field->begin()); + field.assign(values, values + count); } return field; } @@ -237,7 +239,7 @@ namespace SchemaHelpers { DML_OPERATOR_DESC* desc = nullptr; - const auto& value = field.AsOperatorDesc(); + const auto& value = field.AsFusedActivationOperatorDesc(); if (value) { desc = allocator->template Allocate(); @@ -251,7 +253,7 @@ namespace SchemaHelpers { DML_OPERATOR_DESC* descs = nullptr; - const auto& values = field.AsOperatorDescArray(); + const auto& values = field.AsFusedActivationOperatorDescArray(); if (values) { descs = allocator->template Allocate(values->size()); @@ -288,16 +290,20 @@ namespace SchemaHelpers dst->Write(value); } break; + case DML_SCHEMA_FIELD_TYPE_BOOL: + { + // OperatorFieldTypes::Bool is a 'bool' (1 byte) but written as 'BOOL' in op descs (4 bytes). + BOOL value = static_cast(field.AsBool()); + dst->Write(value); + } break; + case DML_SCHEMA_FIELD_TYPE_UINT_ARRAY: { uint32_t* arrayPtr = nullptr; const auto& values = field.AsUIntArray(); - if (values) - { - arrayPtr = allocator->template Allocate(values->size()); - std::copy(values->begin(), values->end(), arrayPtr); - } + arrayPtr = allocator->template Allocate(values.size()); + std::copy(values.begin(), values.end(), arrayPtr); dst->Write(arrayPtr); } break; @@ -307,11 +313,8 @@ namespace SchemaHelpers int32_t* arrayPtr = nullptr; const auto& values = field.AsIntArray(); - if (values) - { - arrayPtr = allocator->template Allocate(values->size()); - std::copy(values->begin(), values->end(), arrayPtr); - } + arrayPtr = allocator->template Allocate(values.size()); + std::copy(values.begin(), values.end(), arrayPtr); dst->Write(arrayPtr); } break; @@ -321,11 +324,8 @@ namespace SchemaHelpers float* arrayPtr = nullptr; const auto& values = field.AsFloatArray(); - if (values) - { - arrayPtr = allocator->template Allocate(values->size()); - std::copy(values->begin(), values->end(), arrayPtr); - } + arrayPtr = allocator->template Allocate(values.size()); + std::copy(values.begin(), values.end(), arrayPtr); dst->Write(arrayPtr); } break; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp index 2456b396de3f6..e6f008af5c23f 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp @@ -33,10 +33,10 @@ namespace Dml::GraphDescBuilder #pragma warning(pop) static void RemoveUnconnectedNodes( - std::vector& graphNodes, - std::vector& graphInputEdges, - std::vector& graphIntermediateEdges, - std::vector& graphOutputEdges) + std::vector& graphNodes, + std::vector& graphInputEdges, + std::vector& graphIntermediateEdges, + std::vector& graphOutputEdges) { enum class NodeState { @@ -52,7 +52,7 @@ namespace Dml::GraphDescBuilder }; std::vector nodesData(graphNodes.size()); - for (const DML_INTERMEDIATE_GRAPH_EDGE_DESC& intermediateEdge : graphIntermediateEdges) + for (const DmlIntermediateSerializedGraphEdge& intermediateEdge : graphIntermediateEdges) { nodesData[intermediateEdge.ToNodeIndex].predecessorIndices.push_back(intermediateEdge.FromNodeIndex); } @@ -60,7 +60,7 @@ namespace Dml::GraphDescBuilder std::stack nodeIndicesToVisit; // Start from the outputs of the graph and traverse upwards - for (const DML_OUTPUT_GRAPH_EDGE_DESC& outputEdge : graphOutputEdges) + for (const DmlOutputSerializedGraphEdge& outputEdge : graphOutputEdges) { nodeIndicesToVisit.push(outputEdge.FromNodeIndex); } @@ -143,17 +143,44 @@ namespace Dml::GraphDescBuilder } } + + uint32_t SetAndGetDmlGraphNodeIndex( + const uint32_t operatorDmlGraphNodeIndex, + const std::string& nodeNamePrefix, + AbstractOperatorDesc& operatorDesc, + /*in_out*/std::unordered_map& operatorDmlGraphToDmlGraphNodeIndexMap, + /*in_out*/std::vector& dmlGraphNodes) + { + auto iter = operatorDmlGraphToDmlGraphNodeIndexMap.find(operatorDmlGraphNodeIndex); + if (iter != operatorDmlGraphToDmlGraphNodeIndexMap.end()) + { + return iter->second; + } + operatorDmlGraphToDmlGraphNodeIndexMap[operatorDmlGraphNodeIndex] = static_cast(dmlGraphNodes.size()); + dmlGraphNodes.push_back({operatorDesc, nodeNamePrefix + std::to_string(operatorDmlGraphNodeIndex)}); + return operatorDmlGraphToDmlGraphNodeIndexMap[operatorDmlGraphNodeIndex]; + } + + // Terminology: + // Subgraph: partitioned ONNX graph from the original (main) ONNX graph + // DmlGraph: a graph in DML currency converted from subgraph. + // operatorDmlGraph: a graph in DML currency for a given node or operator + // Main Points to note: + // - GraphDesc will always has sequential indices for input and intermediate edges. + // - 1 onnx node can be converted to one or more dml nodes. GraphDesc BuildGraphDesc( const uint8_t* isConstGpuGraphInput, const size_t isConstGpuGraphInputCount, const std::unordered_map>& isInitializerTransferable, const std::unordered_map& graphNodePropertyMap, - IDMLDevice* device, const ExecutionProviderImpl* executionHandle, const onnxruntime::Path& modelPath, gsl::span subgraphNodes, gsl::span subgraphInputs, - gsl::span subgraphOutputs) + gsl::span subgraphOutputs, + /*out*/ std::unordered_map& serializedGraphInputIndexToSubgraphInputIndex, + /*out*/ std::unordered_map& serializedGraphLargeConstantNameToSubgraphInputIndex, + /*out*/ std::vector>& smallConstantData) { struct NodeAndIndex { @@ -161,19 +188,34 @@ namespace Dml::GraphDescBuilder uint32_t targetIndex; // The index of the input/output on the node (e.g. 1 for the second input on a node) }; - // Map from Lotus node argument names to the new node and index where it will be produced - std::unordered_map nameToNodeAndIndexMap; - std::unordered_map nodeOutputShapes; - // Map from Lotus node argument names to input indices of the fused kernel node. - std::unordered_map nameToDmlFusedNodeInputIndex; + // Map from ORT subgraph input names to indices + std::unordered_map subgraphInputNameToIndexMap; + + // - Map from ORT node's output names to DmlGraph . + // - Once a given ORT node (or operator) will be transformed into a operatorDmlGraph, + // then ORT node's output names will become output edges for the operatorDmlGraph. + // - This map will be populated for those output edges. + std::unordered_map dmlGraphNodeOutputNameToNodeAndIndexMap; + + // This map will be used to re-index an subGraphInputIndex to sequential input index + // for DmlGraph + std::unordered_map subGraphInputIndexToDmlGraphInputIndex; + + // Iterate through each node and create a corresponding node in the new graph + // We can iterate the nodes in any order because the edge connectivity will take care of the topological order + std::unordered_map> inferredOutputShapes; + + std::vector dmlGraphNodes; + std::vector dmlGraphInputEdges; + std::vector dmlGraphIntermediateEdges; + std::vector dmlGraphOutputEdges; for (size_t inputIndex = 0; inputIndex < subgraphInputs.size(); ++inputIndex) { - const onnxruntime::NodeArg* graphInput = subgraphInputs[inputIndex]; - - if (!graphInput) + const onnxruntime::NodeArg* subgraphInput = subgraphInputs[inputIndex]; + if (!subgraphInput) { // This is a workaround for when node inputs get manipulated by transformers outside of our control, // which then causes them to have a different name. If that happens we can't figure out how to @@ -181,45 +223,21 @@ namespace Dml::GraphDescBuilder // just bail early. ORT_THROW_HR(E_UNEXPECTED); } - - nameToDmlFusedNodeInputIndex.emplace(graphInput->Name(), gsl::narrow_cast(inputIndex)); - } - - StackAllocator<1024> allocator; // Used for converting abstract operator descs into DML_OPERATOR_DESC - - std::vector graphNodes; - std::vector graphInputEdges; - std::vector graphIntermediateEdges; - std::vector graphOutputEdges; - - // Avoid using separate command lists for small graphs. This value can be reduced by tuning the - // flushing behavior of DmlCommandRecorder. Its current behavior is to assume that graphs contain - // enough GPU work to be worth flushing immediately. - const uint32_t minNodeCountToReuseCommandList = 5; - bool reuseCommandList = false; - - if (subgraphNodes.size() >= minNodeCountToReuseCommandList || executionHandle->IsMcdmDevice()) - { - reuseCommandList = true; + subgraphInputNameToIndexMap.emplace(subgraphInput->Name(), gsl::narrow_cast(inputIndex)); } auto constantCpuGraphInputGetter = [&isInitializerTransferable, &modelPath](const std::string& argName) { ComPtr tensorWrapper; - auto iter = isInitializerTransferable.find(argName); if (iter != isInitializerTransferable.end()) { // Using const_cast here is simpler than making surrounding code const correct. tensorWrapper = wil::MakeOrThrow(const_cast(iter->second.first), modelPath); } - return tensorWrapper; }; - // Iterate through each node and create a corresponding node in the new graph - // We can iterate the nodes in any order because the edge connectivity will take care of the topological order - std::unordered_map> inferredOutputShapes; for (const onnxruntime::Node* subgraphNode : subgraphNodes) { @@ -277,195 +295,206 @@ namespace Dml::GraphDescBuilder } EdgeShapes outputShapes; - DmlGraphNodeCreateInfo graphNodeCreateInfo; + DmlGraphNodeCreateInfo operatorDmlGraphCreateInfo; graphNodeProps.internalRegInfo->graphNodeFactoryRegistration->factory( node, constantCpuNodeInputGetter, executionHandle, &inputShapesOverrides, /*out*/ &outputShapes, - /*out*/ &graphNodeCreateInfo + /*out*/ &operatorDmlGraphCreateInfo ); ORT_THROW_HR_IF(E_UNEXPECTED, outputShapes.EdgeCount() != node.OutputDefs().size()); for (int i = 0; i < node.OutputDefs().size(); ++i) { inferredOutputShapes[node.OutputDefs()[i]->Name()] = outputShapes.GetShape(i); - } - - // Create a map between operatorGraphNodeIndex to mainGraphNodeIndex. - std::unordered_map operatorGraphNodeIndexToMainGraphNodeIndexMap; - uint32_t graphNodeCount = gsl::narrow_cast(graphNodes.size()); - const bool isNodeAsOpDesc = graphNodeCreateInfo.nodesAsOperatorDesc.size() > 0; - size_t firstOpDescGraphNodeIndex = graphNodes.size(); - - if (isNodeAsOpDesc) + } + + // Algorithm: + // 1. Create constant nodes by iterating through operatorDmlGraph's input edges and keep a map of it, + // because there would be an intermediate edge from the constantNode and source of the intermediate edge + // should come before the destination. + // 2. Again iterate through operatorDmlGraph's input edges to create mainGraph's input and intermediate edges. + // 3. Iterate through operatorDmlGraph's intermediate edges to create mainGraph's intermediate edges. + // 4. Iterate through operatorDmlGraph's output edges to populate outputEdgeNameToDmlGraphNodeAndIndex + // 5. While performing step 2, 3, and 4, insert operatorDmlGraphNode to the mainDmlGraphNode list. + + for (auto& operatorDmlGraphInputEdge : operatorDmlGraphCreateInfo.inputEdges) { - // Can't populate graphNodes vector at this point, because operatorDesc may get modified later. - for (uint32_t nodeIndex = 0; nodeIndex < graphNodeCreateInfo.nodeCount; nodeIndex++) + const onnxruntime::NodeArg* arg = node.InputDefs()[operatorDmlGraphInputEdge.GraphInputIndex]; + if (arg->Exists()) { - ORT_THROW_HR_IF(E_UNEXPECTED, !graphNodeCreateInfo.nodesAsOperatorDesc[nodeIndex]); - operatorGraphNodeIndexToMainGraphNodeIndexMap.emplace(nodeIndex, graphNodeCount++); - } + auto iter = subgraphInputNameToIndexMap.find(arg->Name()); + if (iter != subgraphInputNameToIndexMap.end() && + iter->second < isConstGpuGraphInputCount && + isConstGpuGraphInput[iter->second]) + { + DmlSerializedGraphNode constantNode = {}; + constantNode.Name = arg->Name(); + + // This is a highly inefficient approach to generating constant nodes. It duplicates constant data + // across the graph input as well as every consumer's unique constant node. However it is currently + // only used for small inputs. + auto& operatorDmlGraphInputNode = operatorDmlGraphCreateInfo.nodes[operatorDmlGraphInputEdge.ToNodeIndex]; + std::vector toNodeInputTensorDescs = operatorDmlGraphInputNode->GetInputTensors(); + DmlBufferTensorDesc* tensorDesc = toNodeInputTensorDescs[operatorDmlGraphInputEdge.ToNodeInputIndex]; + ComPtr constantInput; + + if (tensorDesc->totalTensorSizeInBytes < c_maxConstNodeDataSize) + { + constantInput = constantCpuGraphInputGetter(arg->Name()); + } - graphNodes.resize(graphNodes.size() + graphNodeCreateInfo.nodeCount); - } - else - { - for (uint32_t nodeIndex = 0; nodeIndex < graphNodeCreateInfo.nodeCount; nodeIndex++) - { - ORT_THROW_HR_IF(E_UNEXPECTED, !graphNodeCreateInfo.nodesAsIDMLOperator[nodeIndex].Get()); - operatorGraphNodeIndexToMainGraphNodeIndexMap.emplace(nodeIndex, graphNodeCount++); - NodeInfo nodeInfo = {}; - nodeInfo.nodeDef = std::move(graphNodeCreateInfo.nodesAsIDMLOperator[nodeIndex]); - graphNodes.push_back(std::move(nodeInfo)); + if (constantInput) + { + // The tensor description's size should be no larger than the constant input unless it was rounded to + // the required alignment. + assert(((constantInput->GetTensorByteSize() + 3) & ~3) >= tensorDesc->totalTensorSizeInBytes); + size_t minimumConstantSize = std::min(constantInput->GetTensorByteSize(), gsl::narrow_cast(tensorDesc->totalTensorSizeInBytes)); + auto data = static_cast(constantInput->GetData()); + std::vector tensorData(data, data + minimumConstantSize); + + smallConstantData.push_back(std::make_unique(tensorData.size())); + std::transform(tensorData.begin(), tensorData.end(), smallConstantData.back().get(), [](uint8_t b) {return static_cast(b);}); + + ConstantData constantData = {smallConstantData.back().get(), tensorData.size()}; + constantNode.Desc = constantData; + } + else + { + ConstantName constantFileName = {GetSanitizedFileName(arg->Name())}; + constantNode.Desc = constantFileName; + } + dmlGraphNodeOutputNameToNodeAndIndexMap[arg->Name()] = {static_cast(dmlGraphNodes.size()), 0}; + dmlGraphNodes.push_back(constantNode); + } } } - // map operatorGraphInputEdge as either mainGraphInputEdge or mainGraphIntermediateEdge - for (auto& operatorGraphInputEdge : graphNodeCreateInfo.inputEdges) - { - // operatorGraphInputEdge.GraphInputIndex will be the ONNX input index. - const onnxruntime::NodeArg* arg = node.InputDefs()[operatorGraphInputEdge.GraphInputIndex]; + // Create a map between operatorGraphNodeIndex to dmlGraphNodeIndex. + std::unordered_map operatorDmlGraphToDmlGraphNodeIndexMap; + // map operatorDmlGraphInputEdge as either mainDmlGraphInputEdge or mainDmlGraphIntermediateEdge + for (auto& operatorDmlGraphInputEdge : operatorDmlGraphCreateInfo.inputEdges) + { + // operatorDmlGraphInputEdge.GraphInputIndex will be the ONNX input index. + const onnxruntime::NodeArg* arg = node.InputDefs()[operatorDmlGraphInputEdge.GraphInputIndex]; if (arg->Exists()) { - auto iter = nameToDmlFusedNodeInputIndex.find(arg->Name()); - uint32_t mainGraphNodeIndex = operatorGraphNodeIndexToMainGraphNodeIndexMap[operatorGraphInputEdge.ToNodeIndex]; - - if (iter != nameToDmlFusedNodeInputIndex.end()) + uint32_t dmlGraphNodeIndex = SetAndGetDmlGraphNodeIndex( + operatorDmlGraphInputEdge.ToNodeIndex, + node.Name(), + *operatorDmlGraphCreateInfo.nodes[operatorDmlGraphInputEdge.ToNodeIndex], + operatorDmlGraphToDmlGraphNodeIndexMap, + dmlGraphNodes); + + auto iter = subgraphInputNameToIndexMap.find(arg->Name()); + if (iter != subgraphInputNameToIndexMap.end()) { - // This is a graph input - - const uint32_t dmlFusedNodeInputIndex = iter->second; - - // If this is a constant input, set the appropriate flags on the desc - if (isNodeAsOpDesc && - dmlFusedNodeInputIndex < isConstGpuGraphInputCount && - isConstGpuGraphInput[dmlFusedNodeInputIndex]) + const uint32_t subgraphInputIndex = iter->second; + + // Either this edge will be + // a constant input, then it will be an intermediate edge and + // set the OWNED_BY_DML flag if it is large constant + // or, + // a non-constant input, then it will be a mainDmlGraphInputEdge. + if (subgraphInputIndex < isConstGpuGraphInputCount && + isConstGpuGraphInput[subgraphInputIndex]) { - // This is a highly inefficient approach to generating constant nodes. It duplicates constant data - // across the graph input as well as every consumer's unique constant node. However it is currently - // only used for small inputs. - uint32_t c_maxConstNodeDataSize = 8; - - - auto& operatorGraphInputNode = graphNodeCreateInfo.nodesAsOperatorDesc[operatorGraphInputEdge.ToNodeIndex]; - std::vector toNodeInputTensorDescs = operatorGraphInputNode->GetInputTensors(); - DmlBufferTensorDesc* tensorDesc = toNodeInputTensorDescs[operatorGraphInputEdge.ToNodeInputIndex]; - ComPtr constantInput; - - if (tensorDesc->totalTensorSizeInBytes < c_maxConstNodeDataSize) - { - constantInput = constantCpuGraphInputGetter(arg->Name()); - } - - if (constantInput) - { - // The tensor description's size should be no larger than the constant input unless it was rounded to - // the required alignment. - assert(((constantInput->GetTensorByteSize() + 3) & ~3) >= tensorDesc->totalTensorSizeInBytes); - size_t minimumConstantSize = std::min(constantInput->GetTensorByteSize(), gsl::narrow_cast(tensorDesc->totalTensorSizeInBytes)); - auto data = static_cast(constantInput->GetData()); - std::vector tensorData(data, data + minimumConstantSize); - - NodeInfo nodeInfo = {}; - nodeInfo.nodeDef = std::move(tensorData); - graphNodes.push_back(std::move(nodeInfo)); - - DML_INTERMEDIATE_GRAPH_EDGE_DESC edge = {}; - edge.FromNodeIndex = static_cast(graphNodes.size() - 1); - edge.FromNodeOutputIndex = 0; - edge.ToNodeIndex = mainGraphNodeIndex; - edge.ToNodeInputIndex = operatorGraphInputEdge.ToNodeInputIndex; - graphIntermediateEdges.push_back(edge); - } - else + const auto& constantNodeAndIndex = dmlGraphNodeOutputNameToNodeAndIndexMap.at(arg->Name()); + auto& constantNodeVariant = std::get(dmlGraphNodes[constantNodeAndIndex.nodeIndex].Desc); + if (std::holds_alternative(constantNodeVariant)) { - DML_INPUT_GRAPH_EDGE_DESC edge = {}; - edge.GraphInputIndex = dmlFusedNodeInputIndex; - edge.ToNodeIndex = mainGraphNodeIndex; - edge.ToNodeInputIndex = operatorGraphInputEdge.ToNodeInputIndex; - graphInputEdges.push_back(edge); - + auto& mainDmlGraphNode = dmlGraphNodes[dmlGraphNodeIndex]; + AbstractOperatorDesc& abstractOperatorDesc = std::get(mainDmlGraphNode.Desc); + std::vector toNodeInputTensorDescs = abstractOperatorDesc.GetInputTensors(); + DmlBufferTensorDesc* tensorDesc = toNodeInputTensorDescs[operatorDmlGraphInputEdge.ToNodeInputIndex]; tensorDesc->flags |= DML_TENSOR_FLAG_OWNED_BY_DML; + serializedGraphLargeConstantNameToSubgraphInputIndex[arg->Name()] = subgraphInputIndex; } + + DmlIntermediateSerializedGraphEdge edge = {}; + edge.FromNodeIndex = constantNodeAndIndex.nodeIndex; + edge.FromNodeOutputIndex = constantNodeAndIndex.targetIndex; + edge.ToNodeIndex = dmlGraphNodeIndex; + edge.ToNodeInputIndex = operatorDmlGraphInputEdge.ToNodeInputIndex; + edge.Name = arg->Name() + "-nodeIdx:" + std::to_string(edge.FromNodeIndex) + "-outputIdx:" + std::to_string(edge.FromNodeOutputIndex); + dmlGraphIntermediateEdges.push_back(edge); } else { - DML_INPUT_GRAPH_EDGE_DESC edge = {}; - edge.GraphInputIndex = dmlFusedNodeInputIndex; - edge.ToNodeIndex = mainGraphNodeIndex; - edge.ToNodeInputIndex = operatorGraphInputEdge.ToNodeInputIndex; - graphInputEdges.push_back(edge); + DmlInputSerializedGraphEdge edge = {}; + if (subGraphInputIndexToDmlGraphInputIndex.find(subgraphInputIndex) == subGraphInputIndexToDmlGraphInputIndex.end()) + { + subGraphInputIndexToDmlGraphInputIndex[subgraphInputIndex] = static_cast(subGraphInputIndexToDmlGraphInputIndex.size()); + } + + edge.GraphInputIndex = subGraphInputIndexToDmlGraphInputIndex[subgraphInputIndex]; + edge.ToNodeIndex = dmlGraphNodeIndex; + edge.ToNodeInputIndex = operatorDmlGraphInputEdge.ToNodeInputIndex; // ?? might need to point inputIndex + edge.Name = arg->Name(); + + serializedGraphInputIndexToSubgraphInputIndex[edge.GraphInputIndex] = subgraphInputIndex; + dmlGraphInputEdges.push_back(edge); } } else { - const auto& inputNodeAndIndex = nameToNodeAndIndexMap.at(arg->Name()); + const auto& inputNodeAndIndex = dmlGraphNodeOutputNameToNodeAndIndexMap.at(arg->Name()); - DML_INTERMEDIATE_GRAPH_EDGE_DESC edge = {}; + DmlIntermediateSerializedGraphEdge edge = {}; edge.FromNodeIndex = inputNodeAndIndex.nodeIndex; edge.FromNodeOutputIndex = inputNodeAndIndex.targetIndex; - edge.ToNodeIndex = mainGraphNodeIndex; - edge.ToNodeInputIndex = operatorGraphInputEdge.ToNodeInputIndex; - graphIntermediateEdges.push_back(edge); + edge.ToNodeIndex = dmlGraphNodeIndex; + edge.ToNodeInputIndex = operatorDmlGraphInputEdge.ToNodeInputIndex; + edge.Name = arg->Name(); + dmlGraphIntermediateEdges.push_back(edge); } } } // map operatorGraphIntermediateEdges as mainGraphIntermediateEdge - for (auto& operatorGraphIntermediateEdge : graphNodeCreateInfo.intermediateEdges) + for (auto& operatorGraphIntermediateEdge : operatorDmlGraphCreateInfo.intermediateEdges) { - DML_INTERMEDIATE_GRAPH_EDGE_DESC edge = {}; - edge.FromNodeIndex = operatorGraphNodeIndexToMainGraphNodeIndexMap[operatorGraphIntermediateEdge.FromNodeIndex]; + DmlIntermediateSerializedGraphEdge edge = {}; + uint32_t shiftedFromNodeIndex = SetAndGetDmlGraphNodeIndex( + operatorGraphIntermediateEdge.FromNodeIndex, + node.Name(), + *operatorDmlGraphCreateInfo.nodes[operatorGraphIntermediateEdge.FromNodeIndex], + operatorDmlGraphToDmlGraphNodeIndexMap, + dmlGraphNodes); + uint32_t shiftedToNodeIndex = SetAndGetDmlGraphNodeIndex( + operatorGraphIntermediateEdge.ToNodeIndex, + node.Name(), + *operatorDmlGraphCreateInfo.nodes[operatorGraphIntermediateEdge.ToNodeIndex], + operatorDmlGraphToDmlGraphNodeIndexMap, + dmlGraphNodes); + + edge.FromNodeIndex = shiftedFromNodeIndex; edge.FromNodeOutputIndex = operatorGraphIntermediateEdge.FromNodeOutputIndex; - edge.ToNodeIndex = operatorGraphNodeIndexToMainGraphNodeIndexMap[operatorGraphIntermediateEdge.ToNodeIndex]; + edge.ToNodeIndex = shiftedToNodeIndex; edge.ToNodeInputIndex = operatorGraphIntermediateEdge.ToNodeInputIndex; - graphIntermediateEdges.push_back(edge); + edge.Name = "nodeIdx:" + std::to_string(shiftedFromNodeIndex) + "-outputIdx:" + std::to_string(operatorGraphIntermediateEdge.FromNodeOutputIndex); + dmlGraphIntermediateEdges.push_back(edge); } - + // populate nameToNodeAndIndexMap (which will be used by above loop) for operatorGraphOutputEdges - for (auto& operatorGraphOutputEdge : graphNodeCreateInfo.outputEdges) + for (auto& operatorGraphOutputEdge : operatorDmlGraphCreateInfo.outputEdges) { const onnxruntime::NodeArg* arg = node.OutputDefs()[operatorGraphOutputEdge.GraphOutputIndex]; if (arg->Exists()) { - nameToNodeAndIndexMap[arg->Name()] = NodeAndIndex { - operatorGraphNodeIndexToMainGraphNodeIndexMap[operatorGraphOutputEdge.FromNodeIndex], - operatorGraphOutputEdge.FromNodeOutputIndex - }; - + uint32_t shiftedNodeIndex = SetAndGetDmlGraphNodeIndex( + operatorGraphOutputEdge.FromNodeIndex, + node.Name(), + *operatorDmlGraphCreateInfo.nodes[operatorGraphOutputEdge.FromNodeIndex], + operatorDmlGraphToDmlGraphNodeIndexMap, + dmlGraphNodes); + dmlGraphNodeOutputNameToNodeAndIndexMap[arg->Name()] = {shiftedNodeIndex, operatorGraphOutputEdge.FromNodeOutputIndex}; nodeOutputShapes[arg->Name()] = outputShapes; } } - - if (isNodeAsOpDesc) - { - for (size_t i = 0; i < graphNodeCreateInfo.nodesAsOperatorDesc.size(); ++i) - { - auto& opDesc = graphNodeCreateInfo.nodesAsOperatorDesc[i]; - - DML_OPERATOR_DESC dmlDesc = SchemaHelpers::ConvertOperatorDesc(*opDesc, &allocator); - - // TODO: Change as new header is ingested - if (dmlDesc.Type == (DML_OPERATOR_TYPE) DML_OPERATOR_QUANTIZED_LINEAR_AVERAGE_POOLING) - dmlDesc.Type = (DML_OPERATOR_TYPE) 169; - - // TODO: Change as new header is ingested - if (dmlDesc.Type == (DML_OPERATOR_TYPE) DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT) - dmlDesc.Type = (DML_OPERATOR_TYPE) 170; - - ComPtr op; - ORT_THROW_IF_FAILED(device->CreateOperator(&dmlDesc, IID_PPV_ARGS(&op))); - allocator.Reset(); - - NodeInfo nodeInfo = {}; - nodeInfo.nodeDef = std::move(op); - nodeInfo.name = node.Name(); - graphNodes[firstOpDescGraphNodeIndex + i] = std::move(nodeInfo); - } - } } EdgeShapes graphOutputShapes(subgraphOutputs.size()); @@ -476,24 +505,27 @@ namespace Dml::GraphDescBuilder const onnxruntime::NodeArg* graphOutput = subgraphOutputs[outputIndex]; ORT_THROW_HR_IF_NULL_MSG(E_POINTER, graphOutput, "FusedNode's nodeArgList does not contain one of the nodeArg"); - const auto& outputNodeAndIndex = nameToNodeAndIndexMap.at(graphOutput->Name()); + const auto& outputNodeAndIndex = dmlGraphNodeOutputNameToNodeAndIndexMap.at(graphOutput->Name()); - DML_OUTPUT_GRAPH_EDGE_DESC edge = {}; + DmlOutputSerializedGraphEdge edge = {}; edge.FromNodeIndex = outputNodeAndIndex.nodeIndex; edge.FromNodeOutputIndex = outputNodeAndIndex.targetIndex; edge.GraphOutputIndex = gsl::narrow_cast(outputIndex); - graphOutputEdges.push_back(edge); + edge.Name = graphOutput->Name(); + dmlGraphOutputEdges.push_back(edge); graphOutputShapes.GetMutableShape(outputIndex) = nodeOutputShapes[graphOutput->Name()].GetShape(outputNodeAndIndex.targetIndex); } - RemoveUnconnectedNodes(graphNodes, graphInputEdges, graphIntermediateEdges, graphOutputEdges); + RemoveUnconnectedNodes(dmlGraphNodes, dmlGraphInputEdges, dmlGraphIntermediateEdges, dmlGraphOutputEdges); GraphDesc graphDesc{}; - graphDesc.nodes = std::move(graphNodes); - graphDesc.inputEdges = std::move(graphInputEdges); - graphDesc.outputEdges = std::move(graphOutputEdges); - graphDesc.intermediateEdges = std::move(graphIntermediateEdges); - graphDesc.reuseCommandList = reuseCommandList; + graphDesc.InputCount = static_cast(dmlGraphInputEdges.size()); + graphDesc.OutputCount = static_cast(subgraphOutputs.size()); + graphDesc.Nodes = std::move(dmlGraphNodes); + graphDesc.InputEdges = std::move(dmlGraphInputEdges); + graphDesc.OutputEdges = std::move(dmlGraphOutputEdges); + graphDesc.IntermediateEdges = std::move(dmlGraphIntermediateEdges); + graphDesc.reuseCommandList = (subgraphNodes.size() >= minNodeCountToReuseCommandList || executionHandle->IsMcdmDevice()); graphDesc.outputShapes = std::move(graphOutputShapes); return graphDesc; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.h index c95e89b45541b..4055984b40405 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.h @@ -22,22 +22,15 @@ namespace Dml namespace GraphDescBuilder { + constexpr uint32_t minNodeCountToReuseCommandList = 5; + constexpr uint32_t c_maxConstNodeDataSize = 8; + // Gets a unique name for the node which survives recreation and graph manipulations between the point // that graph partitioning occurs and kernel creation happens const std::string& GetUniqueNodeName(const onnxruntime::Node& node); - struct NodeInfo - { - std::variant, std::vector> nodeDef; - std::string name; - }; - - struct GraphDesc + struct GraphDesc : DmlSerializedGraphDesc { - std::vector nodes; - std::vector inputEdges; - std::vector outputEdges; - std::vector intermediateEdges; bool reuseCommandList; Windows::AI::MachineLearning::Adapter::EdgeShapes outputShapes; }; @@ -47,11 +40,13 @@ namespace Dml const size_t isConstGpuGraphInputCount, const std::unordered_map>& isInitializerTransferable, const std::unordered_map& graphNodePropertyMap, - IDMLDevice* device, const ExecutionProviderImpl* executionHandle, const onnxruntime::Path& modelPath, gsl::span subgraphNodes, gsl::span subgraphInputs, - gsl::span subgraphOutputs); + gsl::span subgraphOutputs, + /*out*/ std::unordered_map& serializedGraphInputIndexToSubgraphInputIndex, + /*out*/ std::unordered_map& serializedGraphLargeConstantNameToSubgraphInputIndex, + /*out*/ std::vector>& smallConstantData); } } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp index d524780de71b8..f29fbc7a1a65b 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp @@ -1508,31 +1508,17 @@ namespace Windows::AI::MachineLearning::Adapter ORT_TRY { assert(operatorGraphDesc != nullptr); - // Either nodesAsOpDesc or nodesIDMLOperator can be present. - assert(operatorGraphDesc->nodeCount == 0 || (!operatorGraphDesc->nodesAsOpDesc ^ !operatorGraphDesc->nodesAsIDMLOperator)); + assert(operatorGraphDesc->nodeCount == 0 || operatorGraphDesc->nodes); - if (operatorGraphDesc->nodesAsOpDesc) + m_graphNodeCreateInfo->nodes = std::vector>(); + for (uint32_t nodeIndex = 0; nodeIndex < operatorGraphDesc->nodeCount; nodeIndex++) { - m_graphNodeCreateInfo->nodesAsOperatorDesc = std::vector>(); - for (uint32_t nodeIndex = 0; nodeIndex < operatorGraphDesc->nodeCount; nodeIndex++) - { - auto* node = operatorGraphDesc->nodesAsOpDesc[nodeIndex]; - assert(node != nullptr); - AbstractOperatorDesc abstractDesc = SchemaHelpers::ConvertOperatorDesc(*node); - m_graphNodeCreateInfo->nodesAsOperatorDesc.push_back(std::make_unique(std::move(abstractDesc))); - } - } - else - { - m_graphNodeCreateInfo->nodesAsIDMLOperator = std::vector>(); - for (uint32_t nodeIndex = 0; nodeIndex < operatorGraphDesc->nodeCount; nodeIndex++) - { - auto* node = operatorGraphDesc->nodesAsIDMLOperator[nodeIndex]; - assert(node != nullptr); - m_graphNodeCreateInfo->nodesAsIDMLOperator.push_back(node); - } + auto* node = operatorGraphDesc->nodes[nodeIndex]; + assert(node != nullptr); + AbstractOperatorDesc abstractDesc = SchemaHelpers::ConvertOperatorDesc(*node); + m_graphNodeCreateInfo->nodes.push_back(std::make_unique(std::move(abstractDesc))); } - + // There can be operators (or kernels) which don't require any input. assert(operatorGraphDesc->inputEdgeCount == 0 || operatorGraphDesc->inputEdges != nullptr); m_graphNodeCreateInfo->inputEdges.insert( diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp index c3bb1a52210f5..287f1e5b6dfe7 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperator.cpp @@ -53,7 +53,7 @@ namespace Dml MLOperatorGraphDesc operatorGraphDesc = {}; operatorGraphDesc.nodeCount = 1; const DML_OPERATOR_DESC* opDescs{&operatorDesc}; - operatorGraphDesc.nodesAsOpDesc = &opDescs; + operatorGraphDesc.nodes = &opDescs; std::vector inputEdges; for (uint32_t inputIndex = 0; inputIndex < m_kernelInputIndices.size(); inputIndex++) @@ -796,7 +796,7 @@ namespace Dml for (size_t i = 0; i < graphDesc.NodeCount; ++i) { // Create the operator. - ORT_THROW_IF_FAILED(m_dmlDevice->CreateOperator(operatorGraphDesc.nodesAsOpDesc[i], IID_PPV_ARGS(&dmlOperators[i]))); + ORT_THROW_IF_FAILED(m_dmlDevice->CreateOperator(operatorGraphDesc.nodes[i], IID_PPV_ARGS(&dmlOperators[i]))); dmlOperatorGraphNodes[i] = DML_OPERATOR_GRAPH_NODE_DESC{dmlOperators[i].Get()}; dmlGraphNodes[i] = DML_GRAPH_NODE_DESC{DML_GRAPH_NODE_TYPE_OPERATOR, &dmlOperatorGraphNodes[i]}; } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorAttention.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorAttention.cpp index c8ca6806e75f7..73c2d57e984af 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorAttention.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorAttention.cpp @@ -531,7 +531,7 @@ class DmlOperatorAttention : public DmlOperator operatorGraphDesc.outputEdgeCount = gsl::narrow_cast(outputEdges.size()); operatorGraphDesc.outputEdges = outputEdges.data(); operatorGraphDesc.nodeCount = gsl::narrow_cast(opDescs.size()); - operatorGraphDesc.nodesAsOpDesc = opDescs.data(); + operatorGraphDesc.nodes = opDescs.data(); SetDmlOperatorGraphDesc(std::move(operatorGraphDesc), kernelCreationContext); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorBiasAdd.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorBiasAdd.cpp index 1c851c94c4ddc..5aceebbdabfe3 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorBiasAdd.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorBiasAdd.cpp @@ -103,7 +103,7 @@ class DmlOperatorBiasAdd : public DmlOperator operatorGraphDesc.outputEdgeCount = gsl::narrow_cast(outputEdges.size()); operatorGraphDesc.outputEdges = outputEdges.data(); operatorGraphDesc.nodeCount = gsl::narrow_cast(opDescs.size()); - operatorGraphDesc.nodesAsOpDesc = opDescs.data(); + operatorGraphDesc.nodes = opDescs.data(); SetDmlOperatorGraphDesc(std::move(operatorGraphDesc), kernelCreationContext); } }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorBiasSplitGelu.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorBiasSplitGelu.cpp index 501ce14f1fc08..1e10214ffd463 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorBiasSplitGelu.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorBiasSplitGelu.cpp @@ -137,7 +137,7 @@ class DmlOperatorBiasSplitGelu : public DmlOperator operatorGraphDesc.outputEdgeCount = gsl::narrow_cast(outputEdges.size()); operatorGraphDesc.outputEdges = outputEdges.data(); operatorGraphDesc.nodeCount = gsl::narrow_cast(opDescs.size()); - operatorGraphDesc.nodesAsOpDesc = opDescs.data(); + operatorGraphDesc.nodes = opDescs.data(); SetDmlOperatorGraphDesc(std::move(operatorGraphDesc), kernelCreationContext); } }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorEmbedLayerNormalization.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorEmbedLayerNormalization.cpp index 6a8333cd72561..3c9458658c4d0 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorEmbedLayerNormalization.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorEmbedLayerNormalization.cpp @@ -484,7 +484,7 @@ class DmlOperatorEmbedLayerNormalization : public DmlOperator operatorGraphDesc.outputEdgeCount = gsl::narrow_cast(outputEdges.size()); operatorGraphDesc.outputEdges = outputEdges.data(); operatorGraphDesc.nodeCount = gsl::narrow_cast(opDescs.size()); - operatorGraphDesc.nodesAsOpDesc = opDescs.data(); + operatorGraphDesc.nodes = opDescs.data(); SetDmlOperatorGraphDesc(std::move(operatorGraphDesc), kernelCreationContext); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorGroupNorm.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorGroupNorm.cpp index fed0e4645ffd8..8b275fc550f3e 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorGroupNorm.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorGroupNorm.cpp @@ -287,7 +287,7 @@ class DmlOperatorGroupNorm : public DmlOperator operatorGraphDesc.outputEdgeCount = gsl::narrow_cast(outputEdges.size()); operatorGraphDesc.outputEdges = outputEdges.data(); operatorGraphDesc.nodeCount = gsl::narrow_cast(opDescs.size()); - operatorGraphDesc.nodesAsOpDesc = opDescs.data(); + operatorGraphDesc.nodes = opDescs.data(); SetDmlOperatorGraphDesc(std::move(operatorGraphDesc), kernelCreationContext); } }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorLayerNormalization.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorLayerNormalization.cpp index 5c64059f7caa9..80e6fefc2fb80 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorLayerNormalization.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorLayerNormalization.cpp @@ -247,7 +247,7 @@ class DmlOperatorLayerNormalization : public DmlOperator operatorGraphDesc.outputEdgeCount = gsl::narrow_cast(outputEdges.size()); operatorGraphDesc.outputEdges = outputEdges.data(); operatorGraphDesc.nodeCount = gsl::narrow_cast(opDescs.size()); - operatorGraphDesc.nodesAsOpDesc = opDescs.data(); + operatorGraphDesc.nodes = opDescs.data(); SetDmlOperatorGraphDesc(std::move(operatorGraphDesc), kernelCreationContext); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearConcat.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearConcat.cpp index c97b03dc36b62..8727610ff3112 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearConcat.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearConcat.cpp @@ -166,7 +166,7 @@ class DmlOperatorQLinearConcat : public DmlOperator, public QLinearConcatHelper MLOperatorGraphDesc operatorGraphDesc = {}; operatorGraphDesc.nodeCount = static_cast(opDescs.size()); - operatorGraphDesc.nodesAsOpDesc = opDescs.data(); + operatorGraphDesc.nodes = opDescs.data(); uint32_t joinNodeIndex = operatorGraphDesc.nodeCount - 2; uint32_t quantizeNodeIndex = operatorGraphDesc.nodeCount - 1; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearSigmoid.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearSigmoid.cpp index 35f926d62c92a..f658e7c7da323 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearSigmoid.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearSigmoid.cpp @@ -113,7 +113,7 @@ class DmlOperatorQLinearSigmoid : public DmlOperator MLOperatorGraphDesc operatorGraphDesc = {}; operatorGraphDesc.nodeCount = 3; std::vector opDescs{&opDesc1, &opDesc2, &opDesc3}; - operatorGraphDesc.nodesAsOpDesc = opDescs.data(); + operatorGraphDesc.nodes = opDescs.data(); // set input edges std::pair nodeToNodeInputIndex[5] {{0, 0}, {0, 1}, {0, 2}, {2, 1}, {2, 2}}; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQuickGelu.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQuickGelu.cpp index 3683ab7b0b0b3..e62b7d707ba78 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQuickGelu.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQuickGelu.cpp @@ -123,7 +123,7 @@ class DmlOperatorQuickGelu : public DmlOperator operatorGraphDesc.outputEdgeCount = gsl::narrow_cast(outputEdges.size()); operatorGraphDesc.outputEdges = outputEdges.data(); operatorGraphDesc.nodeCount = gsl::narrow_cast(opDescs.size()); - operatorGraphDesc.nodesAsOpDesc = opDescs.data(); + operatorGraphDesc.nodes = opDescs.data(); SetDmlOperatorGraphDesc(std::move(operatorGraphDesc), kernelCreationContext); } }; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorRotaryEmbedding.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorRotaryEmbedding.cpp index 44004b5d77f70..0f15ebf342b3a 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorRotaryEmbedding.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorRotaryEmbedding.cpp @@ -441,7 +441,7 @@ class DmlOperatorRotaryEmbedding : public DmlOperator operatorGraphDesc.outputEdgeCount = gsl::narrow_cast(outputEdges.size()); operatorGraphDesc.outputEdges = outputEdges.data(); operatorGraphDesc.nodeCount = gsl::narrow_cast(opDescs.size()); - operatorGraphDesc.nodesAsOpDesc = opDescs.data(); + operatorGraphDesc.nodes = opDescs.data(); SetDmlOperatorGraphDesc(std::move(operatorGraphDesc), kernelInfo); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorSkipLayerNormalization.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorSkipLayerNormalization.cpp index 4dafd78f21ea8..094c45a0e38e5 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorSkipLayerNormalization.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorSkipLayerNormalization.cpp @@ -198,7 +198,7 @@ class DmlOperatorSkipLayerNormalization : public DmlOperator operatorGraphDesc.outputEdgeCount = gsl::narrow_cast(outputEdges.size()); operatorGraphDesc.outputEdges = outputEdges.data(); operatorGraphDesc.nodeCount = gsl::narrow_cast(opDescs.size()); - operatorGraphDesc.nodesAsOpDesc = opDescs.data(); + operatorGraphDesc.nodes = opDescs.data(); SetDmlOperatorGraphDesc(std::move(operatorGraphDesc), kernelCreationContext); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Utility.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Utility.h new file mode 100644 index 0000000000000..02166f992449e --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Utility.h @@ -0,0 +1,141 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include +#include +#include +#include + + +namespace Dml +{ + static inline std::wstring ConvertToWString(std::string_view str) + { + std::wstring_convert,wchar_t> g_converterToUtf16; + return g_converterToUtf16.from_bytes(str.data()); + } + + static inline std::wstring GetModelName(const onnxruntime::Path& modelPath) + { + if (modelPath.GetComponents().empty()) + { + return L""; + } + + const onnxruntime::PathString& pathString = modelPath.GetComponents().back(); + size_t dotPosition = pathString.find_last_of('.'); + if (dotPosition == std::string::npos) + { + return L""; + } + + return pathString.substr(0, dotPosition); + } + + static inline std::wstring GetSanitizedFileName(std::wstring_view name) + { + std::wstring newName(name); + for (wchar_t& c : newName) + { + switch (c) + { + case '\\': + case '/': + case '\"': + case '|': + case '<': + case '>': + case ':': + case '?': + case '*': + c = '_'; + break; + } + } + return newName; + } + + static inline std::string GetSanitizedFileName(std::string_view name) + { + std::string newName(name); + for (char& c : newName) + { + switch (c) + { + case '\\': + case '/': + case '\"': + case '|': + case '<': + case '>': + case ':': + case '?': + case '*': + c = '_'; + break; + } + } + return newName; + } + + static inline void WriteToFile(std::wstring_view directoryName, std::wstring_view fileName, std::uint8_t* data, size_t dataSize) + { + std::wstring sanitizedFileName = GetSanitizedFileName(fileName); + std::filesystem::create_directory(directoryName); + std::wstring fullSanitizedFileName = std::wstring(directoryName) + + (directoryName.empty() ? L"" : L"/") + + sanitizedFileName; + std::ofstream file(fullSanitizedFileName, std::ios::binary); + if (!file.is_open()) + { + std::wstring_convert,wchar_t> g_converterToUtf16; + std::stringstream errorMessage; + errorMessage << "File named: " << g_converterToUtf16.to_bytes(fileName.data()) << " could not be opened\n"; + throw std::ios::failure(errorMessage.str()); + } + file.write(reinterpret_cast(data), dataSize); + } + +} + +namespace StringUtil +{ + struct NameAndIndex + { + const char* name; // Null terminated. + uint32_t index; + }; + + struct WideNameAndIndex + { + const wchar_t* name; // Null terminated. + uint32_t index; + }; + + inline std::optional MapToIndex(std::string_view mode, gsl::span nameAndIndexList) + { + for (auto& nameAndIndex : nameAndIndexList) + { + if (strncmp(nameAndIndex.name, mode.data(), mode.size()) == 0) + { + return nameAndIndex.index; + } + } + + return {}; + } + + inline std::optional MapToIndex(std::wstring_view mode, gsl::span nameAndIndexList) + { + for (auto& nameAndIndex : nameAndIndexList) + { + if (wcsncmp(nameAndIndex.name, mode.data(), mode.size()) == 0) + { + return nameAndIndex.index; + } + } + + return {}; + } +} \ No newline at end of file diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/precomp.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/precomp.h index 83737d2ba4848..332bf86685e8a 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/precomp.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/precomp.h @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include #include @@ -37,6 +39,7 @@ #include #include "External/D3DX12/d3dx12.h" #endif +#include "flatbuffers/flatbuffers.h" #include "GraphicsUnknownHelper.h" @@ -53,6 +56,9 @@ #include "External/DirectMLHelpers/SchemaHelpers.h" #include "External/DirectMLHelpers/GeneratedSchemaHelpers.h" #include "External/DirectMLHelpers/DirectMLX.h" +#include "External/DirectMLHelpers/DmlSerializedGraphDesc.h" +#include "External/DirectMLHelpers/DmlGraphSerialization.h" +#include "External/DirectMLHelpers/DmlGraphDeserialization.h" using Microsoft::WRL::ComPtr; @@ -67,3 +73,4 @@ using Microsoft::WRL::ComPtr; #include "TensorDesc.h" #include "DescriptorPool.h" #include "IExecutionProvider.h" +#include "Utility.h" \ No newline at end of file diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h index 3bec8d3864cba..ac3a3eb1268b8 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/MLOperatorAuthorPrivate.h @@ -10,18 +10,11 @@ struct DML_INPUT_GRAPH_EDGE_DESC; struct DML_OUTPUT_GRAPH_EDGE_DESC; struct DML_INTERMEDIATE_GRAPH_EDGE_DESC; -// Either nodesAsOpDesc or nodesAsIDMLOperator is present. -// 1) Operator kernels which implement operators using only a single DML operator will pass a DML_OPERATOR_DESC. -// These kernels pass DML_OPERATOR_DESC, because while building Dml graph (inside FusedGraphKernel.cpp) we can change the -// the flag of constant inputs to DML_TENSOR_FLAG_OWNED_BY_DML. -// 2) Operator kernels which implement operators using DMLX graph, they will pass IDMLOperator and won't be able -// to use DML_TENSOR_FLAG_OWNED_BY_DML. struct MLOperatorGraphDesc { uint32_t nodeCount; - _Field_size_opt_(nodeCount) const DML_OPERATOR_DESC** nodesAsOpDesc; - _Field_size_opt_(nodeCount) IDMLOperator** nodesAsIDMLOperator; - + _Field_size_opt_(nodeCount) const DML_OPERATOR_DESC** nodes; + uint32_t inputEdgeCount; _Field_size_(inputEdgeCount) const DML_INPUT_GRAPH_EDGE_DESC* inputEdges; diff --git a/onnxruntime/core/providers/dml/dml_session_options_config_keys.h b/onnxruntime/core/providers/dml/dml_session_options_config_keys.h index d11fa7516e713..5b5f371f51616 100644 --- a/onnxruntime/core/providers/dml/dml_session_options_config_keys.h +++ b/onnxruntime/core/providers/dml/dml_session_options_config_keys.h @@ -21,3 +21,4 @@ // "1": disabled (disallowed). Graph fusion will never be used. // The default value is "0" static const char* const kOrtSessionOptionsConfigDisableDmlGraphFusion = "ep.dml.disable_graph_fusion"; +static const char* const kOrtSessionOptionsConfigEnableGraphSerialization = "ep.dml.enable_graph_serialization"; diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index efd7db4ea7629..5fd66c459d382 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -1725,10 +1725,17 @@ common::Status InferenceSession::Initialize() { // graph optimization level and is generally always applied. bool dml_graph_fusion_enabled = session_options_.optimized_model_filepath.empty() && session_options_.config_options.GetConfigOrDefault(kOrtSessionOptionsConfigDisableDmlGraphFusion, "0") == "0"; + std::string dml_graph_serialization_enabled_config_val = session_options_.config_options.GetConfigOrDefault(kOrtSessionOptionsConfigEnableGraphSerialization, "0"); + std::transform(dml_graph_serialization_enabled_config_val.begin(), + dml_graph_serialization_enabled_config_val.end(), + dml_graph_serialization_enabled_config_val.begin(), + [](char ch) { return std::tolower(ch); }); + bool dml_graph_serialization_enabled = dml_graph_serialization_enabled_config_val == "true"; if (dml_graph_fusion_enabled) { std::unique_ptr dmlGraphFusionTransformer = std::make_unique("DmlGraphFusionTransformer", - dmlExecutionProvider); + dmlExecutionProvider, + dml_graph_serialization_enabled); if (dmlGraphFusionTransformer == nullptr) { return Status(common::ONNXRUNTIME, common::FAIL, "DmlGraphFusionTransformer is nullptr"); } diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc index 3874901f86387..7d4111e3b9c39 100644 --- a/onnxruntime/test/perftest/command_args_parser.cc +++ b/onnxruntime/test/perftest/command_args_parser.cc @@ -68,6 +68,7 @@ namespace perftest { "\t [DML only] [device_filter]: DML device filter, options: 'any', 'gpu', 'npu', \n" "\t [DML only] [disable_metacommands]: Options: 'true', 'false', \n" "\t [DML only] [enable_dynamic_graph_fusion]: Options: 'true', 'false', \n" + "\t [DML only] [enable_graph_serialization]: Options: 'true', 'false', \n" "\t [OpenVINO only] [device_type]: Overrides the accelerator hardware type and precision with these values at runtime.\n" "\t [OpenVINO only] [device_id]: Selects a particular hardware device for inference.\n" "\t [OpenVINO only] [enable_npu_fast_compile]: Optionally enabled to speeds up the model's compilation on NPU device targets.\n" diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index 87506c7240578..1934314b8ce43 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -18,6 +18,7 @@ #ifdef USE_DML #include "core/providers/dml/dml_provider_factory.h" +#include "core/providers/dml/dml_session_options_config_keys.h" #endif #ifdef _WIN32 @@ -542,6 +543,15 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); "[ERROR] [DML] You have selcted wrong value for the key 'enable_dynamic_graph_fusion'. " "Select from 'true' or 'false' \n"); } + } else if (key == "enable_graph_serialization") { + std::set ov_supported_values = {"true", "True", "false", "False"}; + if (ov_supported_values.find(value) != ov_supported_values.end()) { + session_options.AddConfigEntry(kOrtSessionOptionsConfigEnableGraphSerialization, value.data()); + } else { + ORT_THROW( + "[ERROR] [DML] You have selcted wrong value for the key 'enable_graph_serialization'. " + "Select from 'true' or 'false' \n"); + } } } session_options.AppendExecutionProvider("DML", dml_options); From 8bd943be39301639e3f50f524f8fd71c7f2b2a34 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Tue, 27 Feb 2024 09:31:32 +1000 Subject: [PATCH 51/68] Retry flaky XCode iOS UI tests if we get a known error (#19639) ### Description Xcode UI tests seem to be flaky: https://github.com/orgs/community/discussions/68807 Add a couple of retries if we get a "Timed out while loading Accessibility." error which is transient. ### Motivation and Context --- .../github/apple/test_apple_packages.py | 61 ++++++++++++++----- 1 file changed, 45 insertions(+), 16 deletions(-) diff --git a/tools/ci_build/github/apple/test_apple_packages.py b/tools/ci_build/github/apple/test_apple_packages.py index cd360a63a3a0f..3c0df994ffd3d 100644 --- a/tools/ci_build/github/apple/test_apple_packages.py +++ b/tools/ci_build/github/apple/test_apple_packages.py @@ -130,22 +130,51 @@ def _test_apple_packages(args): simulator_device_info = json.loads(simulator_device_info) - subprocess.run( - [ - "xcrun", - "xcodebuild", - "test", - "-workspace", - "./apple_package_test.xcworkspace", - "-scheme", - "ios_package_test", - "-destination", - f"platform=iOS Simulator,id={simulator_device_info['device_udid']}", - ], - shell=False, - check=True, - cwd=target_proj_path, - ) + # Xcode UI tests seem to be flaky: https://github.com/orgs/community/discussions/68807 + # Add a couple of retries if we get this error: + # ios_package_testUITests-Runner Failed to initialize for UI testing: + # Error Domain=com.apple.dt.XCTest.XCTFuture Code=1000 "Timed out while loading Accessibility." + attempts = 0 + cmd = [ + "xcrun", + "xcodebuild", + "test", + "-workspace", + "./apple_package_test.xcworkspace", + "-scheme", + "ios_package_test", + "-destination", + f"platform=iOS Simulator,id={simulator_device_info['device_udid']}", + ] + + while True: + attempts += 1 + completed_process = subprocess.run( + cmd, + shell=False, + capture_output=True, + check=False, + text=True, + cwd=target_proj_path, + ) + + # print so it's in CI output + print(completed_process.stdout) + + if completed_process.returncode != 0: + print(f"Running ios_package_test failed. Return code was {completed_process.returncode}") + print("xcrun xcodebuild test stderr:") + print(completed_process.stderr) + print("---") + + if "Timed out while loading Accessibility" in completed_process.stderr and attempts < 3: + continue + + raise subprocess.CalledProcessError( + completed_process.returncode, " ".join(cmd), completed_process.stdout, completed_process.stderr + ) + + break if PackageVariant[args.variant] != PackageVariant.Mobile and not args.skip_macos_test: subprocess.run( From 18c8fab1ae03e68a906fe42698ac322e9e49e218 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Mon, 26 Feb 2024 15:58:09 -0800 Subject: [PATCH 52/68] Fix a bug in build.py (#19652) ### Description Fix a bug in build.py that accidentally disabled C# tests for most builds when "--build_nuget" is specified. ### Motivation and Context The bug was introduced in PR #8892 . --- tools/ci_build/build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 5b715bb29e5a1..74c473d34f548 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -2592,7 +2592,7 @@ def main(): raise BuildError("Using --get-api-doc requires a single build config") # Disabling unit tests for GPU on nuget creation - if args.use_openvino != "CPU_FP32" and args.build_nuget: + if args.use_openvino and args.use_openvino != "CPU_FP32" and args.build_nuget: args.test = False # GDK builds don't support testing From 8a71b657654d63437267014b324bf124a80de347 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Tue, 27 Feb 2024 11:35:27 +1000 Subject: [PATCH 53/68] Remove skipping of Reshape from NNAPI EP (#19618) ### Description A number of Qualcomm Snapdragon chipsets do not produce correct output if we skip the Reshape, which ironically was a performance optimization for Snapdragon chips. Perf testing showed that Squeeze also seems to execute on CPU so there's no benefit to using that as an alternative where possible e.g. Global*Pool -> Reshape to 2D -> Gemm could be potentially be replaced with Global*Pool -> Squeeze dims 2 and 3 -> Gemm if that offered better performance. ### Motivation and Context #19518 --- .../builders/op_builder_helpers.cc | 30 ++++++++++++++----- .../builders/op_builder_helpers.h | 3 -- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder_helpers.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder_helpers.cc index a066c64dac67d..466865f23f49a 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder_helpers.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder_helpers.cc @@ -965,6 +965,18 @@ Status AddMinMaxOperator(ModelBuilder& model_builder, const NodeUnit& node_unit, return Status::OK(); } +// NOTE: Skipping Reshape results in invalid output on some SnapDragon chipsets. Whilst the NNAPI spec says the input +// to FullyConnnected can be > 2D, those chipsets don't handle this correctly. +// +// CanSkipReshape could potentially be re-enabled in the future if we no longer want to support those old chipsets. +// However, the Reshape of newer chipsets may not run on CPU so there may not be a performance issue to try and avoid, +// so CanSkipReshape could be redundant anyway. +// +// Known bad chipsets: Qualcomm Snapdragon 850, 855, 865, 870. +// +// See https://github.com/microsoft/onnxruntime/issues/19518 + +/* // We can skip the Reshape if all the output edges satisfies both the following conditions // 1. The output of the reshape/flatten is not an output of the graph // 2. The output of the reshape/flatten is the input 0 of one or more GEMM/Matmul operators, @@ -977,7 +989,7 @@ Status AddMinMaxOperator(ModelBuilder& model_builder, const NodeUnit& node_unit, // between NNAPI CPU impl and Hardware Accelerator impl and will speed up the execution // If we are going to skip the reshape, we will still add correct shape and operand type for the output in // onnxruntime::nnapi::Model. -bool CanSkipReshape(const ModelBuilder& model_builder, const NodeUnit& node_unit, +static bool CanSkipReshape(const ModelBuilder& model_builder, const NodeUnit& node_unit, size_t input_rank, size_t output_rank) { // Since we know this is a Reshape NodeUnit, so we can safely assume there is only 1 output // and the node_unit has only one output node. @@ -1039,33 +1051,37 @@ bool CanSkipReshape(const ModelBuilder& model_builder, const NodeUnit& node_unit << node_unit.Name() << "] with output, " << output_name; return true; } +*/ Status AddReshapeOperator(ModelBuilder& model_builder, const NodeUnit& node_unit, const std::string& input, const std::vector& shape) { auto& shaper(model_builder.GetShaper()); - const auto& operand_indices(model_builder.GetOperandIndices()); const auto& operand_types(model_builder.GetOperandTypes()); const auto& output = node_unit.Outputs()[0].node_arg.Name(); const auto input_shape = shaper[input]; const auto output_shape = shaper[output]; - const auto input_rank = input_shape.size(); - const auto output_rank = output_shape.size(); // For reshape, the output type should be the same as the input type except the shape is different auto output_operand_type = operand_types.at(input); output_operand_type.SetDimensions(output_shape); + /* See CanSkipReshape definition above for explanation of why this is disabled. // Since Reshape is not running using hardware in NNAPI for some CPU (e.g. Qualcomm SD for now) // We will try to see if we the skip the Reshape to prevent context switching between // NNAPI CPU impl and NNAPI hardware accelerator impl if (CanSkipReshape(model_builder, node_unit, input_rank, output_rank)) { - // Since reshape can be skipped, only register the dimension and type, with same index and new name + const auto& operand_indices(model_builder.GetOperandIndices()); + const auto input_rank = input_shape.size(); + const auto output_rank = output_shape.size(); + // Since reshape can be skipped, only register the dimension and type, with same index and new name. + // This essentially redirects the downstream operator builders to the input of the skipped Reshape node, + // but with the output shape of the Reshape node. model_builder.RegisterOperand(output, operand_indices.at(input), output_operand_type); - } else { - // We still need to perform a reshape here + } else */ + { std::string shape_name = model_builder.GetUniqueName(node_unit.Name() + input + "newshape"); ORT_RETURN_IF_ERROR(op_builder_helpers::AddNnapiReshape(model_builder, input, shape_name, shape, output)); } diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder_helpers.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder_helpers.h index 7ccf4c1ef7555..61a16ceff752f 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder_helpers.h +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder_helpers.h @@ -181,9 +181,6 @@ Status AddMinMaxOperator(ModelBuilder& model_builder, const NodeUnit& node_unit, Status AddReshapeOperator(ModelBuilder& model_builder, const NodeUnit& node_unit, const std::string& input, const std::vector& shape); -bool CanSkipReshape(const ModelBuilder& model_builder, const NodeUnit& node_unit, - size_t input_rank, size_t output_rank); - Status GetAxesForSqueezeAndUnSqueeze(ModelBuilder& model_builder, const NodeUnit& node_unit, std::vector& axes); From 6f566562cedff9996e55dbf623b1f0141733d52c Mon Sep 17 00:00:00 2001 From: kailums <109063327+kailums@users.noreply.github.com> Date: Tue, 27 Feb 2024 11:31:03 +0800 Subject: [PATCH 54/68] support user_compute_stream for rocm ep (#19619) ### Description According to the pr #19229 supporting cuda EP use external compute stream, we add support for rocm EP. And when we testing this feature with torch, we found torch use stream 0 for the default stream, and `torch.cuda.current_stream()` returns `0` for current stream, but ort treat `0` or `nullptr` as invalid, and reset has_user_compute_stream to false. Will remove has_user_compute_stream option in the future. ### Motivation and Context The motivation for this pr is that we want to use torch.cuda.graph to capture ort running kernel, which requires torch and ort are running in the same stream, so we use this API to set ort's working stream. --- .../rocm/rocm_execution_provider_info.cc | 20 +++++++++++++++++++ .../test/python/onnxruntime_test_python.py | 10 ++++++++++ 2 files changed, 30 insertions(+) diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider_info.cc b/onnxruntime/core/providers/rocm/rocm_execution_provider_info.cc index b557f92287f2b..3cb826437a54f 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider_info.cc +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider_info.cc @@ -13,6 +13,8 @@ namespace onnxruntime { namespace rocm { namespace provider_option_names { constexpr const char* kDeviceId = "device_id"; +constexpr const char* kHasUserComputeStream = "has_user_compute_stream"; +constexpr const char* kUserComputeStream = "user_compute_stream"; constexpr const char* kMemLimit = "gpu_mem_limit"; constexpr const char* kArenaExtendStrategy = "arena_extend_strategy"; constexpr const char* kMiopenConvExhaustiveSearch = "miopen_conv_exhaustive_search"; @@ -38,6 +40,7 @@ ROCMExecutionProviderInfo ROCMExecutionProviderInfo::FromProviderOptions(const P void* alloc = nullptr; void* free = nullptr; void* empty_cache = nullptr; + void* user_compute_stream = nullptr; ORT_THROW_IF_ERROR( ProviderOptionsParser{} .AddValueParser( @@ -52,6 +55,15 @@ ROCMExecutionProviderInfo ROCMExecutionProviderInfo::FromProviderOptions(const P ", must be between 0 (inclusive) and ", num_devices, " (exclusive)."); return Status::OK(); }) + .AddAssignmentToReference(rocm::provider_option_names::kHasUserComputeStream, info.has_user_compute_stream) + .AddValueParser( + rocm::provider_option_names::kUserComputeStream, + [&user_compute_stream](const std::string& value_str) -> Status { + size_t address; + ORT_RETURN_IF_ERROR(ParseStringWithClassicLocale(value_str, address)); + user_compute_stream = reinterpret_cast(address); + return Status::OK(); + }) .AddValueParser( rocm::provider_option_names::kGpuExternalAlloc, [&alloc](const std::string& value_str) -> Status { @@ -108,12 +120,18 @@ ROCMExecutionProviderInfo ROCMExecutionProviderInfo::FromProviderOptions(const P ROCMExecutionProviderExternalAllocatorInfo alloc_info{alloc, free, empty_cache}; info.external_allocator_info = alloc_info; + + info.user_compute_stream = user_compute_stream; + info.has_user_compute_stream = (user_compute_stream != nullptr); + return info; } ProviderOptions ROCMExecutionProviderInfo::ToProviderOptions(const ROCMExecutionProviderInfo& info) { const ProviderOptions options{ {rocm::provider_option_names::kDeviceId, MakeStringWithClassicLocale(info.device_id)}, + {rocm::provider_option_names::kHasUserComputeStream, MakeStringWithClassicLocale(info.has_user_compute_stream)}, + {rocm::provider_option_names::kUserComputeStream, MakeStringWithClassicLocale(reinterpret_cast(info.user_compute_stream))}, {rocm::provider_option_names::kMemLimit, MakeStringWithClassicLocale(info.gpu_mem_limit)}, {rocm::provider_option_names::kGpuExternalAlloc, MakeStringWithClassicLocale(reinterpret_cast(info.external_allocator_info.alloc))}, {rocm::provider_option_names::kGpuExternalFree, MakeStringWithClassicLocale(reinterpret_cast(info.external_allocator_info.free))}, @@ -135,6 +153,8 @@ ProviderOptions ROCMExecutionProviderInfo::ToProviderOptions(const ROCMExecution ProviderOptions ROCMExecutionProviderInfo::ToProviderOptions(const OrtROCMProviderOptions& info) { const ProviderOptions options{ {rocm::provider_option_names::kDeviceId, MakeStringWithClassicLocale(info.device_id)}, + {rocm::provider_option_names::kHasUserComputeStream, MakeStringWithClassicLocale(info.has_user_compute_stream)}, + {rocm::provider_option_names::kUserComputeStream, MakeStringWithClassicLocale(reinterpret_cast(info.user_compute_stream))}, {rocm::provider_option_names::kMemLimit, MakeStringWithClassicLocale(info.gpu_mem_limit)}, {rocm::provider_option_names::kArenaExtendStrategy, EnumToName(arena_extend_strategy_mapping, static_cast(info.arena_extend_strategy))}, {rocm::provider_option_names::kMiopenConvExhaustiveSearch, MakeStringWithClassicLocale(info.miopen_conv_exhaustive_search)}, diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py index 91b6c71e735a8..ab56f3fa0f37f 100644 --- a/onnxruntime/test/python/onnxruntime_test_python.py +++ b/onnxruntime/test/python/onnxruntime_test_python.py @@ -559,6 +559,16 @@ def test_get_and_set_option_with_values(option_name, option_values): test_get_and_set_option_with_values("enable_hip_graph", ["1", "0"]) + # test for user_compute_stream + option = options["ROCMExecutionProvider"] + option["user_compute_stream"] = "1" + sess.set_providers(["ROCMExecutionProvider"], [option]) + new_options = sess.get_provider_options() + new_option = new_options["ROCMExecutionProvider"] + self.assertEqual(new_option["user_compute_stream"], "1") + # set user_compute_stream will set has_user_compute_stream to 1 too + self.assertEqual(new_option["has_user_compute_stream"], "1") + run_rocm_options_test() def test_invalid_set_providers(self): From 5bb58a10e739f8720e9867d19c4313081b12d948 Mon Sep 17 00:00:00 2001 From: Rachel Guo <35738743+YUNQIUGUO@users.noreply.github.com> Date: Mon, 26 Feb 2024 20:00:14 -0800 Subject: [PATCH 55/68] Enable the most verbose logging level in detox E2E React Native CI (#19659) ### Description The RN CI has intermittent failure error with "app seems to idle". enable the most verbose logging level (and can add steps to dump device.log from the detox folder/artifacts if necessary) to at least get more information. ### Motivation and Context --------- Co-authored-by: rachguo --- .../github/azure-pipelines/templates/react-native-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml b/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml index 47cd72f412c67..1b7962059e301 100644 --- a/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml @@ -279,7 +279,7 @@ stages: - script: | JEST_JUNIT_OUTPUT_FILE=$(Build.SourcesDirectory)/js/react_native/e2e/android-test-results.xml \ - detox test --record-logs all --configuration android.emu.release + detox test --record-logs all --configuration android.emu.release --loglevel trace workingDirectory: '$(Build.SourcesDirectory)/js/react_native/e2e' displayName: Run React Native Detox Android e2e Tests @@ -329,7 +329,7 @@ stages: - script: | JEST_JUNIT_OUTPUT_FILE=$(Build.SourcesDirectory)/js/react_native/e2e/ios-test-results.xml \ - detox test --record-logs all --configuration ios.sim.release + detox test --record-logs all --configuration ios.sim.release --loglevel trace workingDirectory: '$(Build.SourcesDirectory)/js/react_native/e2e' displayName: Run React Native Detox iOS e2e Tests From 9e19684944adfda4a414fc91a67259894fce2898 Mon Sep 17 00:00:00 2001 From: duanshengliu <44742794+duanshengliu@users.noreply.github.com> Date: Tue, 27 Feb 2024 12:56:32 +0800 Subject: [PATCH 56/68] Fix the TypeError issue in quantize.py (#19459) ### Description Fix related bug as described in https://github.com/microsoft/onnxruntime/issues/19430 --- onnxruntime/python/tools/quantization/quantize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py index 1bd2ef42151d0..05d3ac248c92c 100644 --- a/onnxruntime/python/tools/quantization/quantize.py +++ b/onnxruntime/python/tools/quantization/quantize.py @@ -479,7 +479,7 @@ def inc_dataloader(): del dataloader model = sq.transform(extra_options.get("SmoothQuantAlpha", 0.5), extra_options.get("SmoothQuantFolding", True)) sq_path = tempfile.TemporaryDirectory(prefix="ort.quant.") - model_input = Path(sq_path).joinpath("sq_model.onnx").as_posix() + model_input = Path(sq_path.name).joinpath("sq_model.onnx").as_posix() model.save(model_input) nodes_to_exclude.extend([i.name for i in model.model.graph.node if i.name not in orig_nodes]) model = load_model_with_shape_infer(Path(model_input)) # use smooth quant model for calibration From 1e69b612382205b0588f08d2b808b12e32a50a51 Mon Sep 17 00:00:00 2001 From: cloudhan Date: Tue, 27 Feb 2024 16:06:06 +0800 Subject: [PATCH 57/68] Make version string detection more robust (#19615) `/opt/rocm/.info/version-dev` is only available if the `rocm-dev` metapackage is installed. This will bring a lot of unused packages which are not needed by the users, they may opt for fine grained control. Fallback to `rocm_version.h` in case `rocm-dev` is not installed. --- cmake/CMakeLists.txt | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index ed9043f2adc4a..1376c90fbcefe 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -324,15 +324,27 @@ if (onnxruntime_USE_ROCM) endif() # replicate strategy used by pytorch to get ROCM_VERSION - # https://github.com/pytorch/pytorch/blob/8eb21488fdcdb8b0e6fa2e46179b5fa6c42e75af/cmake/public/LoadHIP.cmake#L153-L173 - file(READ "${onnxruntime_ROCM_HOME}/.info/version-dev" ROCM_VERSION_DEV_RAW) - string(REGEX MATCH "^([0-9]+)\.([0-9]+)\.([0-9]+)-.*$" ROCM_VERSION_DEV_MATCH ${ROCM_VERSION_DEV_RAW}) - if (ROCM_VERSION_DEV_MATCH) + # https://github.com/pytorch/pytorch/blob/5c5b71b6eebae76d744261715231093e62f0d090/cmake/public/LoadHIP.cmake + # with modification + if (EXISTS "${onnxruntime_ROCM_HOME}/.info/version-dev") + file(READ "${onnxruntime_ROCM_HOME}/.info/version-dev" ROCM_VERSION_DEV_RAW) + string(REGEX MATCH "^([0-9]+)\.([0-9]+)\.([0-9]+)-.*$" ROCM_VERSION_MATCH ${ROCM_VERSION_DEV_RAW}) + elseif (EXISTS "${onnxruntime_ROCM_HOME}/include/rocm_version.h") + file(READ "${onnxruntime_ROCM_HOME}/include/rocm_version.h" ROCM_VERSION_H_RAW) + string(REGEX MATCH "\"([0-9]+)\.([0-9]+)\.([0-9]+).*\"" ROCM_VERSION_MATCH ${ROCM_VERSION_H_RAW}) + elseif (EXISTS "${onnxruntime_ROCM_HOME}/include/rocm-core/rocm_version.h") + file(READ "${onnxruntime_ROCM_HOME}/include/rocm-core/rocm_version.h" ROCM_VERSION_H_RAW) + string(REGEX MATCH "\"([0-9]+)\.([0-9]+)\.([0-9]+).*\"" ROCM_VERSION_MATCH ${ROCM_VERSION_H_RAW}) + endif() + + if (ROCM_VERSION_MATCH) set(ROCM_VERSION_DEV_MAJOR ${CMAKE_MATCH_1}) set(ROCM_VERSION_DEV_MINOR ${CMAKE_MATCH_2}) set(ROCM_VERSION_DEV_PATCH ${CMAKE_MATCH_3}) set(ROCM_VERSION_DEV "${ROCM_VERSION_DEV_MAJOR}.${ROCM_VERSION_DEV_MINOR}.${ROCM_VERSION_DEV_PATCH}") math(EXPR ROCM_VERSION_DEV_INT "(${ROCM_VERSION_DEV_MAJOR}*10000) + (${ROCM_VERSION_DEV_MINOR}*100) + ${ROCM_VERSION_DEV_PATCH}") + else() + message(FATAL_ERROR "Cannot determine ROCm version string") endif() message("\n***** ROCm version from ${onnxruntime_ROCM_HOME}/.info/version-dev ****\n") message("ROCM_VERSION_DEV: ${ROCM_VERSION_DEV}") From 4838cb6b3e98273fcdd6a3e54da74cd584167780 Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Tue, 27 Feb 2024 02:27:35 -0800 Subject: [PATCH 58/68] [QNN Quantization] Ensure fused nodes have names (#19650) ### Description - Updates the `qnn_preprocess_model()` method to set a name for any new nodes added to the graph (due to fusion). - Updates the `qnn_preprocess_model()` method to set a name for any unnamed nodes that previously existed in the original graph. - Adds unit tests for fusions (previously missing) - Checks that fused node names exist and are unique - Checks that fused graph is equivalent to original graph ### Motivation and Context Nodes are not strictly required to have names. However, a planned/upcoming feature to support mixed-precision (integer) quantized models needs nodes to have names. --- .../execution_providers/qnn/fusion_lpnorm.py | 7 +- .../execution_providers/qnn/preprocess.py | 11 + .../tools/quantization/fusions/fusion.py | 15 + .../tools/quantization/fusions/fusion_gelu.py | 25 +- .../quantization/fusions/fusion_layernorm.py | 1 + .../python/tools/quantization/onnx_model.py | 17 + .../test/python/quantization/test_fusions.py | 401 ++++++++++++++++++ 7 files changed, 465 insertions(+), 12 deletions(-) create mode 100644 onnxruntime/test/python/quantization/test_fusions.py diff --git a/onnxruntime/python/tools/quantization/execution_providers/qnn/fusion_lpnorm.py b/onnxruntime/python/tools/quantization/execution_providers/qnn/fusion_lpnorm.py index 9ebf400498e0e..fbf954febdda4 100644 --- a/onnxruntime/python/tools/quantization/execution_providers/qnn/fusion_lpnorm.py +++ b/onnxruntime/python/tools/quantization/execution_providers/qnn/fusion_lpnorm.py @@ -122,6 +122,11 @@ def fuse( self.nodes_to_remove.extend(subgraph_nodes) fused_node = onnx.helper.make_node( - self.fused_op_type, inputs=[subgraph_input], outputs=[subgraph_output], p=2, axis=-1 + self.fused_op_type, + name=self.create_unique_node_name(), + inputs=[subgraph_input], + outputs=[subgraph_output], + p=2, + axis=-1, ) self.nodes_to_add.append(fused_node) diff --git a/onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py b/onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py index becbaceab184e..b1c114fe1f9fd 100644 --- a/onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py +++ b/onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py @@ -44,6 +44,17 @@ def qnn_preprocess_model(model_input: Path, model_output: Path, fuse_layernorm: if fusion_layernorm.apply(): modified = True + # Make sure all nodes have a name. + unnamed_node_prefix = "qnn_preproc_node_" + available_suffix = onnx_model.get_largest_node_name_suffix(unnamed_node_prefix) + 1 + for node in onnx_model.model.graph.node: + if node.op_type != "Constant" and not node.name: + new_node_name = f"{unnamed_node_prefix}{available_suffix!s}" + available_suffix += 1 + node.name = new_node_name + modified = True + logging.warning(f"Node of type {node.op_type} does not have a name. Renamed to {new_node_name}.") + if modified: onnx_model.topological_sort() onnx.save_model(model, model_output) diff --git a/onnxruntime/python/tools/quantization/fusions/fusion.py b/onnxruntime/python/tools/quantization/fusions/fusion.py index b54b421226f1a..4bdc5c26cc946 100644 --- a/onnxruntime/python/tools/quantization/fusions/fusion.py +++ b/onnxruntime/python/tools/quantization/fusions/fusion.py @@ -24,6 +24,9 @@ def __init__(self, model: ONNXModel, fused_op_type: str, search_op_type: str): self.nodes_to_remove: list = [] self.nodes_to_add: list = [] + self._new_node_name_prefix = self.fused_op_type + "_fused_" + self.search_op_type + "_" + self._new_node_name_suffix = None # int|None used to create unique node names for the fused ops. + def fuse( self, node: onnx.NodeProto, @@ -57,6 +60,18 @@ def apply(self) -> bool: return graph_updated + def create_unique_node_name(self): + prefix = self._new_node_name_prefix + + if self._new_node_name_suffix is None: + largest_suffix: int = self.model.get_largest_node_name_suffix(prefix) + self._new_node_name_suffix = largest_suffix + 1 + + new_name = f"{prefix}{self._new_node_name_suffix!s}" + self._new_node_name_suffix += 1 + + return new_name + @staticmethod def is_safe_to_fuse_nodes( nodes_to_remove: list[onnx.NodeProto], diff --git a/onnxruntime/python/tools/quantization/fusions/fusion_gelu.py b/onnxruntime/python/tools/quantization/fusions/fusion_gelu.py index a20d6dbffd7a7..42c4a11833641 100644 --- a/onnxruntime/python/tools/quantization/fusions/fusion_gelu.py +++ b/onnxruntime/python/tools/quantization/fusions/fusion_gelu.py @@ -112,7 +112,9 @@ def fuse_1( return False self.nodes_to_remove.extend(subgraph_nodes) - fused_node = onnx.helper.make_node("Gelu", inputs=[subgraph_input], outputs=[subgraph_output]) + fused_node = onnx.helper.make_node( + "Gelu", name=self.create_unique_node_name(), inputs=[subgraph_input], outputs=[subgraph_output] + ) fused_node.domain = "com.microsoft" self.nodes_to_add.append(fused_node) return True @@ -173,11 +175,9 @@ def fuse_2( if not self.has_constant_input(sqrt_node, 2.0): return False - root_node = self.model.get_parent(div, 0, output_name_to_node) - if root_node is None: - return False + subgraph_input = div.input[0] - if root_node.output[0] not in mul.input: + if subgraph_input not in mul.input: return False subgraph_nodes = [div, erf_node, add_after_erf, mul_after_erf, mul] @@ -188,7 +188,9 @@ def fuse_2( return False self.nodes_to_remove.extend(subgraph_nodes) - fused_node = onnx.helper.make_node("Gelu", inputs=[root_node.output[0]], outputs=[mul.output[0]]) + fused_node = onnx.helper.make_node( + "Gelu", name=self.create_unique_node_name(), inputs=[subgraph_input], outputs=[mul.output[0]] + ) fused_node.domain = "com.microsoft" self.nodes_to_add.append(fused_node) return True @@ -239,9 +241,8 @@ def fuse_3( if i < 0: return False - root_node = self.model.get_parent(first_mul, 0 if i == 1 else 1, output_name_to_node) - if root_node is None: - return False + root_input_index = 1 - i + subgraph_input = first_mul.input[root_input_index] if mul_half.output[0] not in input_name_to_nodes: return False @@ -250,7 +251,7 @@ def fuse_3( return False last_mul = children[0] - if not (last_mul.input[0] == root_node.output[0] or last_mul.input[1] == root_node.output[0]): + if not (last_mul.input[0] == subgraph_input or last_mul.input[1] == subgraph_input): return False subgraph_nodes = [first_mul, erf_node, add_after_erf, mul_half, last_mul] @@ -263,7 +264,9 @@ def fuse_3( return False self.nodes_to_remove.extend(subgraph_nodes) - fused_node = onnx.helper.make_node("Gelu", inputs=[root_node.output[0]], outputs=[last_mul.output[0]]) + fused_node = onnx.helper.make_node( + "Gelu", name=self.create_unique_node_name(), inputs=[subgraph_input], outputs=[last_mul.output[0]] + ) fused_node.domain = "com.microsoft" self.nodes_to_add.append(fused_node) return True diff --git a/onnxruntime/python/tools/quantization/fusions/fusion_layernorm.py b/onnxruntime/python/tools/quantization/fusions/fusion_layernorm.py index d7fb89236d3d2..7d58c1c180822 100644 --- a/onnxruntime/python/tools/quantization/fusions/fusion_layernorm.py +++ b/onnxruntime/python/tools/quantization/fusions/fusion_layernorm.py @@ -127,6 +127,7 @@ def fuse( normalize_node = onnx.helper.make_node( "LayerNormalization", + name=self.create_unique_node_name(), inputs=[reduce_mean_node.input[0], weight_input, bias_input], outputs=[last_add_node.output[0]], ) diff --git a/onnxruntime/python/tools/quantization/onnx_model.py b/onnxruntime/python/tools/quantization/onnx_model.py index 4591c9c950e6e..46d245d353a07 100644 --- a/onnxruntime/python/tools/quantization/onnx_model.py +++ b/onnxruntime/python/tools/quantization/onnx_model.py @@ -283,6 +283,23 @@ def find_node_by_name(self, node_name, new_nodes_list, graph): node = find_by_name(node_name, graph_nodes_list) return node + def get_largest_node_name_suffix(self, node_name_prefix): + """ + Gets the largest node name (int) suffix for all node names that begin with `node_name_prefix`. + Example: for nodes my_prefix_0 and my_prefix_3, this method returns 3. + """ + suffix = -1 + + for node in self.model.graph.node: + if node.name and node.name.startswith(node_name_prefix): + try: + index = int(node.name[len(node_name_prefix) :]) + suffix = max(index, suffix) + except ValueError: + continue + + return suffix + def find_nodes_by_initializer(self, graph, initializer): """ Find all nodes with given initializer as an input. diff --git a/onnxruntime/test/python/quantization/test_fusions.py b/onnxruntime/test/python/quantization/test_fusions.py new file mode 100644 index 0000000000000..bea110e566fb9 --- /dev/null +++ b/onnxruntime/test/python/quantization/test_fusions.py @@ -0,0 +1,401 @@ +#!/usr/bin/env python +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. +# -------------------------------------------------------------------------- + +import math +import unittest + +import numpy as np +import onnx + +import onnxruntime +from onnxruntime.quantization.execution_providers.qnn.fusion_lpnorm import FusionLpNormalization +from onnxruntime.quantization.fusions import FusionGelu, FusionLayerNormalization +from onnxruntime.quantization.onnx_model import ONNXModel + + +class TestFusions(unittest.TestCase): + def check_fused_model_correctness(self, orig_model, fused_model, inputs, rtol=1e-7, atol=0): + """ + Checks that the output of the fused model matches the output of the original model. + """ + orig_session = onnxruntime.InferenceSession(orig_model.SerializeToString(), providers=["CPUExecutionProvider"]) + orig_results = orig_session.run(None, inputs) + + fused_session = onnxruntime.InferenceSession( + fused_model.SerializeToString(), providers=["CPUExecutionProvider"] + ) + fused_results = fused_session.run([], inputs) + + self.assertEqual(len(orig_results), len(fused_results), "Number of outputs for fused model differs") + for idx, expected_output in enumerate(orig_results): + actual_output = fused_results[idx] + np.testing.assert_allclose( + expected_output, + actual_output, + rtol=rtol, + atol=atol, + err_msg=f"Fused model output {idx} differs", + ) + + def build_erf_sequence_1_model(self, shape): + """ + Erf sequence that fuses into Gelu: + +-------Mul(0.5)---------------------+ + | | + | v + [root] --> Div -----> Erf --> Add --> Mul --> + (B=1.4142...) (1) + + This method builds 2 of these Erf sequences: + + [root] -> ERF_SEQUENCE1 -> ERF_SEQUENCE2 -> output + """ + root_inp = onnx.helper.make_tensor_value_info("root", onnx.TensorProto.FLOAT, shape) + output = onnx.helper.make_tensor_value_info("output", onnx.TensorProto.FLOAT, shape) + one_const = onnx.numpy_helper.from_array(np.array(1.0, dtype=np.float32), "one_const") + half_const = onnx.numpy_helper.from_array(np.array(0.5, dtype=np.float32), "half_const") + root2_const = onnx.numpy_helper.from_array(np.array(math.sqrt(2.0), dtype=np.float32), "root2_const") + + # First Erf sequence + mul0_node = onnx.helper.make_node("Mul", ["root", "half_const"], ["mul0_out"]) + div_node = onnx.helper.make_node("Div", ["root", "root2_const"], ["div_out"]) + erf_node = onnx.helper.make_node("Erf", ["div_out"], ["erf_out"]) + add_node = onnx.helper.make_node("Add", ["erf_out", "one_const"], ["add_out"]) + mul1_node = onnx.helper.make_node("Mul", ["add_out", "mul0_out"], ["seq1_output"]) + + # Second Erf sequence + mul0_node_dup = onnx.helper.make_node("Mul", ["seq1_output", "half_const"], ["mul0_out_dup"]) + div_node_dup = onnx.helper.make_node("Div", ["seq1_output", "root2_const"], ["div_out_dup"]) + erf_node_dup = onnx.helper.make_node("Erf", ["div_out_dup"], ["erf_out_dup"]) + add_node_dup = onnx.helper.make_node("Add", ["erf_out_dup", "one_const"], ["add_out_dup"]) + mul1_node_dup = onnx.helper.make_node("Mul", ["add_out_dup", "mul0_out_dup"], ["output"]) + + graph = onnx.helper.make_graph( + [ + mul0_node, + div_node, + erf_node, + add_node, + mul1_node, + mul0_node_dup, + div_node_dup, + erf_node_dup, + add_node_dup, + mul1_node_dup, + ], + "two_erf_sequences", + [root_inp], + [output], + initializer=[one_const, half_const, root2_const], + ) + opset_imports = [ + onnx.helper.make_opsetid("", 18), + onnx.helper.make_opsetid("com.microsoft", 1), + ] + model = onnx.helper.make_model(graph, opset_imports=opset_imports) + return ONNXModel(model) + + def build_erf_sequence_2_model(self, shape): + """ + +------------------------------------+ + | | + | v + [root] --> Div -----> Erf --> Add --> Mul -->Mul --> + (B=1.4142...) (1) (0.5) + + """ + root_inp = onnx.helper.make_tensor_value_info("root", onnx.TensorProto.FLOAT, shape) + output = onnx.helper.make_tensor_value_info("output", onnx.TensorProto.FLOAT, shape) + one_const = onnx.numpy_helper.from_array(np.array(1.0, dtype=np.float32), "one_const") + half_const = onnx.numpy_helper.from_array(np.array(0.5, dtype=np.float32), "half_const") + root2_const = onnx.numpy_helper.from_array(np.array(math.sqrt(2.0), dtype=np.float32), "root2_const") + + div_node = onnx.helper.make_node("Div", ["root", "root2_const"], ["div_out"]) + erf_node = onnx.helper.make_node("Erf", ["div_out"], ["erf_out"]) + add_node = onnx.helper.make_node("Add", ["erf_out", "one_const"], ["add_out"]) + mul0_node = onnx.helper.make_node("Mul", ["add_out", "root"], ["mul0_out"]) + mul1_node = onnx.helper.make_node("Mul", ["mul0_out", "half_const"], ["output"]) + + graph = onnx.helper.make_graph( + [div_node, erf_node, add_node, mul0_node, mul1_node], + "erf_sequence_2", + [root_inp], + [output], + initializer=[one_const, half_const, root2_const], + ) + opset_imports = [ + onnx.helper.make_opsetid("", 18), + onnx.helper.make_opsetid("com.microsoft", 1), + ] + model = onnx.helper.make_model(graph, opset_imports=opset_imports) + return ONNXModel(model) + + def build_erf_sequence_3_model(self, shape): + """ + +------------------------------------------+ + | | + | v + [root] --> Div -----> Erf --> Add --> Mul -->Mul + (B=1.4142...) (A=1) (A=0.5) + + """ + root_inp = onnx.helper.make_tensor_value_info("root", onnx.TensorProto.FLOAT, shape) + output = onnx.helper.make_tensor_value_info("output", onnx.TensorProto.FLOAT, shape) + one_const = onnx.numpy_helper.from_array(np.array(1.0, dtype=np.float32), "one_const") + half_const = onnx.numpy_helper.from_array(np.array(0.5, dtype=np.float32), "half_const") + root2_const = onnx.numpy_helper.from_array(np.array(math.sqrt(2.0), dtype=np.float32), "root2_const") + + div_node = onnx.helper.make_node("Div", ["root", "root2_const"], ["div_out"]) + erf_node = onnx.helper.make_node("Erf", ["div_out"], ["erf_out"]) + add_node = onnx.helper.make_node("Add", ["erf_out", "one_const"], ["add_out"]) + mul0_node = onnx.helper.make_node("Mul", ["add_out", "half_const"], ["mul0_out"]) + mul1_node = onnx.helper.make_node("Mul", ["mul0_out", "root"], ["output"]) + + graph = onnx.helper.make_graph( + [div_node, erf_node, add_node, mul0_node, mul1_node], + "erf_sequence_3", + [root_inp], + [output], + initializer=[one_const, half_const, root2_const], + ) + opset_imports = [ + onnx.helper.make_opsetid("", 18), + onnx.helper.make_opsetid("com.microsoft", 1), + ] + model = onnx.helper.make_model(graph, opset_imports=opset_imports) + return ONNXModel(model) + + def build_erf_sequence_4_model(self, shape): + """ + +----------------------------------------------+ + | | + | v + [root] --> Mul -----> Erf --> Add --> Mul -->Mul + (A=0.7071067690849304) (B=1) (B=0.5) + + """ + root_inp = onnx.helper.make_tensor_value_info("root", onnx.TensorProto.FLOAT, shape) + output = onnx.helper.make_tensor_value_info("output", onnx.TensorProto.FLOAT, shape) + one_const = onnx.numpy_helper.from_array(np.array(1.0, dtype=np.float32), "one_const") + half_const = onnx.numpy_helper.from_array(np.array(0.5, dtype=np.float32), "half_const") + frac_const = onnx.numpy_helper.from_array(np.array(0.7071067690849304, dtype=np.float32), "frac_const") + + mul0_node = onnx.helper.make_node("Mul", ["root", "frac_const"], ["mul0_out"]) + erf_node = onnx.helper.make_node("Erf", ["mul0_out"], ["erf_out"]) + add_node = onnx.helper.make_node("Add", ["erf_out", "one_const"], ["add_out"]) + mul1_node = onnx.helper.make_node("Mul", ["add_out", "half_const"], ["mul1_out"]) + mul2_node = onnx.helper.make_node("Mul", ["mul1_out", "root"], ["output"]) + + graph = onnx.helper.make_graph( + [mul0_node, erf_node, add_node, mul1_node, mul2_node], + "erf_sequence_4", + [root_inp], + [output], + initializer=[one_const, half_const, frac_const], + ) + opset_imports = [ + onnx.helper.make_opsetid("", 18), + onnx.helper.make_opsetid("com.microsoft", 1), + ] + model = onnx.helper.make_model(graph, opset_imports=opset_imports) + return ONNXModel(model) + + def build_reduce_mean_sequence_model(self, shape, scale_val, bias_val, axis=-1): + """ + +----------------------+ + | | + | v + [Root] --> ReduceMean --> Sub --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add + (axis=2 or -1) | (Y=2) (axis=2 or -1) (E-6 or E-12 or 0) ^ ^ ^ + | | | | + +-------------------------------------------------+ [Scale] [Bias] + """ + root_inp = onnx.helper.make_tensor_value_info("root", onnx.TensorProto.FLOAT, shape) + output = onnx.helper.make_tensor_value_info("output", onnx.TensorProto.FLOAT, shape) + scale_const = onnx.numpy_helper.from_array(np.array(scale_val, dtype=np.float32), "scale_const") + bias_const = onnx.numpy_helper.from_array(np.array(bias_val, dtype=np.float32), "bias_const") + axes_const = onnx.numpy_helper.from_array(np.array([axis], dtype=np.int64), "axes_const") + two_const = onnx.numpy_helper.from_array(np.array(2.0, dtype=np.float32), "two_const") + eps_const = onnx.numpy_helper.from_array(np.array(1.0e-8, dtype=np.float32), "eps_const") + + rm0_node = onnx.helper.make_node("ReduceMean", ["root", "axes_const"], ["rm0_out"]) + sub_node = onnx.helper.make_node("Sub", ["root", "rm0_out"], ["sub_out"]) + pow_node = onnx.helper.make_node("Pow", ["sub_out", "two_const"], ["pow_out"]) + rm1_node = onnx.helper.make_node("ReduceMean", ["pow_out", "axes_const"], ["rm1_out"]) + add0_node = onnx.helper.make_node("Add", ["rm1_out", "eps_const"], ["add0_out"]) + sqrt_node = onnx.helper.make_node("Sqrt", ["add0_out"], ["sqrt_out"]) + div_node = onnx.helper.make_node("Div", ["sub_out", "sqrt_out"], ["div_out"]) + mul_node = onnx.helper.make_node("Mul", ["div_out", "scale_const"], ["mul_out"]) + add1_node = onnx.helper.make_node("Add", ["mul_out", "bias_const"], ["output"]) + + graph = onnx.helper.make_graph( + [rm0_node, sub_node, pow_node, rm1_node, add0_node, sqrt_node, div_node, mul_node, add1_node], + "reduce_mean_sequence", + [root_inp], + [output], + initializer=[scale_const, bias_const, axes_const, two_const, eps_const], + ) + opset_imports = [ + onnx.helper.make_opsetid("", 18), + ] + model = onnx.helper.make_model(graph, opset_imports=opset_imports) + return ONNXModel(model) + + def build_reduce_l2_sequence_model(self, shape, epsilon_val, axis=-1): + """ + [root] --> ReduceL2 -----> Clip --> Expand ----> Div --> + | (axis=-1) (min=epsilon) (shape=root) ^ + | (keepdims=True) | + | | + +-----------------------------------------------+ + """ + root_inp = onnx.helper.make_tensor_value_info("root", onnx.TensorProto.FLOAT, shape) + output = onnx.helper.make_tensor_value_info("output", onnx.TensorProto.FLOAT, shape) + axes_const = onnx.numpy_helper.from_array(np.array([axis], dtype=np.int64), "axes_const") + eps_const = onnx.numpy_helper.from_array(np.array(epsilon_val, dtype=np.float32), "eps_const") + shape_const = onnx.numpy_helper.from_array(np.array(list(shape), dtype=np.int64), "shape_const") + + rl2_node = onnx.helper.make_node("ReduceL2", ["root", "axes_const"], ["rl2_out"], keepdims=1) + clip_node = onnx.helper.make_node("Clip", ["rl2_out", "eps_const"], ["clip_out"]) + expand_node = onnx.helper.make_node("Expand", ["clip_out", "shape_const"], ["expand_out"]) + div_node = onnx.helper.make_node("Div", ["root", "expand_out"], ["output"]) + + graph = onnx.helper.make_graph( + [rl2_node, clip_node, expand_node, div_node], + "reducel2_sequence", + [root_inp], + [output], + initializer=[axes_const, eps_const, shape_const], + ) + opset_imports = [ + onnx.helper.make_opsetid("", 18), + ] + model = onnx.helper.make_model(graph, opset_imports=opset_imports) + return ONNXModel(model) + + def test_fuse_erf_to_gelu_1(self): + shape = (1, 2, 3) + model = self.build_erf_sequence_1_model(shape) + orig_model = onnx.ModelProto() + orig_model.CopyFrom(model.model) + + # Check that fusion simplified model to 2 Gelu nodes. + modified = FusionGelu(model).apply() + self.assertTrue(modified) + self.assertEqual(len(model.model.graph.node), 2) + + gelu_node_0 = model.model.graph.node[0] + gelu_node_1 = model.model.graph.node[1] + self.assertEqual(gelu_node_0.op_type, "Gelu") + self.assertEqual(gelu_node_1.op_type, "Gelu") + + self.assertTrue(gelu_node_0.name) + self.assertTrue(gelu_node_1.name) + self.assertNotEqual(gelu_node_0.name, gelu_node_1.name) # Generated names should not be equal + + # Check that fusion is equivalent to original Erf model. + inputs = {"root": np.ones(shape, dtype=np.float32)} + self.check_fused_model_correctness(orig_model, model.model, inputs) + + def test_fuse_erf_to_gelu_2(self): + shape = (1, 2, 3) + model = self.build_erf_sequence_2_model(shape) + orig_model = onnx.ModelProto() + orig_model.CopyFrom(model.model) + + # Check that fusion simplified model to 1 Gelu node. + modified = FusionGelu(model).apply() + self.assertTrue(modified) + self.assertEqual(len(model.model.graph.node), 1) + + gelu_node = model.model.graph.node[0] + self.assertEqual(gelu_node.op_type, "Gelu") + self.assertTrue(gelu_node.name) + + # Check that fusion is equivalent to original Erf model. + inputs = {"root": np.ones(shape, dtype=np.float32)} + self.check_fused_model_correctness(orig_model, model.model, inputs) + + def test_fuse_erf_to_gelu_3(self): + shape = (1, 2, 3) + model = self.build_erf_sequence_3_model(shape) + orig_model = onnx.ModelProto() + orig_model.CopyFrom(model.model) + + # Check that fusion simplified model to 1 Gelu node. + modified = FusionGelu(model).apply() + self.assertTrue(modified) + self.assertEqual(len(model.model.graph.node), 1) + + gelu_node = model.model.graph.node[0] + self.assertEqual(gelu_node.op_type, "Gelu") + self.assertTrue(gelu_node.name) + + # Check that fusion is equivalent to original Erf model. + inputs = {"root": np.ones(shape, dtype=np.float32)} + self.check_fused_model_correctness(orig_model, model.model, inputs) + + def test_fuse_erf_to_gelu_4(self): + shape = (1, 2, 3) + model = self.build_erf_sequence_4_model(shape) + orig_model = onnx.ModelProto() + orig_model.CopyFrom(model.model) + + # Check that fusion simplified model to 1 Gelu node. + modified = FusionGelu(model).apply() + self.assertTrue(modified) + self.assertEqual(len(model.model.graph.node), 1) + + gelu_node = model.model.graph.node[0] + self.assertEqual(gelu_node.op_type, "Gelu") + self.assertTrue(gelu_node.name) + + # Check that fusion is equivalent to original Erf model. + inputs = {"root": np.ones(shape, dtype=np.float32)} + self.check_fused_model_correctness(orig_model, model.model, inputs) + + def test_fuse_reduce_l2_to_lpnorm(self): + shape = (1, 2, 3) + model = self.build_reduce_l2_sequence_model(shape, 1e-12, axis=-1) + orig_model = onnx.ModelProto() + orig_model.CopyFrom(model.model) + + # Check that fusion simplified model to 1 LpNormalization node. + modified = FusionLpNormalization(model).apply() + self.assertTrue(modified) + self.assertEqual(len(model.model.graph.node), 1) + + lpnorm_node = model.model.graph.node[0] + self.assertEqual(lpnorm_node.op_type, "LpNormalization") + self.assertTrue(lpnorm_node.name) + + # LpNorm's p attribute should be set to 2 + p_attr = next(attr for attr in lpnorm_node.attribute if attr.name == "p") + self.assertEqual(p_attr.i, 2) + + def test_fuse_reduce_mean_to_layer_norm(self): + shape = (1, 2, 3) + model = self.build_reduce_mean_sequence_model(shape, [2.0, 2.0, 2.0], [1.0, 1.0, 1.0], axis=-1) + orig_model = onnx.ModelProto() + orig_model.CopyFrom(model.model) + + # Check that fusion simplified model to 1 LayerNormalization node. + modified = FusionLayerNormalization(model).apply() + self.assertTrue(modified) + self.assertEqual(len(model.model.graph.node), 1) + + layer_norm_node = model.model.graph.node[0] + self.assertEqual(layer_norm_node.op_type, "LayerNormalization") + self.assertTrue(layer_norm_node.name) + + # Check that fused model is equivalent to original model. + inputs = {"root": np.ones(shape, dtype=np.float32)} + self.check_fused_model_correctness(orig_model, model.model, inputs) + + +if __name__ == "__main__": + unittest.main() From 3b46ab643944a3bcc9e4d9eb2c155ead0bad5cdb Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Wed, 28 Feb 2024 00:46:29 +0800 Subject: [PATCH 59/68] Re-add testing removed by mistake. (#19647) --- .../azure-pipelines/linux-ci-pipeline.yml | 42 ++++++++++++++++++- .../docker/scripts/manylinux/requirements.txt | 1 + 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml index a4bd24b4dd18b..02147c321fab3 100644 --- a/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml @@ -115,6 +115,7 @@ stages: searchFolder: '$(Build.BinariesDirectory)' testRunTitle: 'Unit Test Run' condition: succeededOrFailed() + - job: Linux_Release timeoutInMinutes: 180 workspace: @@ -243,7 +244,46 @@ stages: ln -s /data/models $(Build.BinariesDirectory)/models displayName: link model dir - + - bash: | + mkdir -p $HOME/.onnx + docker run --rm \ + --volume /data/onnx:/data/onnx:ro \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory):/build \ + --volume /data/models:/build/models:ro \ + --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \ + -e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \ + -e NIGHTLY_BUILD \ + -e BUILD_BUILDNUMBER \ + onnxruntimecpubuild \ + /bin/bash -c " + set -ex; \ + pushd /onnxruntime_src/csharp; \ + dotnet restore /onnxruntime_src/csharp/OnnxRuntime.DesktopOnly.CSharp.sln; \ + dotnet build /onnxruntime_src/csharp/OnnxRuntime.DesktopOnly.CSharp.sln -c Release; \ + dotnet test /onnxruntime_src/csharp/OnnxRuntime.DesktopOnly.CSharp.sln -c Release -f net6.0 --no-build -l \"console;verbosity=normal\"; \ + popd + " + displayName: 'Dotnet build C# sln and Test' + + - bash: | + mkdir -p $HOME/.onnx + docker run --rm \ + --volume /data/onnx:/data/onnx:ro \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory):/build \ + --volume /data/models:/build/models:ro \ + --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \ + -e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \ + -e NIGHTLY_BUILD \ + -e BUILD_BUILDNUMBER \ + onnxruntimecpubuild \ + /bin/bash -c " + set -ex; \ + /bin/bash /onnxruntime_src/tools/scripts/python_test.sh /onnxruntime_src /build Release && \ + /bin/bash /onnxruntime_src/tools/scripts/symbolic_shape_infer_test.sh /build + " + displayName: 'Run Release tests and symbolic shape infer test' - task: PublishTestResults@2 displayName: 'Publish unit test results' diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt index 94f52f476579b..886f19388d01e 100644 --- a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt @@ -10,3 +10,4 @@ protobuf==4.21.12 sympy==1.12 flatbuffers neural-compressor>=2.2.1 +triton From 580ee20dfce2849029229eb213dc8c7c87a89483 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Wed, 28 Feb 2024 02:56:16 +1000 Subject: [PATCH 60/68] Tweak Windows build parallelization settings (#19664) ### Description Use UseMultiToolTask and limit the number of cl.exe instances running. MultiToolTask info: https://devblogs.microsoft.com/cppblog/improved-parallelism-in-msbuild/ Info on why limiting CL_MPCount can help: https://github.com/Microsoft/checkedc-clang/wiki/Parallel-builds-of-clang-on-Windows The current CIs have 4 cores (both physical and logical). Hardcoded the GPU build in win-ci.yml to use CL_MPCount of 2 as that seems to work fine. Can adjust if needed to base it on the actual number of cores or to use build.py to build. Caveat: I've run about 16 builds and haven't seen a slow build yet, but as the root cause of the slow builds isn't really known this isn't guaranteed to be a fix. ### Motivation and Context Try and prevent super slow GPU builds by reducing number of tasks potentially running in parallel. --- tools/ci_build/build.py | 15 ++++++++++++++- .../github/azure-pipelines/templates/win-ci.yml | 3 ++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 74c473d34f548..1056c4ed84510 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1451,6 +1451,13 @@ def generate_build_tree( # tools need to use the symbols. add_default_definition(cmake_extra_defines, "CMAKE_MSVC_DEBUG_INFORMATION_FORMAT", "ProgramDatabase") + if number_of_parallel_jobs(args) > 0: + # https://devblogs.microsoft.com/cppblog/improved-parallelism-in-msbuild/ + # NOTE: this disables /MP if set (according to comments on blog post). + # By default, MultiProcMaxCount and CL_MPCount value are equal to the number of CPU logical processors. + # See logic around setting CL_MPCount below + cmake_args += ["-DCMAKE_VS_GLOBALS=UseMultiToolTask=true;EnforceProcessCountAcrossBuilds=true"] + cmake_args += [f"-D{define}" for define in cmake_extra_defines] cmake_args += cmake_extra_args @@ -1662,11 +1669,17 @@ def build_targets(args, cmake_path, build_dir, configs, num_parallel_jobs, targe build_tool_args = [] if num_parallel_jobs != 1: if is_windows() and args.cmake_generator != "Ninja" and not args.build_wasm: + # https://github.com/Microsoft/checkedc-clang/wiki/Parallel-builds-of-clang-on-Windows suggests + # not maxing out CL_MPCount + # Start by having one less than num_parallel_jobs (default is num logical cores), + # limited to a range of 1..3 + # that gives maxcpucount projects building using up to 3 cl.exe instances each build_tool_args += [ f"/maxcpucount:{num_parallel_jobs}", + # one less than num_parallel_jobs, at least 1, up to 3 + f"/p:CL_MPCount={min(max(num_parallel_jobs - 1, 1), 3)}", # if nodeReuse is true, msbuild processes will stay around for a bit after the build completes "/nodeReuse:False", - f"/p:CL_MPCount={num_parallel_jobs}", ] elif args.cmake_generator == "Xcode": build_tool_args += [ diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index 8ed22153fd947..e32956d6eb913 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -162,10 +162,11 @@ stages: platform: ${{ parameters.msbuildPlatform }} configuration: RelWithDebInfo msbuildArchitecture: ${{ parameters.buildArch }} - maximumCpuCount: true + maximumCpuCount: true # default is num logical cores worth of projects building concurrently logProjectEvents: true workingFolder: '$(Build.BinariesDirectory)\RelWithDebInfo' createLogFile: true + msbuildArgs: "/p:CL_MPCount=2" # 2x cl.exe per project building. - task: PythonScript@0 displayName: 'test' From 1c468a03b90aa8122d49b3148152a67b0519d36e Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Wed, 28 Feb 2024 03:27:43 +1000 Subject: [PATCH 61/68] Improve Nuget-CUDA-Packaging-Pipeline (#19668) ### Description * Publish the artifacts as late as possible * once published the artifacts are immutable, and any retry will fail if they exist * if any step fails after publishing the stage cannot be retried * use powershell to cleanup * DeleteFiles is taking >30 mins and causing the stage to timeout * powershell took < 1s ### Motivation and Context Make pipeline more robust --- .../stages/nuget-combine-cuda-stage.yml | 13 ++++++------- ...mponent-governance-component-detection-steps.yml | 7 ++----- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml index 8ca3d9148b514..064e2ea91d194 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml @@ -213,13 +213,6 @@ stages: PlatformsSupported: 'linux-x64' VerifyNugetSigning: false - - task: PublishPipelineArtifact@0 - displayName: 'Publish Pipeline NuGet Artifact' - inputs: - artifactName: 'drop-signed-nuget-GPU' - targetPath: '$(Build.ArtifactStagingDirectory)' - - - task: MSBuild@1 displayName: 'Clean C#' inputs: @@ -241,6 +234,12 @@ stages: parameters: condition: 'succeeded' + - task: PublishPipelineArtifact@0 + displayName: 'Publish Pipeline NuGet Artifact' + inputs: + artifactName: 'drop-signed-nuget-GPU' + targetPath: '$(Build.ArtifactStagingDirectory)' + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 displayName: 'Clean Agent Directories' condition: always() diff --git a/tools/ci_build/github/azure-pipelines/templates/component-governance-component-detection-steps.yml b/tools/ci_build/github/azure-pipelines/templates/component-governance-component-detection-steps.yml index f1418e75bffa2..3d128fdb78eee 100644 --- a/tools/ci_build/github/azure-pipelines/templates/component-governance-component-detection-steps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/component-governance-component-detection-steps.yml @@ -6,11 +6,8 @@ parameters: steps: - ${{ if eq(variables['System.TeamProject'], 'Lotus') }}: - - task: DeleteFiles@1 - inputs: - SourceFolder: '$(Build.BinariesDirectory)' - contents: | - **/* + - powershell: | + Remove-Item $(Build.BinariesDirectory)/* -Recurse -Force displayName: 'Clean up build directory' - task: ms.vss-governance-buildtask.governance-build-task-component-detection.ComponentGovernanceComponentDetection@0 From 2e4d1b8f1ba928fe5879077eced9cd5191760cfb Mon Sep 17 00:00:00 2001 From: zesongw Date: Wed, 28 Feb 2024 02:01:12 +0800 Subject: [PATCH 62/68] [WebNN EP] Add support for Op MatMul of WebNN CPU backend (#19413) Enable MatMul support for WebNN CPU backend to support more models. --- onnxruntime/core/providers/webnn/builders/helper.h | 2 +- .../webnn/builders/impl/gemm_op_builder.cc | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/onnxruntime/core/providers/webnn/builders/helper.h b/onnxruntime/core/providers/webnn/builders/helper.h index d94729e60d029..d7892fe02c1ba 100644 --- a/onnxruntime/core/providers/webnn/builders/helper.h +++ b/onnxruntime/core/providers/webnn/builders/helper.h @@ -195,7 +195,7 @@ static const InlinedHashMap op_map = { {"LessOrEqual", {"lesserOrEqual", false}}, {"Log", {"log", false}}, {"LpPool", {"l2Pool2d", false}}, - {"MatMul", {"matmul", false}}, + {"MatMul", {"matmul", true}}, {"MatMulInteger", {"matmulInteger", false}}, {"Max", {"max", true}}, {"MaxPool", {"maxPool2d", true}}, diff --git a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc index 4bf991a1b0105..d5f84f853f7de 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc @@ -29,7 +29,7 @@ class GemmOpBuilder : public BaseOpBuilder { // Add operator related. Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, - const logging::Logger& /* logger */) const { + const logging::Logger& logger) const { const auto& op_type = node.OpType(); const auto& input_defs = node.InputDefs(); const size_t a_idx = 0, b_idx = 1, c_idx = 2; // A*B+C @@ -38,7 +38,17 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N emscripten::val b = model_builder.GetOperand(node.InputDefs()[b_idx]->Name()); emscripten::val output = emscripten::val::object(); if (op_type == "MatMul") { - output = model_builder.GetBuilder().call("matmul", a, b); + std::vector a_shape; + if (!GetShape(*input_defs[a_idx], a_shape, logger)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Can not get shape of A."); + } + // The inputs of MatMul must be at least 3D for WebNN CPU backend. Use GEMM for 2D case. + // TODO: Remove this workaround when it is fixed in Chromium. + if (model_builder.GetWebnnDeviceType() == WebnnDeviceType::CPU && a_shape.size() == 2) { + output = model_builder.GetBuilder().call("gemm", a, b); + } else { + output = model_builder.GetBuilder().call("matmul", a, b); + } } else if (op_type == "MatMulInteger") { emscripten::val a_zero_point = emscripten::val::null(); emscripten::val b_zero_point = emscripten::val::null(); From 3cb81cdde25d059af5674506f6a5b899c9c0f5ee Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 27 Feb 2024 11:07:15 -0800 Subject: [PATCH 63/68] [js/common] move 'env.wasm.trace' to 'env.trace' (#19617) ### Description Try to move 'env.wasm.trace' to 'env.trace' to make it less confusing, because it also works in webgpu. Marked 'env.wasm.trace' as deprecated. --- js/common/lib/env.ts | 9 +++++++++ js/common/lib/trace.ts | 6 +++--- js/web/lib/wasm/jsep/backend-webgpu.ts | 3 ++- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/js/common/lib/env.ts b/js/common/lib/env.ts index 6299c26159400..73a47d1a4f937 100644 --- a/js/common/lib/env.ts +++ b/js/common/lib/env.ts @@ -36,6 +36,7 @@ export declare namespace Env { /** * set or get a boolean value indicating whether to enable trace. * + * @deprecated Use `env.trace` instead. If `env.trace` is set, this property will be ignored. * @defaultValue `false` */ trace?: boolean; @@ -167,6 +168,7 @@ export interface Env { * @defaultValue `'warning'` */ logLevel?: 'verbose'|'info'|'warning'|'error'|'fatal'; + /** * Indicate whether run in debug mode. * @@ -174,6 +176,13 @@ export interface Env { */ debug?: boolean; + /** + * set or get a boolean value indicating whether to enable trace. + * + * @defaultValue `false` + */ + trace?: boolean; + /** * Get version of the current package. */ diff --git a/js/common/lib/trace.ts b/js/common/lib/trace.ts index 404f7ef8089af..7e0487b350198 100644 --- a/js/common/lib/trace.ts +++ b/js/common/lib/trace.ts @@ -4,7 +4,7 @@ import {env} from './env-impl.js'; export const TRACE = (deviceType: string, label: string) => { - if (!env.wasm.trace) { + if (typeof env.trace === 'undefined' ? !env.wasm.trace : !env.trace) { return; } // eslint-disable-next-line no-console @@ -30,14 +30,14 @@ const TRACE_FUNC = (msg: string, extraMsg?: string) => { }; export const TRACE_FUNC_BEGIN = (extraMsg?: string) => { - if (!env.wasm.trace) { + if (typeof env.trace === 'undefined' ? !env.wasm.trace : !env.trace) { return; } TRACE_FUNC('BEGIN', extraMsg); }; export const TRACE_FUNC_END = (extraMsg?: string) => { - if (!env.wasm.trace) { + if (typeof env.trace === 'undefined' ? !env.wasm.trace : !env.trace) { return; } TRACE_FUNC('END', extraMsg); diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts index 3e3a191ec3ead..27c5566ab9fed 100644 --- a/js/web/lib/wasm/jsep/backend-webgpu.ts +++ b/js/web/lib/wasm/jsep/backend-webgpu.ts @@ -710,7 +710,8 @@ export class WebGpuBackend { } setQueryType(): void { this.queryType = 'none'; - if (this.env.webgpu.profiling?.mode === 'default' || this.env.wasm.trace) { + if (this.env.webgpu.profiling?.mode === 'default' || + (typeof this.env.trace === 'undefined' ? this.env.wasm.trace : this.env.trace)) { if (this.device.features.has('chromium-experimental-timestamp-query-inside-passes')) { this.queryType = 'inside-passes'; } else if (this.device.features.has('timestamp-query')) { From c20ced4132d111e3e63844e292f0d8e318cffab2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20M=C3=BCller?= <44298237+gedoensmax@users.noreply.github.com> Date: Tue, 27 Feb 2024 20:26:48 +0100 Subject: [PATCH 64/68] Use CMake's find package for CUDA libs (#19673) ### Description Answers issue #19640 More details are in the issue, basically I am changing all the include directory and link directory usage to CMake's `CUDA::*` targets --- cmake/CMakeLists.txt | 4 ++++ cmake/adjust_global_compile_flags.cmake | 2 +- .../external/onnxruntime_external_deps.cmake | 3 +-- cmake/onnxruntime_providers_cuda.cmake | 20 +++++++++---------- cmake/onnxruntime_providers_tensorrt.cmake | 11 +++++----- cmake/onnxruntime_python.cmake | 5 +---- cmake/onnxruntime_unittests.cmake | 4 ++-- .../core/providers/cuda/nvtx_profile.cc | 5 ----- 8 files changed, 25 insertions(+), 29 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 1376c90fbcefe..8453da19ce3a6 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -1412,6 +1412,10 @@ endif() if (onnxruntime_USE_CUDA) set(CMAKE_CUDA_RUNTIME_LIBRARY Shared) set(CMAKE_CUDA_STANDARD 17) + if(onnxruntime_CUDA_HOME) + file(TO_CMAKE_PATH CUDAToolkit_ROOT ${onnxruntime_CUDA_HOME}) + endif() + find_package(CUDAToolkit REQUIRED) if(onnxruntime_CUDNN_HOME) file(TO_CMAKE_PATH ${onnxruntime_CUDNN_HOME} onnxruntime_CUDNN_HOME) endif() diff --git a/cmake/adjust_global_compile_flags.cmake b/cmake/adjust_global_compile_flags.cmake index 8161ea574b8cc..d3f9256105127 100644 --- a/cmake/adjust_global_compile_flags.cmake +++ b/cmake/adjust_global_compile_flags.cmake @@ -205,7 +205,7 @@ endif() macro(check_nvcc_compiler_flag _FLAG _RESULT) - execute_process(COMMAND ${onnxruntime_CUDA_HOME}/bin/nvcc "${_FLAG}" RESULT_VARIABLE NVCC_OUT ERROR_VARIABLE NVCC_ERROR) + execute_process(COMMAND ${CUDAToolkit_BIN_DIR}/nvcc "${_FLAG}" RESULT_VARIABLE NVCC_OUT ERROR_VARIABLE NVCC_ERROR) message("NVCC_ERROR = ${NVCC_ERROR}") message("NVCC_OUT = ${NVCC_OUT}") if ("${NVCC_OUT}" MATCHES "0") diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index 22d12b128dc1f..09d57164b4ee1 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -556,16 +556,15 @@ message("Finished fetching external dependencies") set(onnxruntime_LINK_DIRS ) if (onnxruntime_USE_CUDA) #TODO: combine onnxruntime_CUDNN_HOME and onnxruntime_CUDA_HOME, assume they are the same + find_package(CUDAToolkit REQUIRED) if (WIN32) if(onnxruntime_CUDNN_HOME) list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDNN_HOME}/lib ${onnxruntime_CUDNN_HOME}/lib/x64) endif() - list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDA_HOME}/x64/lib64) else() if(onnxruntime_CUDNN_HOME) list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDNN_HOME}/lib ${onnxruntime_CUDNN_HOME}/lib64) endif() - list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDA_HOME}/lib64) endif() endif() diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index 9887d615c92d7..0f6d48bdb6ec8 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -178,15 +178,16 @@ add_dependencies(${target} onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES}) if(onnxruntime_CUDA_MINIMAL) target_compile_definitions(${target} PRIVATE USE_CUDA_MINIMAL) - target_link_libraries(${target} PRIVATE ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 safeint_interface) + target_link_libraries(${target} PRIVATE ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 safeint_interface CUDA::cudart) else() - target_link_libraries(${target} PRIVATE cublasLt cublas cudnn curand cufft ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 safeint_interface) + target_link_libraries(${target} PRIVATE CUDA::cublasLt CUDA::cublas cudnn CUDA::curand CUDA::cufft CUDA::cudart + ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 safeint_interface) if(onnxruntime_CUDNN_HOME) target_include_directories(${target} PRIVATE ${onnxruntime_CUDNN_HOME}/include) target_link_directories(${target} PRIVATE ${onnxruntime_CUDNN_HOME}/lib) endif() endif() - + if (onnxruntime_USE_TRITON_KERNEL) # compile triton kernel, generate .a and .h files include(onnxruntime_compile_triton_kernel.cmake) @@ -196,25 +197,24 @@ target_include_directories(${target} PRIVATE ${triton_kernel_header_dir}) target_link_libraries(${target} PUBLIC -Wl,--whole-archive ${triton_kernel_obj_file} -Wl,--no-whole-archive) # lib cuda needed by cuLaunchKernel - target_link_libraries(${target} PRIVATE cuda) + target_link_libraries(${target} PRIVATE CUDA::cuda_driver) endif() include(cutlass) target_include_directories(${target} PRIVATE ${cutlass_SOURCE_DIR}/include ${cutlass_SOURCE_DIR}/examples) - target_include_directories(${target} PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} ${TVM_INCLUDES} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + target_include_directories(${target} PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} ${TVM_INCLUDES} + PUBLIC ${CUDAToolkit_INCLUDE_DIRS}) # ${CMAKE_CURRENT_BINARY_DIR} is so that #include "onnxruntime_config.h" inside tensor_shape.h is found set_target_properties(${target} PROPERTIES LINKER_LANGUAGE CUDA) set_target_properties(${target} PROPERTIES FOLDER "ONNXRuntime") if (onnxruntime_ENABLE_CUDA_PROFILING) # configure cupti for cuda profiling - target_include_directories(${target} PRIVATE ${onnxruntime_CUDA_HOME}/extras/CUPTI/include) - target_link_directories(${target} PRIVATE ${onnxruntime_CUDA_HOME}/extras/CUPTI/lib64) - target_link_libraries(${target} PRIVATE cupti) + target_link_libraries(${target} PRIVATE CUDA::cupti) endif() - if (onnxruntime_ENABLE_NVTX_PROFILE AND NOT WIN32) - target_link_libraries(${target} PRIVATE nvToolsExt) + if (onnxruntime_ENABLE_NVTX_PROFILE) + target_link_libraries(${target} PRIVATE CUDA::nvtx3) endif() if (onnxruntime_ENABLE_TRAINING_OPS) diff --git a/cmake/onnxruntime_providers_tensorrt.cmake b/cmake/onnxruntime_providers_tensorrt.cmake index 686a993de3a4a..15ffc29e79ff4 100644 --- a/cmake/onnxruntime_providers_tensorrt.cmake +++ b/cmake/onnxruntime_providers_tensorrt.cmake @@ -8,7 +8,7 @@ set(BUILD_LIBRARY_ONLY 1) add_definitions("-DONNX_ML=1") add_definitions("-DONNX_NAMESPACE=onnx") - set(CUDA_INCLUDE_DIRS ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + set(CUDA_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIRS}) set(TENSORRT_ROOT ${onnxruntime_TENSORRT_HOME}) set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) set(PROTOBUF_LIBRARY ${PROTOBUF_LIB}) @@ -58,7 +58,7 @@ URL_HASH SHA1=${DEP_SHA1_onnx_tensorrt} ) if (NOT CUDA_INCLUDE_DIR) - set(CUDA_INCLUDE_DIR ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) # onnx-tensorrt repo needs this variable to build + set(CUDA_INCLUDE_DIR ${CUDAToolkit_INCLUDE_DIRS}) # onnx-tensorrt repo needs this variable to build endif() # The onnx_tensorrt repo contains a test program, getSupportedAPITest, which doesn't support Windows. It uses # unistd.h. So we must exclude it from our build. onnxruntime_fetchcontent_makeavailable is for the purpose. @@ -102,11 +102,12 @@ onnxruntime_add_include_to_target(onnxruntime_providers_tensorrt onnxruntime_common onnx flatbuffers::flatbuffers Boost::mp11 safeint_interface) add_dependencies(onnxruntime_providers_tensorrt onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES}) if (onnxruntime_USE_TENSORRT_BUILTIN_PARSER) - target_link_libraries(onnxruntime_providers_tensorrt PRIVATE ${trt_link_libs} cudart ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers Boost::mp11 safeint_interface ${ABSEIL_LIBS}) + target_link_libraries(onnxruntime_providers_tensorrt PRIVATE ${trt_link_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers Boost::mp11 safeint_interface ${ABSEIL_LIBS} PUBLIC CUDA::cudart) else() - target_link_libraries(onnxruntime_providers_tensorrt PRIVATE ${onnxparser_link_libs} ${trt_link_libs} cudart ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers ${ABSEIL_LIBS}) + target_link_libraries(onnxruntime_providers_tensorrt PRIVATE ${onnxparser_link_libs} ${trt_link_libs} ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers ${ABSEIL_LIBS} PUBLIC CUDA::cudart) endif() - target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} + PUBLIC ${CUDAToolkit_INCLUDE_DIRS}) if(onnxruntime_CUDNN_HOME) target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${onnxruntime_CUDNN_HOME}/include) endif() diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake index 3f20787e87425..23c6e5e430875 100644 --- a/cmake/onnxruntime_python.cmake +++ b/cmake/onnxruntime_python.cmake @@ -282,10 +282,7 @@ if (WIN32) get_filename_component(CUDNN_DLL_NAME ${CUDNN_DLL_PATH} NAME_WE) string(REPLACE "cudnn64_" "" CUDNN_VERSION "${CUDNN_DLL_NAME}") if(NOT onnxruntime_CUDA_VERSION) - message("Reading json file ${onnxruntime_CUDA_HOME}/version.json") - set(CUDA_SDK_JSON_FILE_PATH "${onnxruntime_CUDA_HOME}/version.json") - file(READ ${CUDA_SDK_JSON_FILE_PATH} CUDA_SDK_JSON_CONTENT) - string(JSON onnxruntime_CUDA_VERSION GET ${CUDA_SDK_JSON_CONTENT} "cuda" "version") + set(onnxruntime_CUDA_VERSION ${CUDAToolkit_VERSION}) message("onnxruntime_CUDA_VERSION=${onnxruntime_CUDA_VERSION}") endif() file(APPEND "${VERSION_INFO_FILE}" diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 3ed695327c183..88f662075e177 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -67,7 +67,7 @@ function(AddTest) if(onnxruntime_USE_CUDA) #XXX: we should not need to do this. onnxruntime_test_all.exe should not have direct dependency on CUDA DLLs, # otherwise it will impact when CUDA DLLs can be unloaded. - target_link_libraries(${_UT_TARGET} PRIVATE cudart) + target_link_libraries(${_UT_TARGET} PRIVATE CUDA::cudart) endif() target_link_libraries(${_UT_TARGET} PRIVATE ${_UT_LIBS} GTest::gtest GTest::gmock ${onnxruntime_EXTERNAL_LIBRARIES}) endif() @@ -1268,7 +1268,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) list(APPEND onnxruntime_shared_lib_test_LIBS cpuinfo) endif() if (onnxruntime_USE_CUDA) - list(APPEND onnxruntime_shared_lib_test_LIBS cudart) + list(APPEND onnxruntime_shared_lib_test_LIBS CUDA::cudart) endif() if (onnxruntime_USE_ROCM) list(APPEND onnxruntime_shared_lib_test_LIBS hip::host) diff --git a/onnxruntime/core/providers/cuda/nvtx_profile.cc b/onnxruntime/core/providers/cuda/nvtx_profile.cc index 6c7c594066b86..867e7c1f24584 100644 --- a/onnxruntime/core/providers/cuda/nvtx_profile.cc +++ b/onnxruntime/core/providers/cuda/nvtx_profile.cc @@ -4,13 +4,8 @@ #ifdef ENABLE_NVTX_PROFILE #include "nvtx_profile.h" #include "core/common/common.h" -#if defined(_WIN32) || defined(WIN32) || defined(__CYGWIN__) || defined(__MINGW32__) || defined(__BORLANDC__) #include #include -#else -#include -#include -#endif namespace onnxruntime { namespace profile { From f95c0773a129a4605b2161f5f9fddb8116c948d0 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Wed, 28 Feb 2024 10:40:40 +0800 Subject: [PATCH 65/68] Add share memory Flag in docker (#19672) ### Description ### Motivation and Context Ref: https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#setincshmem Co-authored-by: Your Name --- tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml index 822bc559d992d..165bd804a8ad5 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml @@ -241,7 +241,7 @@ stages: script: | set -e -x mkdir -p $HOME/.onnx - docker run --gpus all --rm \ + docker run --gpus all --shm-size=1g --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --rm \ --volume $(Build.SourcesDirectory):/onnxruntime_src \ --volume $(Build.BinariesDirectory)/Release:/build/Release \ --volume /data/models:/build/models:ro \ From 026e3178ae71cfcc5cfa2decde9a7d64b935d255 Mon Sep 17 00:00:00 2001 From: pengwa Date: Wed, 28 Feb 2024 15:57:05 +0800 Subject: [PATCH 66/68] Improve memory matrix for ORTModule (#19620) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Memory matrix for ORTModule Collect parameter/gradient/buffers sizes also. Exposed as a function, can be used externally for debugging purpose. ``` 2024-02-27 07:18:55,283 orttraining.rank-0 [INFO] - rank-0 step 1 memory (MiB) | phase: pre_forward | allocated: 5331 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 219 | max inactive: 816 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:55,322 orttraining.rank-0 [INFO] - rank-0 step 1 memory (MiB) | phase: post_forward | allocated: 8162 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 400 | max inactive: 816 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:55,358 orttraining.rank-0 [INFO] - rank-0 step 1 memory (MiB) | phase: pre_backward | allocated: 8926 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 400 | max inactive: 816 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:55,438 orttraining.rank-0 [INFO] - rank-0 step 1 memory (MiB) | phase: post_backward | allocated: 6098 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 218 | max inactive: 831 | param: 5314 | grad: 12 | buffer: 8 0%|▏ | 2/3200 [01:27<32:05:11, 36.12s/it]2024-02-27 07:18:55,498 orttraining.rank-0 [INFO] - rank-0 step 2 memory (MiB) | phase: pre_forward | allocated: 5331 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 219 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:55,537 orttraining.rank-0 [INFO] - rank-0 step 2 memory (MiB) | phase: post_forward | allocated: 8162 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 400 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:55,576 orttraining.rank-0 [INFO] - rank-0 step 2 memory (MiB) | phase: pre_backward | allocated: 8926 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 400 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:55,657 orttraining.rank-0 [INFO] - rank-0 step 2 memory (MiB) | phase: post_backward | allocated: 6098 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 218 | max inactive: 831 | param: 5314 | grad: 12 | buffer: 8 0%|▏ | 3/3200 [01:27<17:30:57, 19.72s/it]2024-02-27 07:18:55,711 orttraining.rank-0 [INFO] - rank-0 step 3 memory (MiB) | phase: pre_forward | allocated: 5331 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 219 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:55,750 orttraining.rank-0 [INFO] - rank-0 step 3 memory (MiB) | phase: post_forward | allocated: 8162 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 400 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:55,786 orttraining.rank-0 [INFO] - rank-0 step 3 memory (MiB) | phase: pre_backward | allocated: 8926 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 400 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:55,867 orttraining.rank-0 [INFO] - rank-0 step 3 memory (MiB) | phase: post_backward | allocated: 6098 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 218 | max inactive: 831 | param: 5314 | grad: 12 | buffer: 8 [2024-02-27 07:18:55,886] [INFO] [loss_scaler.py:190:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 0%|▎ | 4/3200 [01:28<10:39:52, 12.01s/it]2024-02-27 07:18:55,902 orttraining.rank-0 [INFO] - rank-0 step 4 memory (MiB) | phase: pre_forward | allocated: 5331 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 219 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:55,944 orttraining.rank-0 [INFO] - rank-0 step 4 memory (MiB) | phase: post_forward | allocated: 8162 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 400 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:55,979 orttraining.rank-0 [INFO] - rank-0 step 4 memory (MiB) | phase: pre_backward | allocated: 8926 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 400 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:56,060 orttraining.rank-0 [INFO] - rank-0 step 4 memory (MiB) | phase: post_backward | allocated: 6098 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 218 | max inactive: 831 | param: 5314 | grad: 12 | buffer: 8 0%|▍ | 5/3200 [01:28<6:53:04, 7.76s/it]2024-02-27 07:18:56,115 orttraining.rank-0 [INFO] - rank-0 step 5 memory (MiB) | phase: pre_forward | allocated: 5331 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 219 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:56,154 orttraining.rank-0 [INFO] - rank-0 step 5 memory (MiB) | phase: post_forward | allocated: 8162 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 400 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:56,190 orttraining.rank-0 [INFO] - rank-0 step 5 memory (MiB) | phase: pre_backward | allocated: 8926 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 400 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:56,270 orttraining.rank-0 [INFO] - rank-0 step 5 memory (MiB) | phase: post_backward | allocated: 6098 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 218 | max inactive: 831 | param: 5314 | grad: 12 | buffer: 8 0%|▍ | 6/3200 [01:28<4:36:19, 5.19s/it]2024-02-27 07:18:56,323 orttraining.rank-0 [INFO] - rank-0 step 6 memory (MiB) | phase: pre_forward | allocated: 5331 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 219 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:56,365 orttraining.rank-0 [INFO] - rank-0 step 6 memory (MiB) | phase: post_forward | allocated: 8162 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 400 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:56,398 orttraining.rank-0 [INFO] - rank-0 step 6 memory (MiB) | phase: pre_backward | allocated: 8926 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 400 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:56,478 orttraining.rank-0 [INFO] - rank-0 step 6 memory (MiB) | phase: post_backward | allocated: 6098 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 218 | max inactive: 831 | param: 5314 | grad: 12 | buffer: 8 0%|▌ | 7/3200 [01:28<3:09:33, 3.56s/it]2024-02-27 07:18:56,533 orttraining.rank-0 [INFO] - rank-0 step 7 memory (MiB) | phase: pre_forward | allocated: 5331 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 219 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:56,572 orttraining.rank-0 [INFO] - rank-0 step 7 memory (MiB) | phase: post_forward | allocated: 8162 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 400 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:56,608 orttraining.rank-0 [INFO] - rank-0 step 7 memory (MiB) | phase: pre_backward | allocated: 8926 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 400 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:56,727 orttraining.rank-0 [INFO] - rank-0 step 7 memory (MiB) | phase: post_backward | allocated: 6098 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 218 | max inactive: 831 | param: 5314 | grad: 12 | buffer: 8 0%|▌ | 8/3200 [01:28<2:13:48, 2.52s/it]2024-02-27 07:18:56,806 orttraining.rank-0 [INFO] - rank-0 step 8 memory (MiB) | phase: pre_forward | allocated: 5331 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 219 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:56,846 orttraining.rank-0 [INFO] - rank-0 step 8 memory (MiB) | phase: post_forward | allocated: 8162 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 400 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:56,882 orttraining.rank-0 [INFO] - rank-0 step 8 memory (MiB) | phase: pre_backward | allocated: 8926 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 400 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:56,962 orttraining.rank-0 [INFO] - rank-0 step 8 memory (MiB) | phase: post_backward | allocated: 6098 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 218 | max inactive: 831 | param: 5314 | grad: 12 | buffer: 8 0%|▋ | 9/3200 [01:29<1:36:03, 1.81s/it]2024-02-27 07:18:57,053 orttraining.rank-0 [INFO] - rank-0 step 9 memory (MiB) | phase: pre_forward | allocated: 5331 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 219 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 2024-02-27 07:18:57,094 orttraining.rank-0 [INFO] - rank-0 step 9 memory (MiB) | phase: post_forward | allocated: 8162 | max allocated: 9039 | cached: 9382 | max cached: 9382 | inactive: 400 | max inactive: 831 | param: 5314 | grad: 0 | buffer: 8 ``` --- .../training/ortmodule/_runtime_inspector.py | 37 +++------ .../python/training/utils/__init__.py | 2 + .../training/utils/torch_profile_utils.py | 76 +++++++++++++++++++ 3 files changed, 88 insertions(+), 27 deletions(-) diff --git a/orttraining/orttraining/python/training/ortmodule/_runtime_inspector.py b/orttraining/orttraining/python/training/ortmodule/_runtime_inspector.py index 078ce4d27cd6f..772b9bd9e31ae 100644 --- a/orttraining/orttraining/python/training/ortmodule/_runtime_inspector.py +++ b/orttraining/orttraining/python/training/ortmodule/_runtime_inspector.py @@ -14,7 +14,7 @@ from sympy import Symbol, simplify from sympy.parsing.sympy_parser import parse_expr -from onnxruntime.training.utils import PTable +from onnxruntime.training.utils import PTable, log_memory_usage from ._execution_agent import TrainingAgent from .options import _MemoryOptimizationLevel, _RuntimeOptions @@ -509,6 +509,8 @@ def __init__(self, m: torch.nn.Module, logger: Logger): self._is_first_inspect = True + self._m = m + def is_enabled(self) -> bool: """Check if memory inspector is enabled.""" return self._is_enabled @@ -621,29 +623,13 @@ def inspect_memory(self, cur_phase: Phase): need_print = self._current_step < 10 or (self._current_step & (self._current_step - 1) == 0) if need_print: - cur_mem_allocated = self._normalize(torch.cuda.memory_allocated()) - max_mem_allocated = self._normalize(torch.cuda.max_memory_allocated()) - cur_mem_cached = self._normalize(torch.cuda.memory_reserved()) - max_mem_cached = self._normalize(torch.cuda.max_memory_reserved()) - torch_mem_stat = torch.cuda.memory_stats() - cur_mem_inactive = self._normalize(torch_mem_stat.get("inactive_split_bytes.all.current", 0)) - max_mem_inactive = self._normalize(torch_mem_stat.get("inactive_split_bytes.all.peak", 0)) - - mem_stats = [ - ["phase", _convert_phase_to_string(cur_phase)], - ["allocated", cur_mem_allocated], # current memory allocated for tensors - ["max allocated", max_mem_allocated], # peak memory allocated for tensors - ["cached", cur_mem_cached], # current memory cached for the caching allocator - ["max cached", max_mem_cached], # peak memory cached for caching allocator. - ["inactive", cur_mem_inactive], # amount of inactive, non-releasable memory - ["max inactive", max_mem_inactive], # peak of inactive, non-releasable memory - ] - - summ = f"{self._rank_info} step {self._current_step} memory ({MemoryObserver.NORMALIZER_UNIT})" - for stat in mem_stats: - summ += f" | {stat[0]}: {stat[1]}" - - self._logger.info(summ) + log_memory_usage( + _convert_phase_to_string(cur_phase), + rank_0_only=True, + step_info=f"step {self._current_step}", + logger=self._logger, + module=self._m, + ) if cur_phase == self._last_phase: self._increase_step() @@ -655,9 +641,6 @@ def inspect_memory(self, cur_phase: Phase): def _increase_step(self): self._current_step += 1 - def _normalize(self, mem_size_in_bytes: Union[float, int]) -> str: - return f"{float(mem_size_in_bytes) / MemoryObserver.NORMALIZER_FACTOR:.0f}" - def display_memory_optimization_plans(self, memory_optimizer_config, details=False) -> Tuple[List[str], PTable]: mem_plan_count = len(self.cluster_id_combination_to_saving_symbolics_map) diff --git a/orttraining/orttraining/python/training/utils/__init__.py b/orttraining/orttraining/python/training/utils/__init__.py index b4a518d573998..ecfb7d7907f3c 100644 --- a/orttraining/orttraining/python/training/utils/__init__.py +++ b/orttraining/orttraining/python/training/utils/__init__.py @@ -12,6 +12,7 @@ unflatten_data_using_schema, ) from onnxruntime.training.utils.torch_profile_utils import ( + log_memory_usage, nvtx_function_decorator, torch_nvtx_range_pop, torch_nvtx_range_push, @@ -31,6 +32,7 @@ "torch_nvtx_range_push", "torch_nvtx_range_pop", "nvtx_function_decorator", + "log_memory_usage", "pytorch_type_to_onnx_dtype", "onnx_dtype_to_pytorch_dtype", "pytorch_scalar_type_to_pytorch_dtype", diff --git a/orttraining/orttraining/python/training/utils/torch_profile_utils.py b/orttraining/orttraining/python/training/utils/torch_profile_utils.py index 382d7dac142fe..9e8a41e0dc7c8 100644 --- a/orttraining/orttraining/python/training/utils/torch_profile_utils.py +++ b/orttraining/orttraining/python/training/utils/torch_profile_utils.py @@ -3,6 +3,8 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- +from __future__ import annotations + import torch @@ -26,3 +28,77 @@ def wrapped_fn(*args, **kwargs): return ret_val return wrapped_fn + + +def log_memory_usage(cur_phase: str, rank_0_only=True, step_info="", logger=None, module=None): + """Log memory usage for the current phase. + Args: + cur_phase (str): The current phase. + rank_0_only (bool, optional): Only log the memory usage for rank 0. Defaults to True. + step_info (str, optional): The step information. Defaults to "". + logger (logging.Logger, optional): The logger to log the memory usage. Defaults to None, which means print to stdout. + module (torch.nn.Module, optional): The module to get parameter, buffer and grad sizes. Defaults to None. + """ + rank = 0 + if rank_0_only is True: + if torch.distributed.is_initialized(): + rank = torch.distributed.get_rank() + if rank != 0: + return + + _normalizer_factor = float(1024 * 1024) + _normalizer_unit = "MiB" + + def _normalize(mem_size_in_bytes: float | int) -> str: + return f"{float(mem_size_in_bytes) / _normalizer_factor:.0f}" + + cur_mem_allocated = _normalize(torch.cuda.memory_allocated()) + max_mem_allocated = _normalize(torch.cuda.max_memory_allocated()) + cur_mem_cached = _normalize(torch.cuda.memory_reserved()) + max_mem_cached = _normalize(torch.cuda.max_memory_reserved()) + torch_mem_stat = torch.cuda.memory_stats() + cur_mem_inactive = _normalize(torch_mem_stat.get("inactive_split_bytes.all.current", 0)) + max_mem_inactive = _normalize(torch_mem_stat.get("inactive_split_bytes.all.peak", 0)) + + mem_stats = [ + ["phase", cur_phase], + ["allocated", cur_mem_allocated], # current memory allocated for tensors + ["max allocated", max_mem_allocated], # peak memory allocated for tensors + ["cached", cur_mem_cached], # current memory cached for the caching allocator + ["max cached", max_mem_cached], # peak memory cached for caching allocator. + ["inactive", cur_mem_inactive], # amount of inactive, non-releasable memory + ["max inactive", max_mem_inactive], # peak of inactive, non-releasable memory + ] + + # Calculate the total size of parameters and gradients in the model + if module: + param_total_size = 0 + grad_total_size = 0 + for p in module.parameters(): + if p.is_cuda: + param_total_size += p.numel() * p.element_size() + if p.grad is not None and p.grad.is_cuda: + grad_total_size += p.grad.numel() * p.grad.element_size() + + # Calculate the total size of buffers in the model + buffer_total_size = 0 + for b in module.buffers(): + if b.is_cuda: + buffer_total_size += b.numel() * b.element_size() + + mem_stats.extend( + [ + ["param", _normalize(param_total_size)], + ["grad", _normalize(grad_total_size)], + ["buffer", _normalize(buffer_total_size)], + ] + ) + + summ = f"rank-{rank} {step_info} memory ({_normalizer_unit})" + for stat in mem_stats: + summ += f" | {stat[0]}: {stat[1]}" + + if logger is None: + print(summ) + else: + logger.info(summ) From 7a147fc6f76a30b8d5875352afced515431ec7e5 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Wed, 28 Feb 2024 02:20:53 -0800 Subject: [PATCH 67/68] Remove a bash task from webgpu CI pipeline (#19682) ### Description It is a "Bash" task that requires running bash on Windows. Most Windows operating systems do not have Bash installed. Given this task is only debugging purposes, we can remove it for now. ### Motivation and Context I am making this change because I am regenerating the VM image in a different manner, and the new image does not contain bash. Once this PR is in, I can switch the images. --- .../github/azure-pipelines/templates/win-web-ci.yml | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml index 8ba3517530edd..043da233cc674 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml @@ -155,12 +155,7 @@ jobs: path: $(Build.SourcesDirectory)/js/test/ cacheHitVar: CACHE_RESTORED displayName: 'Cache ONNX node test data' - - task: Bash@3 - inputs: - targetType: 'inline' - script: find "$(Build.SourcesDirectory)/js/test/" -type f - condition: and(not(canceled()), eq(variables.CACHE_RESTORED, 'true')) - displayName: 'List ONNX node test data' + - task: PowerShell@2 inputs: filePath: '$(Build.SourcesDirectory)\tools\ci_build\github\js\pack-npm-packages.ps1' From 913bdc7306e11b65644f733861684a3a460e8db0 Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Wed, 28 Feb 2024 08:30:12 -0800 Subject: [PATCH 68/68] [QNN Quant] Handle external data for QNN preprocessing/quant (#19670) ### Description - Adds parameters to `qnn_preprocess_model()` to allow saving the new model with external data. - Updates `get_qnn_qdq_config()` to: - Load model without external data (it is not needed) - Return a quantization configuration with `use_external_data_format` set to `True` if the model has external data or if the model is >= 2GB. ### Motivation and Context Update QNN quantization to better handle large models that use external data. --- .../execution_providers/qnn/preprocess.py | 51 +++++- .../execution_providers/qnn/quant_config.py | 15 +- .../quantization/test_qnn_preprocess_model.py | 170 ++++++++++++++++++ .../test_tensor_quant_overrides_option.py | 30 ++++ 4 files changed, 261 insertions(+), 5 deletions(-) create mode 100644 onnxruntime/test/python/quantization/test_qnn_preprocess_model.py diff --git a/onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py b/onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py index b1c114fe1f9fd..b0dab81830c8b 100644 --- a/onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py +++ b/onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py @@ -3,6 +3,8 @@ # Licensed under the MIT License. See License.txt in the project root for # license information. # -------------------------------------------------------------------------- +from __future__ import annotations + import logging from pathlib import Path @@ -13,7 +15,44 @@ from .fusion_lpnorm import FusionLpNormalization -def qnn_preprocess_model(model_input: Path, model_output: Path, fuse_layernorm: bool = False) -> bool: +def qnn_preprocess_model( + model_input: Path, + model_output: Path, + fuse_layernorm: bool = False, + save_as_external_data: bool = False, + all_tensors_to_one_file: bool = False, + external_data_location: str | None = None, + external_data_size_threshold: int = 1024, + external_data_convert_attribute: bool = False, +) -> bool: + """ + If necessary, this method creates a new "pre-processed" model in preparation for + quantization of a model to be used in QNN EP. Returns true if a new model was created. + + This method perfoms the following operations: + - Fuse Erf sequence into a single Gelu node. + - Fuse ReduceL2 sequence into a single LpNormalization node (p == 2). + - (Optional) Fuse ReduceMean sequence into a single LayerNormalization node. + + Args: + model_input: Path to the input model file. + model_output: Path the output model file, which is only created if this method returns True. + fuse_layernorm: True if ReduceMean sequences should be fused into LayerNormalization nodes. + Defaults to False. + save_as_external_data: True if output model should be saved with external data. Defaults to false. + all_tensors_to_one_file: Effective only if save_as_external_data is true. Defaults to false. + If true, save all tensors to one external file specified by external_data_location. + If false, save each tensor to a file named with the tensor name. + external_data_location: Effective only if save_as_external_data is true. Defaults to None. + Specify the external file to which all tensors are saved. Path is relative + to the model path. If not specified, the model's name is used. + external_data_size_threshold: Effective only if save_as_external_data is true. Defaults to 1024. + Tensors with a data size >= external_data_size_threshold are converted to external data. + To convert every tensor with raw data to external data, set to 0. + external_data_convert_attribute: Effective only if save_as_external_data is true. Defaults to false. + If true, convert all tensors to external data. + If false, convert only non-attribute tensors to external data. + """ modified = False model = onnx.load_model(model_input) onnx_model = ONNXModel(model) @@ -57,6 +96,14 @@ def qnn_preprocess_model(model_input: Path, model_output: Path, fuse_layernorm: if modified: onnx_model.topological_sort() - onnx.save_model(model, model_output) + onnx.save_model( + model, + model_output, + save_as_external_data=save_as_external_data, + all_tensors_to_one_file=all_tensors_to_one_file, + location=external_data_location, + size_threshold=external_data_size_threshold, + convert_attribute=external_data_convert_attribute, + ) return modified diff --git a/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py b/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py index 7c2fa4f65ae1b..e9affae7ac263 100644 --- a/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py +++ b/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py @@ -15,6 +15,7 @@ Q16_TYPES = {QuantType.QInt16, QuantType.QUInt16} Q8_TYPES = {QuantType.QInt8, QuantType.QUInt8} OP_TYPES_TO_EXCLUDE = {"Cast"} +MODEL_SIZE_THRESHOLD = 2147483648 # Quant model should use external data if >= 2GB def get_qnn_qdq_config( @@ -28,14 +29,21 @@ def get_qnn_qdq_config( if per_channel: raise ValueError("QNN EP does not yet support per-channel quantization.") - # Process model nodes to setup overrides. - model = onnx.load_model(model_input) + model = onnx.load_model(model_input, load_external_data=False) op_types = set() tensor_quant_overrides = {} + model_has_external_data = False + name_to_initializer = {} - name_to_initializer = {initializer.name: initializer for initializer in model.graph.initializer} + # Build map of initializers (name -> initializer) and + # check if the model has external data. + for initializer in model.graph.initializer: + name_to_initializer[initializer.name] = initializer + if onnx.external_data_helper.uses_external_data(initializer): + model_has_external_data = True + # Setup quantization overrides for specific operator types for node in model.graph.node: op_types.add(node.op_type) @@ -89,5 +97,6 @@ def get_qnn_qdq_config( activation_type=activation_type, weight_type=weight_type, op_types_to_quantize=list(op_types.difference(OP_TYPES_TO_EXCLUDE)), + use_external_data_format=(model_has_external_data or model.ByteSize() >= MODEL_SIZE_THRESHOLD), extra_options=extra_options, ) diff --git a/onnxruntime/test/python/quantization/test_qnn_preprocess_model.py b/onnxruntime/test/python/quantization/test_qnn_preprocess_model.py new file mode 100644 index 0000000000000..9b67fd41caac3 --- /dev/null +++ b/onnxruntime/test/python/quantization/test_qnn_preprocess_model.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. +# -------------------------------------------------------------------------- + +import math +import unittest +from pathlib import Path + +import numpy as np +import onnx + +from onnxruntime.quantization.execution_providers.qnn import qnn_preprocess_model +from onnxruntime.quantization.quant_utils import model_has_external_data, ms_domain + + +class TestQnnPreprocessModel(unittest.TestCase): + def build_model(self, shape, scale_val, bias_val): + """ + Build a model that supports 3 kinds of fusions: + - Erf sequence to Gelu + - ReduceL2 sequence to LpNormalization + - ReduceMean sequence to LayerNormalization + """ + root_inp = onnx.helper.make_tensor_value_info("root", onnx.TensorProto.FLOAT, shape) + output = onnx.helper.make_tensor_value_info("output", onnx.TensorProto.FLOAT, shape) + + # Erf sequence + one_const = onnx.numpy_helper.from_array(np.array(1.0, dtype=np.float32), "one_const") + half_const = onnx.numpy_helper.from_array(np.array(0.5, dtype=np.float32), "half_const") + root2_const = onnx.numpy_helper.from_array(np.array(math.sqrt(2.0), dtype=np.float32), "root2_const") + + e_mul0_node = onnx.helper.make_node("Mul", ["root", "half_const"], ["e_mul0_out"]) + e_div_node = onnx.helper.make_node("Div", ["root", "root2_const"], ["e_div_out"]) + e_erf_node = onnx.helper.make_node("Erf", ["e_div_out"], ["e_erf_out"]) + e_add_node = onnx.helper.make_node("Add", ["e_erf_out", "one_const"], ["e_add_out"]) + e_mul1_node = onnx.helper.make_node("Mul", ["e_add_out", "e_mul0_out"], ["erf_seq_output"]) + + # ReduceL2 sequence + axes_const = onnx.numpy_helper.from_array(np.array([-1], dtype=np.int64), "axes_const") + eps_const = onnx.numpy_helper.from_array(np.array(1e-12, dtype=np.float32), "eps_const") + shape_const = onnx.numpy_helper.from_array(np.array(list(shape), dtype=np.int64), "shape_const") + + l2_rl2_node = onnx.helper.make_node("ReduceL2", ["erf_seq_output", "axes_const"], ["l2_rl2_out"], keepdims=1) + l2_clip_node = onnx.helper.make_node("Clip", ["l2_rl2_out", "eps_const"], ["l2_clip_out"]) + l2_expand_node = onnx.helper.make_node("Expand", ["l2_clip_out", "shape_const"], ["l2_expand_out"]) + l2_div_node = onnx.helper.make_node("Div", ["erf_seq_output", "l2_expand_out"], ["l2_seq_output"]) + + # ReduceMean sequence + scale_const = onnx.numpy_helper.from_array(np.array(scale_val, dtype=np.float32), "scale_const") + bias_const = onnx.numpy_helper.from_array(np.array(bias_val, dtype=np.float32), "bias_const") + two_const = onnx.numpy_helper.from_array(np.array(2.0, dtype=np.float32), "two_const") + + m_rm0_node = onnx.helper.make_node("ReduceMean", ["l2_seq_output", "axes_const"], ["m_rm0_out"]) + m_sub_node = onnx.helper.make_node("Sub", ["l2_seq_output", "m_rm0_out"], ["m_sub_out"]) + m_pow_node = onnx.helper.make_node("Pow", ["m_sub_out", "two_const"], ["m_pow_out"]) + m_rm1_node = onnx.helper.make_node("ReduceMean", ["m_pow_out", "axes_const"], ["m_rm1_out"]) + m_add0_node = onnx.helper.make_node("Add", ["m_rm1_out", "eps_const"], ["m_add0_out"]) + m_sqrt_node = onnx.helper.make_node("Sqrt", ["m_add0_out"], ["m_sqrt_out"]) + m_div_node = onnx.helper.make_node("Div", ["m_sub_out", "m_sqrt_out"], ["m_div_out"]) + m_mul_node = onnx.helper.make_node("Mul", ["m_div_out", "scale_const"], ["m_mul_out"]) + m_add1_node = onnx.helper.make_node("Add", ["m_mul_out", "bias_const"], ["output"]) + + graph = onnx.helper.make_graph( + [ + e_mul0_node, + e_div_node, + e_erf_node, + e_add_node, + e_mul1_node, + l2_rl2_node, + l2_clip_node, + l2_expand_node, + l2_div_node, + m_rm0_node, + m_sub_node, + m_pow_node, + m_rm1_node, + m_add0_node, + m_sqrt_node, + m_div_node, + m_mul_node, + m_add1_node, + ], + "qnn_f32_model", + [root_inp], + [output], + initializer=[ + one_const, + half_const, + root2_const, + axes_const, + eps_const, + shape_const, + scale_const, + bias_const, + two_const, + ], + ) + opset_imports = [ + onnx.helper.make_opsetid("", 18), + ] + model = onnx.helper.make_model(graph, opset_imports=opset_imports) + return onnx.shape_inference.infer_shapes(model) + + def test_all_fusions(self): + """ + Test calling qnn_preprocess_model() with a model that supports all 3 fusions. + """ + model = self.build_model((1, 2, 3), [2.0, 2.0, 2.0], [1.0, 1.0, 1.0]) + onnx.save_model(model, "model.onnx") + modified = qnn_preprocess_model("model.onnx", "model.qnn_pp.onnx", fuse_layernorm=True) + + self.assertTrue(modified) + + fused_model = onnx.load_model("model.qnn_pp.onnx") + + # 3 fused Ops: Gelu, LpNorm, LayerNorm + self.assertEqual(len(fused_model.graph.node), 3) + expected_op_types = {"Gelu", "LpNormalization", "LayerNormalization"} + for node in fused_model.graph.node: + self.assertIn(node.op_type, expected_op_types) + + # Should have added "com.microsoft" opset import because we added a Gelu. + ms_domain_opset = next((opset for opset in fused_model.opset_import if opset.domain == ms_domain), None) + self.assertIsNotNone(ms_domain_opset) + self.assertEqual(ms_domain_opset.version, 1) + + def test_external_data(self): + """ + Test calling qnn_preprocess_model() with a model that uses external data. + The new preprocessed model should also have external data. + """ + model = self.build_model((1, 2, 3), [2.0, 2.0, 2.0], [1.0, 1.0, 1.0]) + onnx.save_model( + model, + "model.onnx", + save_as_external_data=True, + all_tensors_to_one_file=True, + location="weights.bin", + size_threshold=0, + ) + modified = qnn_preprocess_model( + "model.onnx", + "model.qnn_pp.onnx", + fuse_layernorm=True, + save_as_external_data=True, + all_tensors_to_one_file=True, + external_data_location="weights2.bin", + external_data_size_threshold=0, + ) + + self.assertTrue(modified) + + # Model should still have external data. + self.assertTrue(model_has_external_data(Path("model.qnn_pp.onnx"))) + + fused_model = onnx.load_model("model.qnn_pp.onnx", load_external_data=False) + + # 3 fused Ops: Gelu, LpNorm, LayerNorm + self.assertEqual(len(fused_model.graph.node), 3) + expected_op_types = {"Gelu", "LpNormalization", "LayerNormalization"} + for node in fused_model.graph.node: + self.assertIn(node.op_type, expected_op_types) + + +if __name__ == "__main__": + unittest.main() diff --git a/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py b/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py index 0470953e385b6..cbb6b3ae2e776 100644 --- a/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py +++ b/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py @@ -555,6 +555,36 @@ def test_get_qnn_qdq_config(self): self.assertEqual(sig_out_zp.data_type, onnx.TensorProto.UINT16) self.assertEqual(sig_out_sc.float_data[0], np.float32(1.0 / 65536.0)) + def test_get_qnn_qdq_config_ext_data(self): + """ + Test that get_qnn_qdq_config() returns a config that enables external data + if the input model has external data. + """ + + # Create model with a weight large enough (> 1024 bytes) to be stored externally. + large_weight = onnx.numpy_helper.from_array(np.random.random((1, 32, 32)).astype(np.float32), "weight") + graph = onnx.helper.make_graph( + [onnx.helper.make_node("Add", ["input", "weight"], ["output"])], + "add_ext_data", + [onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, (1, 32, 32))], + [onnx.helper.make_tensor_value_info("output", onnx.TensorProto.FLOAT, (1, 32, 32))], + initializer=[large_weight], + ) + model = onnx.helper.make_model( + graph, + opset_imports=[onnx.helper.make_opsetid("", 18)], + ) + onnx.save_model( + model, + "add_ext_data.onnx", + save_as_external_data=True, + all_tensors_to_one_file=True, + location="add_ext_data.bin", + ) + + qnn_config = get_qnn_qdq_config("add_ext_data.onnx", DummyDataReader(self.activations)) + self.assertTrue(qnn_config.use_external_data_format) + if __name__ == "__main__": t = TestTensorQuantOverridesOption()