Skip to content

Commit

Permalink
Place shape related compute nodes in CPU (#4940) (#5350)
Browse files Browse the repository at this point in the history
* Place shape related nodes in CPU
* visit candidates by topological order
* Make CPU node placement a utility function
* skip placing on CPU if the data type is float16 or bfloat16

Co-authored-by: Sherlock <[email protected]>
  • Loading branch information
ashbhandare and SherlockNoMad authored Oct 2, 2020
1 parent 5de47af commit 38e1bbc
Show file tree
Hide file tree
Showing 3 changed files with 184 additions and 56 deletions.
12 changes: 12 additions & 0 deletions include/onnxruntime/core/graph/graph_viewer.h
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,18 @@ class GraphViewer {
/** Get the Node containing this Graph if IsSubgraph is true. Returns nullptr otherwise. */
const Node* ParentNode() const noexcept { return graph_->ParentNode(); }

#if !defined(ORT_MINIMAL_BUILD)
/**
 * Look up the nodes that consume a node arg.
 * This simply forwards the query to the Graph wrapped by this viewer.
 * @param node_arg_name Name of the node arg whose consumers are requested.
 * @return The consumer nodes reported by the underlying Graph for that name.
 */
std::vector<const Node*> GetConsumerNodes(const std::string& node_arg_name) const {
  std::vector<const Node*> consumers = graph_->GetConsumerNodes(node_arg_name);
  return consumers;
}

/**
 * Look up the node that produces a node arg.
 * This simply forwards the query to the Graph wrapped by this viewer.
 * @param node_arg_name Name of the node arg whose producer is requested.
 * @return The producer node pointer reported by the underlying Graph for that name.
 */
const Node* GetProducerNode(const std::string& node_arg_name) const {
  const Node* producer = graph_->GetProducerNode(node_arg_name);
  return producer;
}
#endif

private:
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(GraphViewer);

Expand Down
153 changes: 153 additions & 0 deletions onnxruntime/core/framework/fallback_cpu_capability.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once
#include "core/graph/graph_viewer.h"
#include "onnx/defs/data_type_utils.h"
#include <queue>

using namespace ONNX_NAMESPACE::Utils;

namespace onnxruntime {

namespace {
// An initializer with at most this many elements is considered "small".
constexpr int64_t Small_Initializer_Threshold = 100;

/**
Checks whether a node arg is backed by a small initializer that has exactly one consumer.
@param graph Graph viewer used to look up the initializer and its consumers
@param arg Node arg to inspect; a null pointer is treated as "not a small initializer"
@return true if `arg` names an initializer whose element count is at most
        Small_Initializer_Threshold and which is consumed by exactly one node
*/
bool IsSmallInitializerWithSingleConsumer(const onnxruntime::GraphViewer& graph, const NodeArg* arg) {
  if (arg == nullptr)
    return false;

  const ONNX_NAMESPACE::TensorProto* initializer_tensor = nullptr;
  if (!graph.GetInitializedTensor(arg->Name(), initializer_tensor) || initializer_tensor == nullptr)
    return false;

  // Compute the element count without risking signed overflow: the dims come from the model,
  // so their raw product may not fit in int64_t. Once the running count exceeds the threshold
  // it is pinned there; only a later zero-sized dim (empty tensor) can change the verdict.
  int64_t size = 1;
  for (const int64_t dim : initializer_tensor->dims()) {
    if (dim < 0)
      return false;  // a negative dim is not a valid initializer shape; do not treat it as small
    if (dim == 0) {
      size = 0;  // empty tensor, regardless of the other dims
      break;
    }
    if (size <= Small_Initializer_Threshold) {
      // Here size is in [1, threshold]; multiply only when dim is too, so the product is bounded.
      size = (dim > Small_Initializer_Threshold) ? Small_Initializer_Threshold + 1 : size * dim;
    }
  }

  if (size > Small_Initializer_Threshold)
    return false;

  return graph.GetConsumerNodes(arg->Name()).size() == 1;
}
}  // namespace

/**
Returns the set of nodes that are preferred on CPU even though the target EP has a kernel for them.
They are commonly shape-related computation subgraphs.

The function is `inline` because it is defined in a header: without it, including this header
from more than one translation unit results in duplicate-symbol link errors.

@param graph Graph viewer
@param provider_type The target execution provider type
@param kernel_registries Kernel registries for the target EP
@param tentative_nodes Nodes that are tentatively placed on the target EP
@return Indices of the tentative nodes that should be placed on CPU instead
@remarks Throws (via ORT_ENFORCE / ORT_THROW_IF_ERROR) if a tentative node cannot be found in the graph,
         if no registry provides a kernel for it, or if iterating its outputs fails.
*/
inline std::unordered_set<NodeIndex> GetCpuPreferedNodes(const onnxruntime::GraphViewer& graph,
                                                         const std::string& provider_type,
                                                         const std::vector<const KernelRegistry*>& kernel_registries,
                                                         const std::vector<NodeIndex>& tentative_nodes) {
  // Record each node's position in the topological order so candidates can be visited in that order.
  const std::vector<NodeIndex>& ordered_nodes = graph.GetNodesInTopologicalOrder();
  std::vector<size_t> node_id_to_order_map(graph.MaxNodeIndex());
  for (size_t order = 0; order < ordered_nodes.size(); ++order) {
    node_id_to_order_map[ordered_nodes[order]] = order;
  }

  // std::priority_queue pops the "largest" element first; comparing with '>' therefore makes it
  // pop the node that comes earliest in topological order.
  auto greater_order_comp = [&](const NodeIndex n1, const NodeIndex n2) {
    return node_id_to_order_map[n1] > node_id_to_order_map[n2];
  };

  std::priority_queue<NodeIndex, std::vector<NodeIndex>, decltype(greater_order_comp)> candidates(greater_order_comp);
  std::unordered_set<NodeIndex> visited;

  std::unordered_set<const NodeArg*> cpu_output_args;
  // Kernel found for each tentative node. Its key set doubles as "the nodes tentatively on the target EP".
  std::unordered_map<NodeIndex, const KernelCreateInfo*> node_to_kernel;

  for (const NodeIndex node_id : tentative_nodes) {
    const Node* node = graph.GetNode(node_id);
    ORT_ENFORCE(node != nullptr);

    const KernelCreateInfo* kernel_info = nullptr;
    for (const KernelRegistry* registry : kernel_registries) {
      auto st = registry->TryFindKernel(*node, provider_type, &kernel_info);
      if (st.IsOK())
        break;
    }
    // at least one registry has a target provider's kernel for this node
    ORT_ENFORCE(kernel_info != nullptr);
    node_to_kernel.insert({node_id, kernel_info});

    // first, find all the direct consumers of cpu tensors.
    ORT_THROW_IF_ERROR(node->ForEachWithIndex(
        node->OutputDefs(),
        [&](const NodeArg& node_arg, size_t out_index) {
          if (kernel_info->kernel_def->IsOutputOnCpu(out_index)) {
            cpu_output_args.insert(&node_arg);
            auto consumer_nodes = graph.GetConsumerNodes(node_arg.Name());
            for (const Node* consumer_node : consumer_nodes) {
              candidates.push(consumer_node->Index());
              LOGS_DEFAULT(INFO) << "Canditiate for fallback CPU execution: " << consumer_node->Name();
            }
          }
          return Status::OK();
        }));
  }

  // Hash the graph inputs once so the per-input membership test below is O(1).
  const std::vector<const NodeArg*>& graph_input_defs = graph.GetInputs();
  const std::unordered_set<const NodeArg*> graph_inputs(graph_input_defs.begin(), graph_input_defs.end());

  // The type lookups do not depend on the node being inspected, so resolve them once up front.
  const auto float16_type = ONNX_NAMESPACE::Utils::DataTypeUtils::ToType("float16");
  const auto bfloat16_type = ONNX_NAMESPACE::Utils::DataTypeUtils::ToType("bfloat16");

  std::unordered_set<NodeIndex> cpu_nodes;
  // The algorithm below tries to identify a subgraph that only depends on cpu tensors.
  // Usually it is a subgraph doing shape calculation based on a GPU tensor, then reshaping it back.
  // The detail:
  // for each candidate, if all of its inputs are cpu tensors (or small initializers / graph inputs)
  // and the non-CPU kernel doesn't mark them as cpu inputs,
  // force the node to CPU to avoid memory copies and add its outputs to the cpu tensors.
  while (!candidates.empty()) {
    const NodeIndex cur = candidates.top();
    candidates.pop();
    if (!visited.insert(cur).second)
      continue;  // already processed

    // Only nodes tentatively assigned to the target EP can be moved to CPU.
    const auto kernel_it = node_to_kernel.find(cur);
    if (kernel_it == node_to_kernel.end())
      continue;
    const KernelCreateInfo* cur_kernel_info = kernel_it->second;

    // Non-null: every key of node_to_kernel was resolved and checked in the loop above.
    const Node* node = graph.GetNode(cur);
    const auto& input_defs = node->InputDefs();
    bool place_in_cpu = true;
    for (size_t i = 0; i < input_defs.size(); ++i) {
      const NodeArg* input = input_defs[i];

      // skip placing on CPU if the data type is float16 or bfloat16
      if (input->Type() == float16_type || input->Type() == bfloat16_type) {
        place_in_cpu = false;
        break;
      }

      // allow placing on CPU if it's a small initializer or graph input
      if (IsSmallInitializerWithSingleConsumer(graph, input) ||
          graph_inputs.find(input) != graph_inputs.end()) {
        continue;
      }

      // the input is not a CPU tensor
      if (cpu_output_args.find(input) == cpu_output_args.end()) {
        place_in_cpu = false;
        break;
      }

      // input is a CPU tensor, but it's intended to be consumed as CPU input by the target EP
      if (cur_kernel_info->kernel_def->IsInputOnCpu(i)) {
        place_in_cpu = false;
        break;
      }
    }

    if (place_in_cpu) {
      cpu_nodes.insert(cur);
      LOGS_DEFAULT(WARNING) << "Force fallback to CPU execution for node: " << node->Name();
      // Outputs of a CPU node are CPU tensors, so its downstream nodes become candidates too.
      for (auto* output : node->OutputDefs()) {
        cpu_output_args.insert(output);
      }
      for (auto it = node->OutputNodesBegin(); it != node->OutputNodesEnd(); ++it) {
        candidates.push((*it).Index());
      }
    }
  }

  return cpu_nodes;
}

} // namespace onnxruntime
75 changes: 19 additions & 56 deletions onnxruntime/core/providers/cuda/cuda_execution_provider.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "cuda_allocator.h"
#include "core/framework/kernel_registry.h"
#include "core/framework/compute_capability.h"
#include "core/framework/fallback_cpu_capability.h"
#include "core/framework/memcpy.h"
#include "core/graph/graph_utils.h"
#include "core/providers/cuda/gpu_data_transfer.h"
Expand Down Expand Up @@ -1822,9 +1823,7 @@ std::unique_ptr<onnxruntime::IDataTransfer> CUDAExecutionProvider::GetDataTransf
std::vector<std::unique_ptr<ComputeCapability>>
CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
const std::vector<const KernelRegistry*>& kernel_registries) const {
std::vector<std::unique_ptr<ComputeCapability>> result;
std::unordered_set<const NodeArg*> defs_outside_cuda;

std::vector<NodeIndex> candidates;
for (auto& node_index : graph.GetNodesInTopologicalOrder()) {
const auto* p_node = graph.GetNode(node_index);
if (p_node == nullptr)
Expand All @@ -1833,7 +1832,6 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
const auto& node = *p_node;
const KernelCreateInfo* cuda_kernel_def = nullptr;
if (!node.GetExecutionProviderType().empty()) {
defs_outside_cuda.insert(node.OutputDefs().cbegin(), node.OutputDefs().cend());
continue;
}

Expand All @@ -1847,14 +1845,10 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,

// none of the provided registries has a CUDA kernel for this node
if (cuda_kernel_def == nullptr) {
// node is not in cuda exeuction provider if no kernel def found,
// or if other execution provider already assigned to it
defs_outside_cuda.insert(node.OutputDefs().cbegin(), node.OutputDefs().cend());
continue;
}

bool not_supported = false;
bool force_outside = false;
bool force_inside = false; // for some compute heavy ops, we'll force it to run inside CUDA
if ("LSTM" == node.OpType()) {
// the supported activations covers the bidirectional mode
Expand All @@ -1877,60 +1871,29 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
// cast is not compute heavy, and may be placed outside
}

//Below rule only works for inference, for training, we can't do constant folding.
//We need find a better solution.
//Temporary disable the check here, the cost is all the cast will be on GPU now.
#ifndef ENABLE_TRAINING
if (!not_supported && !force_inside) {
// Note that nodes with only inputs from initializer would not be place on CUDA
// Ideally, those nodes should be eliminated in constant folding
bool should_force_outside = true;
bool all_inputs_are_initializers = true;
ORT_THROW_IF_ERROR(node.ForEachWithIndex(node.InputDefs(),
[&](const NodeArg& def, size_t index) {
// The input is not a initializer and the input is from CPU
// or the input declared as CPU memory and is from CPU
// in that case we should still keep the node on CUDA
bool initializer_input = graph.IsConstantInitializer(def.Name(), /*check_outer_scope*/ true);
bool input_is_on_cpu = defs_outside_cuda.count(&def) > 0;
if ((!initializer_input && !input_is_on_cpu) ||
(input_is_on_cpu && cuda_kernel_def->kernel_def->IsInputOnCpu(index))) {
should_force_outside = false;
}

if (!initializer_input) {
all_inputs_are_initializers = false;
}
return Status::OK();
}));

// If all the inputs are initializers, we shouldn't force it to CPU
if (should_force_outside && !all_inputs_are_initializers) {
force_outside = true;
}
}
#endif
if (!force_inside && (not_supported || force_outside)) {
defs_outside_cuda.insert(node.OutputDefs().cbegin(), node.OutputDefs().cend());
if (!force_inside && not_supported) {
if (not_supported) {
LOGS_DEFAULT(WARNING) << "CUDA kernel not supported. Fallback to CPU execution provider for Op type: " << node.OpType() << " node name: " << node.Name();
} else if (force_outside) {
LOGS_DEFAULT(INFO) << "Force fallback to CPU execution provider for Op type: " << node.OpType() << " node name: " << node.Name();
}
} else {
// for nodes placed on CUDA, check if its output is on CPU
ORT_THROW_IF_ERROR(node.ForEachWithIndex(
node.OutputDefs(),
[&](const NodeArg& def, size_t out_index) {
if (cuda_kernel_def->kernel_def->OutputMemoryType(out_index) != OrtMemTypeDefault)
defs_outside_cuda.insert(&def);
return Status::OK();
}));
std::unique_ptr<IndexedSubGraph> sub_graph = onnxruntime::make_unique<IndexedSubGraph>();
sub_graph->nodes.push_back(node.Index());
result.push_back(onnxruntime::make_unique<ComputeCapability>(std::move(sub_graph)));
candidates.push_back(node.Index());
}
}

// For CUDA EP, exclude the subgraph that is preferred to be placed in CPU
// These are usually shape related computation subgraphs
// Following logic can be extended for other EPs
std::unordered_set<NodeIndex> cpu_nodes = GetCpuPreferedNodes(graph, Type(), kernel_registries, candidates);

std::vector<std::unique_ptr<ComputeCapability>> result;
for (auto& node_index : candidates) {
if (cpu_nodes.count(node_index) > 0)
continue;

std::unique_ptr<IndexedSubGraph> sub_graph = onnxruntime::make_unique<IndexedSubGraph>();
sub_graph->nodes.push_back(node_index);
result.push_back(onnxruntime::make_unique<ComputeCapability>(std::move(sub_graph)));
}
return result;
}

Expand Down

0 comments on commit 38e1bbc

Please sign in to comment.