pre-alloc remote memory; rebase from vpux/2021/3
luo-cheng2021 committed Apr 15, 2021
1 parent 6f860dd commit 4cdb4bb
Showing 9 changed files with 242 additions and 3 deletions.
18 changes: 16 additions & 2 deletions inference-engine/samples/benchmark_app/CMakeLists.txt
@@ -5,9 +5,23 @@
file (GLOB SRC ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
file (GLOB HDR ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp)

find_package(HddlUnite QUIET)
if(HddlUnite_FOUND)
# check for the kmb_plugin include directory
if(EXISTS ${KMBPLUGIN_HOME}/include/)
set(HDDL2_PARAMS_DIR ${KMBPLUGIN_HOME}/include/)
add_definitions(-DUSE_REMOTE_MEM)
set(HDDL2_DEP "HddlUnite::HddlUnite")
else()
message(WARNING "kmb_plugin/include could not be found. The remote device memory preallocation feature is disabled.")
remove_definitions(-DUSE_REMOTE_MEM)
endif()
endif()

ie_add_sample(NAME benchmark_app
SOURCES ${SRC}
HEADERS ${HDR}
DEPENDENCIES format_reader
OPENCV_DEPENDENCIES imgcodecs)
DEPENDENCIES format_reader ${HDDL2_DEP}
OPENCV_DEPENDENCIES imgcodecs
INCLUDE_DIRECTORIES ${HDDL2_PARAMS_DIR})
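Note that KMBPLUGIN_HOME is consumed above but never set in this diff, so it is presumably supplied when configuring the samples. A minimal sketch of such a configure step, with a hypothetical checkout path:

    cmake -DKMBPLUGIN_HOME=/opt/kmb_plugin <path-to-samples-source>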

1 change: 1 addition & 0 deletions inference-engine/samples/benchmark_app/README.md
@@ -93,6 +93,7 @@ Options:
-progress Optional. Show progress bar (can affect performance measurement). Default value is "false".
-shape Optional. Set shape for input. For example, "input1[1,3,224,224],input2[1,4]" or "[1,3,224,224]" in case of one input size.
-layout Optional. Prompts how network layouts should be treated by application. For example, "input1[NCHW],input2[NC]" or "[NCHW]" in case of one input size.
-use_remote_mem Optional. Preallocate remote memory on the device to execute infer requests.
CPU-specific performance options:
-nstreams "<integer>" Optional. Number of streams to use for inference on the CPU, GPU or MYRIAD devices
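With the new flag, the sample would be invoked along these lines (hypothetical model path; a VPUX device name is required, as enforced in main.cpp below):

    ./benchmark_app -m model.blob -d VPUX -use_remote_mem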
13 changes: 13 additions & 0 deletions inference-engine/samples/benchmark_app/benchmark_app.hpp
@@ -99,6 +99,11 @@ static const char load_config_message[] = "Optional. Path to XML/YAML/JSON file
static const char dump_config_message[] = "Optional. Path to XML/YAML/JSON file to dump IE parameters, which were set by application.";
#endif

#ifdef USE_REMOTE_MEM
// @brief message for the remote memory preallocation option
static const char use_remote_mem_message[] = "Optional. Preallocate remote memory on the device to execute infer requests.";
#endif

static const char shape_message[] = "Optional. Set shape for input. For example, \"input1[1,3,224,224],input2[1,4]\" or \"[1,3,224,224]\""
" in case of one input size.";

@@ -202,6 +207,11 @@ DEFINE_string(load_config, "", load_config_message);
DEFINE_string(dump_config, "", dump_config_message);
#endif

#ifdef USE_REMOTE_MEM
/// @brief Define flag for the remote memory preallocation option <br>
DEFINE_bool(use_remote_mem, false, use_remote_mem_message);
#endif

/// @brief Define flag for input shape <br>
DEFINE_string(shape, "", shape_message);

@@ -247,6 +257,9 @@ static void showUsage() {
std::cout << " -progress " << progress_message << std::endl;
std::cout << " -shape " << shape_message << std::endl;
std::cout << " -layout " << layout_message << std::endl;
#ifdef USE_REMOTE_MEM
std::cout << " -use_remote_mem " << use_remote_mem_message << std::endl;
#endif
std::cout << std::endl << " device-specific performance options:" << std::endl;
std::cout << " -nstreams \"<integer>\" " << infer_num_streams_message << std::endl;
std::cout << " -nthreads \"<integer>\" " << infer_num_threads_message << std::endl;
4 changes: 4 additions & 0 deletions inference-engine/samples/benchmark_app/infer_request_wrap.hpp
@@ -65,6 +65,10 @@ class InferReqWrap final {
return _request.GetBlob(name);
}

void setBlob(const std::string& name, const InferenceEngine::Blob::Ptr& data) {
_request.SetBlob(name, data);
}

double getExecutionTimeInMilliseconds() const {
auto execTime = std::chrono::duration_cast<ns>(_endTime - _startTime);
return static_cast<double>(execTime.count()) * 0.000001;
20 changes: 20 additions & 0 deletions inference-engine/samples/benchmark_app/inputs_filling.cpp
@@ -12,6 +12,7 @@
#include <samples/slog.hpp>

#include "inputs_filling.hpp"
#include "../../src/plugin_api/blob_factory.hpp"

using namespace InferenceEngine;

@@ -188,10 +189,19 @@ void fillBlobImInfo(Blob::Ptr& inputBlob,
}
}

#ifdef USE_REMOTE_MEM
void fillBlobs(RemoteContextHelper& remoteContextHelper,
const std::vector<std::string>& inputFiles,
const size_t& batchSize,
benchmark_app::InputsInfo& app_inputs_info,
std::vector<InferReqWrap::Ptr> requests,
bool preallocImage) {
#else
void fillBlobs(const std::vector<std::string>& inputFiles,
const size_t& batchSize,
benchmark_app::InputsInfo& app_inputs_info,
std::vector<InferReqWrap::Ptr> requests) {
#endif
std::vector<std::pair<size_t, size_t>> input_image_sizes;
for (auto& item : app_inputs_info) {
if (item.second.isImage()) {
@@ -270,6 +280,11 @@ void fillBlobs(const std::vector<std::string>& inputFiles,
if (!imageFiles.empty()) {
// Fill with Images
fillBlobImage(inputBlob, imageFiles, batchSize, app_info, requestId, imageInputId++, imageInputCount);
#ifdef USE_REMOTE_MEM
if (preallocImage) {
remoteContextHelper.PreallocRemoteMem(requests.at(requestId), item.first, inputBlob);
}
#endif
continue;
}
} else {
@@ -333,6 +348,11 @@ void fillBlobs(const std::vector<std::string>& inputFiles,
} else {
THROW_IE_EXCEPTION << "Input precision is not supported for " << item.first;
}
#ifdef USE_REMOTE_MEM
if (preallocImage) {
remoteContextHelper.PreallocRemoteMem(requests.at(requestId), item.first, inputBlob);
}
#endif
}
}
}
10 changes: 10 additions & 0 deletions inference-engine/samples/benchmark_app/inputs_filling.hpp
@@ -12,7 +12,17 @@
#include "utils.hpp"
#include "infer_request_wrap.hpp"

#include "remotecontext_helper.hpp"

#ifdef USE_REMOTE_MEM
void fillBlobs(RemoteContextHelper& remoteContextHelper,
const std::vector<std::string>& inputFiles,
const size_t& batchSize,
benchmark_app::InputsInfo& app_inputs_info,
std::vector<InferReqWrap::Ptr> requests, bool preallocImage);
#else
void fillBlobs(const std::vector<std::string>& inputFiles,
const size_t& batchSize,
benchmark_app::InputsInfo& app_inputs_info,
std::vector<InferReqWrap::Ptr> requests);
#endif
42 changes: 41 additions & 1 deletion inference-engine/samples/benchmark_app/main.cpp
@@ -24,6 +24,7 @@
#include "statistics_report.hpp"
#include "inputs_filling.hpp"
#include "utils.hpp"
#include "remotecontext_helper.hpp"

using namespace InferenceEngine;

@@ -75,6 +76,11 @@ bool ParseAndCheckCommandLine(int argc, char *argv[]) {

throw std::logic_error(err);
}
#ifdef USE_REMOTE_MEM
if (FLAGS_use_remote_mem && FLAGS_d != "VPUX") {
throw std::logic_error("Incorrect device name. Using remote memory feature must set device name to VPUX.");
}
#endif
return true;
}

@@ -117,8 +123,13 @@ T getMedianValue(const std::vector<T> &vec) {
int main(int argc, char *argv[]) {
std::shared_ptr<StatisticsReport> statistics;
try {
Core ie;
ExecutableNetwork exeNetwork;

#ifdef USE_REMOTE_MEM
RemoteContextHelper remoteContextHelper;
#endif

// ----------------- 1. Parsing and validating input arguments -------------------------------------------------
next_step();

@@ -170,7 +181,6 @@ int main(int argc, char *argv[]) {
// ----------------- 2. Loading the Inference Engine -----------------------------------------------------------
next_step();

Core ie;
if (FLAGS_d.find("CPU") != std::string::npos && !FLAGS_l.empty()) {
// CPU (MKLDNN) extensions is loaded as a shared library and passed as a pointer to base extension
const auto extension_ptr = InferenceEngine::make_so_pointer<InferenceEngine::IExtension>(FLAGS_l);
@@ -185,6 +195,11 @@
config["GPU"] = {};
config["GPU"][CONFIG_KEY(CONFIG_FILE)] = FLAGS_c;
}
#ifdef USE_REMOTE_MEM
if (FLAGS_d.find("VPUX") != std::string::npos) {
remoteContextHelper.Init(ie);
}
#endif
if (config.count("GPU") && config.at("GPU").count(CONFIG_KEY(CONFIG_FILE))) {
auto ext = config.at("GPU").at(CONFIG_KEY(CONFIG_FILE));
ie.SetConfig({{ CONFIG_KEY(CONFIG_FILE), ext }}, "GPU");
@@ -395,7 +410,15 @@ int main(int argc, char *argv[]) {
// ----------------- 7. Loading the model to the device --------------------------------------------------------
next_step();
startTime = Time::now();
#ifdef USE_REMOTE_MEM
if (!FLAGS_use_remote_mem) {
exeNetwork = ie.LoadNetwork(cnnNetwork, device_name);
} else {
exeNetwork = ie.LoadNetwork(cnnNetwork, remoteContextHelper.getRemoteContext());
}
#else
exeNetwork = ie.LoadNetwork(cnnNetwork, device_name);
#endif
duration_ms = double_to_string(get_total_ms_time(startTime));
slog::info << "Load network took " << duration_ms << " ms" << slog::endl;
if (statistics)
@@ -413,7 +436,20 @@
// ----------------- 7. Loading the model to the device --------------------------------------------------------
next_step();
auto startTime = Time::now();
#ifdef USE_REMOTE_MEM
if (!FLAGS_use_remote_mem) {
exeNetwork = ie.ImportNetwork(FLAGS_m, device_name, {});
} else {
std::filebuf blobFile;
if (!blobFile.open(FLAGS_m, std::ios::in | std::ios::binary)) {
THROW_IE_EXCEPTION << "Could not open file: " << FLAGS_m;
}
std::istream graphBlob(&blobFile);
exeNetwork = ie.ImportNetwork(graphBlob, remoteContextHelper.getRemoteContext());
}
#else
exeNetwork = ie.ImportNetwork(FLAGS_m, device_name, {});
#endif
auto duration_ms = double_to_string(get_total_ms_time(startTime));
slog::info << "Import network took " << duration_ms << " ms" << slog::endl;
if (statistics)
@@ -500,7 +536,11 @@ int main(int argc, char *argv[]) {
next_step();

InferRequestsQueue inferRequestsQueue(exeNetwork, nireq);
#ifdef USE_REMOTE_MEM
fillBlobs(remoteContextHelper, inputFiles, batchSize, app_inputs_info, inferRequestsQueue.requests, FLAGS_use_remote_mem);
#else
fillBlobs(inputFiles, batchSize, app_inputs_info, inferRequestsQueue.requests);
#endif

// ----------------- 10. Measuring performance ------------------------------------------------------------------
size_t progressCnt = 0;
111 changes: 111 additions & 0 deletions inference-engine/samples/benchmark_app/remotecontext_helper.cpp
@@ -0,0 +1,111 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#ifdef USE_REMOTE_MEM
#include <string>
#include <algorithm>
#include <utility>
#include <vector>
#include <map>
#include <regex>

#include <inference_engine.hpp>

#include <samples/common.hpp>
#include <samples/slog.hpp>

#include "utils.hpp"
#include "remotecontext_helper.hpp"
#include "WorkloadContext.h"
#include "RemoteMemory.h"
#include "hddl2/hddl2_params.hpp"
#include "ie_compound_blob.h"

using namespace InferenceEngine;

class RemoteContextHelper::Impl {
WorkloadID _workloadId = -1;
HddlUnite::WorkloadContext::Ptr _context;
RemoteContext::Ptr _contextPtr;
bool _init = false;

public:
void Init(InferenceEngine::Core& ie) {
_context = HddlUnite::createWorkloadContext();
_context->setContext(_workloadId);
auto ret = registerWorkloadContext(_context);
if (ret != HddlStatusCode::HDDL_OK) {
THROW_IE_EXCEPTION << "registerWorkloadContext failed with " << ret;
}

// build the parameter map and create a remote context from the workload context ID
ParamMap paramMap = { {HDDL2_PARAM_KEY(WORKLOAD_CONTEXT_ID), _workloadId} };
_contextPtr = ie.CreateContext("VPUX", paramMap);
_init = true;
}

HddlUnite::RemoteMemory::Ptr allocateRemoteMemory(const void* data, const size_t& dataSize) {
auto remoteFrame = std::make_shared<HddlUnite::RemoteMemory>(*_context,
HddlUnite::RemoteMemoryDesc(dataSize, 1, dataSize, 1));

if (remoteFrame == nullptr) {
THROW_IE_EXCEPTION << "Failed to allocate remote memory.";
}

if (remoteFrame->syncToDevice(data, dataSize) != HDDL_OK) {
THROW_IE_EXCEPTION << "Failed to sync memory to device.";
}
return remoteFrame;
}

void PreallocRemoteMem(InferReqWrap::Ptr& request,
const std::string& inputBlobName,
const Blob::Ptr& inputBlob) {
if (!_init)
THROW_IE_EXCEPTION << "RemoteContextHelper is not initialized.";
MemoryBlob::Ptr minput = as<MemoryBlob>(inputBlob);
const TensorDesc& inputTensor = minput->getTensorDesc();
// locked memory holder should be alive all time while access to its buffer happens
auto minputHolder = minput->rmap();
auto inputBlobData = minputHolder.as<uint8_t*>();

// 1. Allocate memory on the device with HddlUnite
auto remoteMemory = allocateRemoteMemory(inputBlobData, minput->byteSize());

// 2. Create a remote blob that wraps the already-allocated remote memory
ParamMap blobParamMap = { {HDDL2_PARAM_KEY(REMOTE_MEMORY), remoteMemory} };
RemoteBlob::Ptr remoteBlobPtr = _contextPtr->CreateBlob(inputTensor, blobParamMap);
if (remoteBlobPtr == nullptr) {
THROW_IE_EXCEPTION << "CreateBlob failed.";
}

// 3. Set the remote blob on the infer request
request->setBlob(inputBlobName, remoteBlobPtr);
}

RemoteContext::Ptr getRemoteContext() {
if (!_init)
THROW_IE_EXCEPTION << "RemoteContextHelper is not initialized.";
return _contextPtr;
}
};

RemoteContextHelper::RemoteContextHelper() : _impl(new RemoteContextHelper::Impl()) {
}

RemoteContextHelper::~RemoteContextHelper() {
}

void RemoteContextHelper::Init(InferenceEngine::Core& ie) {
_impl->Init(ie);
}

void RemoteContextHelper::PreallocRemoteMem(InferReqWrap::Ptr& request, const std::string& inputBlobName, const Blob::Ptr& inputBlob) {
_impl->PreallocRemoteMem(request, inputBlobName, inputBlob);
}

InferenceEngine::RemoteContext::Ptr RemoteContextHelper::getRemoteContext() {
return _impl->getRemoteContext();
}
#endif
26 changes: 26 additions & 0 deletions inference-engine/samples/benchmark_app/remotecontext_helper.hpp
@@ -0,0 +1,26 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <string>
#include <vector>
#include <map>
#include <memory>
#include <inference_engine.hpp>
#include "infer_request_wrap.hpp"

class RemoteContextHelper {
class Impl;
std::unique_ptr<Impl> _impl;
public:
RemoteContextHelper();
~RemoteContextHelper();

void Init(InferenceEngine::Core& ie);
void PreallocRemoteMem(InferReqWrap::Ptr& request,
const std::string& inputBlobName,
const InferenceEngine::Blob::Ptr& inputBlob);
InferenceEngine::RemoteContext::Ptr getRemoteContext();
};
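A minimal sketch of how these pieces fit together, mirroring the flow in main.cpp above (assumes USE_REMOTE_MEM is defined, the HddlUnite stack is available, and a precompiled model.blob exists at a hypothetical path):

    #include <fstream>
    #include <inference_engine.hpp>
    #include "remotecontext_helper.hpp"

    int runWithRemoteMem() {
        InferenceEngine::Core ie;

        // Create and register a HddlUnite workload context, then build
        // the VPUX remote context from its ID.
        RemoteContextHelper helper;
        helper.Init(ie);

        // Import a precompiled blob through the remote context
        // instead of a plain device name.
        std::ifstream blobFile("model.blob", std::ios::binary);
        InferenceEngine::ExecutableNetwork exeNetwork =
            ie.ImportNetwork(blobFile, helper.getRemoteContext());

        // Per request and per input, the sample then preallocates
        // device-side memory and swaps in the remote blob:
        //   helper.PreallocRemoteMem(requestWrap, inputName, inputBlob);
        return 0;
    }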
