From 09ea419cfe5ceccc6f92b06d7baf7492d7ed4fb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=A1=D0=B8=D0=B7?= =?UTF-8?q?=D0=BE=D0=B2?= Date: Tue, 21 Jul 2020 12:03:02 +0300 Subject: [PATCH 1/8] cant to decode the output --- modules/7_nlp/include/model.hpp | 1 + modules/7_nlp/src/model.cpp | 19 ++++++++++++++----- modules/7_nlp/src/tokenizer.cpp | 21 ++++++++++++++++++++- 3 files changed, 35 insertions(+), 6 deletions(-) diff --git a/modules/7_nlp/include/model.hpp b/modules/7_nlp/include/model.hpp index 52a2f3e..0bf2c9b 100644 --- a/modules/7_nlp/include/model.hpp +++ b/modules/7_nlp/include/model.hpp @@ -14,4 +14,5 @@ class SQuADModel { private: Tokenizer tokenizer; InferenceEngine::InferRequest req; + std::string outputName; }; diff --git a/modules/7_nlp/src/model.cpp b/modules/7_nlp/src/model.cpp index 254e127..47aac54 100644 --- a/modules/7_nlp/src/model.cpp +++ b/modules/7_nlp/src/model.cpp @@ -11,16 +11,23 @@ using namespace InferenceEngine; using namespace cv; using namespace cv::utils::fs; +Blob::Ptr wrapVecToBlob(std::vector v) { + std::vector dims = {1, v.size()}; + return make_shared_blob(TensorDesc(Precision::I32, dims, Layout::NC), (int*)v.data()); +} + SQuADModel::SQuADModel() : tokenizer(join(DATA_FOLDER, "bert-large-uncased-vocab.txt")) { Core ie; // Load deep learning network into memory CNNNetwork net = ie.ReadNetwork(join(DATA_FOLDER, "distilbert.xml"), join(DATA_FOLDER, "distilbert.bin")); - + InputInfo::Ptr inputInfo = net.getInputsInfo()["input.1"]; + // inputInfo->setLayout(Layout::HW); + inputInfo->setPrecision(Precision::I32); + outputName = net.getOutputsInfo().begin()->first; // Initialize runnable object on CPU device ExecutableNetwork execNet = ie.LoadNetwork(net, "CPU"); - // Create a single processing thread req = execNet.CreateInferRequest(); } @@ -39,8 +46,10 @@ std::string SQuADModel::getAnswer(const std::string& question, const std::string tokens.push_back("[SEP]"); std::vector indices = tokenizer.tokensToIndices(tokens); - - // TODO: forward indices through the network and return an answer - + Blob::Ptr q = wrapVecToBlob(indices); + req.SetBlob("input.1", q); + req.Infer(); + int* output1 = req.GetBlob(outputName)->buffer(); + int* output2 = req.GetBlob("Squeeze_438")->buffer(); return ""; } diff --git a/modules/7_nlp/src/tokenizer.cpp b/modules/7_nlp/src/tokenizer.cpp index 1ac56b1..e17390a 100644 --- a/modules/7_nlp/src/tokenizer.cpp +++ b/modules/7_nlp/src/tokenizer.cpp @@ -6,7 +6,26 @@ #include std::vector basicTokenize(const std::string& text) { - CV_Error(cv::Error::StsNotImplemented, "basicTokenize"); + std::vector basicTokens; + std::string currToken = ""; + for (auto ch : text) { + if (isspace(ch)) { + if (!currToken.empty()) + basicTokens.push_back(currToken); + currToken = ""; + } else if (ispunct(ch)) { + if (!currToken.empty()) + basicTokens.push_back(currToken); + currToken = ""; currToken += ch; + basicTokens.push_back(currToken); + currToken = ""; + } else { + currToken += tolower(ch); + } + } + if (!currToken.empty()) + basicTokens.push_back(currToken); + return basicTokens; } std::vector wordTokenize(const std::string& word, From 453c178dbad9969124fe8cf1cc19b5b3f027ebd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=A1=D0=B8=D0=B7?= =?UTF-8?q?=D0=BE=D0=B2?= Date: Tue, 21 Jul 2020 13:53:14 +0300 Subject: [PATCH 2/8] finished --- data/my_squad_answer.txt | 1 + data/my_squad_question.txt | 1 + data/my_squad_source.txt | 1 + modules/7_nlp/src/model.cpp | 37 ++++++++++++++++++++++++++++++++----- 4 files changed, 35 insertions(+), 5 deletions(-) create mode 100644 data/my_squad_answer.txt create mode 100644 data/my_squad_question.txt create mode 100644 data/my_squad_source.txt diff --git a/data/my_squad_answer.txt b/data/my_squad_answer.txt new file mode 100644 index 0000000..7e6ac22 --- /dev/null +++ b/data/my_squad_answer.txt @@ -0,0 +1 @@ +copenhagen telephone exchange \ No newline at end of file diff --git a/data/my_squad_question.txt b/data/my_squad_question.txt new file mode 100644 index 0000000..e19758c --- /dev/null +++ b/data/my_squad_question.txt @@ -0,0 +1 @@ +Where did Erlang work? \ No newline at end of file diff --git a/data/my_squad_source.txt b/data/my_squad_source.txt new file mode 100644 index 0000000..e468f07 --- /dev/null +++ b/data/my_squad_source.txt @@ -0,0 +1 @@ +Erlang worked for the Copenhagen Telephone Exchange and wanted to analyze and optimize its operations \ No newline at end of file diff --git a/modules/7_nlp/src/model.cpp b/modules/7_nlp/src/model.cpp index 47aac54..a500733 100644 --- a/modules/7_nlp/src/model.cpp +++ b/modules/7_nlp/src/model.cpp @@ -46,10 +46,37 @@ std::string SQuADModel::getAnswer(const std::string& question, const std::string tokens.push_back("[SEP]"); std::vector indices = tokenizer.tokensToIndices(tokens); - Blob::Ptr q = wrapVecToBlob(indices); - req.SetBlob("input.1", q); + Blob::Ptr input = wrapVecToBlob(indices); + req.SetBlob("input.1", input); req.Infer(); - int* output1 = req.GetBlob(outputName)->buffer(); - int* output2 = req.GetBlob("Squeeze_438")->buffer(); - return ""; + float* output1 = req.GetBlob(outputName)->buffer(); + float* output2 = req.GetBlob("Squeeze_438")->buffer(); + float max1 = output1[0], max2 = output2[0]; + int indMax1 = 0, indMax2 = 0; + for (int i = 0; i < 128; i++) { + if (output1[i] > max1) { + max1 = output1[i]; + indMax1 = i; + } + + if (output2[i] > max2) { + max2 = output2[i]; + indMax2 = i; + } + } + + std::string result; + for (int i = indMax1; i < indMax2 + 1; i++) { + std::string word = tokenizer.vocab[indices[i]]; + if (word[0] == '#') { + result.pop_back(); + result += word.substr(2, word.length()); + result += ' '; + } + else { + result += word + ' '; + } + } + result.pop_back(); + return result; } From 124f33bda96844ba36e9ab85dd0eb80b83a13bd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=A1=D0=B8=D0=B7?= =?UTF-8?q?=D0=BE=D0=B2?= Date: Tue, 21 Jul 2020 14:02:34 +0300 Subject: [PATCH 3/8] added include files --- modules/7_nlp/include/tokenizer.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/7_nlp/include/tokenizer.hpp b/modules/7_nlp/include/tokenizer.hpp index 0efd486..5e0383c 100644 --- a/modules/7_nlp/include/tokenizer.hpp +++ b/modules/7_nlp/include/tokenizer.hpp @@ -26,7 +26,8 @@ class Tokenizer { // [maxNumTokens] - if number of input tokens less than this value - fill indices by zeros. std::vector tokensToIndices(const std::vector& tokens, int maxNumTokens=128); -private: +public: std::vector vocab; +private: std::map vocabMap; }; From 62a7d539c681d901228e215e372f8cdd336d9a8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=A1=D0=B8=D0=B7?= =?UTF-8?q?=D0=BE=D0=B2?= Date: Tue, 21 Jul 2020 14:15:45 +0300 Subject: [PATCH 4/8] fixed substr --- modules/7_nlp/src/model.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/7_nlp/src/model.cpp b/modules/7_nlp/src/model.cpp index a500733..323ddef 100644 --- a/modules/7_nlp/src/model.cpp +++ b/modules/7_nlp/src/model.cpp @@ -70,7 +70,7 @@ std::string SQuADModel::getAnswer(const std::string& question, const std::string std::string word = tokenizer.vocab[indices[i]]; if (word[0] == '#') { result.pop_back(); - result += word.substr(2, word.length()); + result += word.substr(2, word.length() - 2); result += ' '; } else { From 0850b881a82f787093aa039114804e12f7b6e5a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=A1=D0=B8=D0=B7?= =?UTF-8?q?=D0=BE=D0=B2?= Date: Tue, 21 Jul 2020 14:32:05 +0300 Subject: [PATCH 5/8] fixed result in getAnswer --- modules/7_nlp/include/tokenizer.hpp | 3 +-- modules/7_nlp/src/model.cpp | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/modules/7_nlp/include/tokenizer.hpp b/modules/7_nlp/include/tokenizer.hpp index 5e0383c..db0db60 100644 --- a/modules/7_nlp/include/tokenizer.hpp +++ b/modules/7_nlp/include/tokenizer.hpp @@ -26,8 +26,7 @@ class Tokenizer { // [maxNumTokens] - if number of input tokens less than this value - fill indices by zeros. std::vector tokensToIndices(const std::vector& tokens, int maxNumTokens=128); -public: - std::vector vocab; private: std::map vocabMap; + std::vector vocab; }; diff --git a/modules/7_nlp/src/model.cpp b/modules/7_nlp/src/model.cpp index 323ddef..3bfb956 100644 --- a/modules/7_nlp/src/model.cpp +++ b/modules/7_nlp/src/model.cpp @@ -67,7 +67,7 @@ std::string SQuADModel::getAnswer(const std::string& question, const std::string std::string result; for (int i = indMax1; i < indMax2 + 1; i++) { - std::string word = tokenizer.vocab[indices[i]]; + std::string word = tokens[i]; if (word[0] == '#') { result.pop_back(); result += word.substr(2, word.length() - 2); From 085656af35d3cfc00cd41a5424dba4459bd1cac9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=A1=D0=B8=D0=B7?= =?UTF-8?q?=D0=BE=D0=B2?= Date: Tue, 21 Jul 2020 14:55:17 +0300 Subject: [PATCH 6/8] added checking indMax --- modules/7_nlp/src/model.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/modules/7_nlp/src/model.cpp b/modules/7_nlp/src/model.cpp index 3bfb956..e1c233d 100644 --- a/modules/7_nlp/src/model.cpp +++ b/modules/7_nlp/src/model.cpp @@ -65,16 +65,17 @@ std::string SQuADModel::getAnswer(const std::string& question, const std::string } } - std::string result; + std::string result = ""; + CV_CheckLE(indMax1, indMax2, "indMax1 > indMax2"); for (int i = indMax1; i < indMax2 + 1; i++) { std::string word = tokens[i]; if (word[0] == '#') { result.pop_back(); result += word.substr(2, word.length() - 2); - result += ' '; + result += (char)32; } else { - result += word + ' '; + result += word + (char)32; } } result.pop_back(); From 92629b8dbddfeb072cda24b270840425c05daf9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=A1=D0=B8=D0=B7?= =?UTF-8?q?=D0=BE=D0=B2?= Date: Tue, 21 Jul 2020 15:05:14 +0300 Subject: [PATCH 7/8] fixed output name --- modules/7_nlp/src/model.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/7_nlp/src/model.cpp b/modules/7_nlp/src/model.cpp index e1c233d..982b392 100644 --- a/modules/7_nlp/src/model.cpp +++ b/modules/7_nlp/src/model.cpp @@ -48,8 +48,8 @@ std::string SQuADModel::getAnswer(const std::string& question, const std::string std::vector indices = tokenizer.tokensToIndices(tokens); Blob::Ptr input = wrapVecToBlob(indices); req.SetBlob("input.1", input); - req.Infer(); - float* output1 = req.GetBlob(outputName)->buffer(); + req.Infer(); + float* output1 = req.GetBlob("Squeeze_437")->buffer(); float* output2 = req.GetBlob("Squeeze_438")->buffer(); float max1 = output1[0], max2 = output2[0]; int indMax1 = 0, indMax2 = 0; From ee43d7024918981b69417f112187d3e4df02b226 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=A1=D0=B8=D0=B7?= =?UTF-8?q?=D0=BE=D0=B2?= Date: Tue, 21 Jul 2020 15:24:53 +0300 Subject: [PATCH 8/8] fixed wrapVecToBlob --- modules/7_nlp/src/model.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/modules/7_nlp/src/model.cpp b/modules/7_nlp/src/model.cpp index 982b392..d49788d 100644 --- a/modules/7_nlp/src/model.cpp +++ b/modules/7_nlp/src/model.cpp @@ -11,7 +11,7 @@ using namespace InferenceEngine; using namespace cv; using namespace cv::utils::fs; -Blob::Ptr wrapVecToBlob(std::vector v) { +Blob::Ptr wrapVecToBlob(const std::vector& v) { std::vector dims = {1, v.size()}; return make_shared_blob(TensorDesc(Precision::I32, dims, Layout::NC), (int*)v.data()); } @@ -23,7 +23,7 @@ SQuADModel::SQuADModel() : tokenizer(join(DATA_FOLDER, "bert-large-uncased-vocab CNNNetwork net = ie.ReadNetwork(join(DATA_FOLDER, "distilbert.xml"), join(DATA_FOLDER, "distilbert.bin")); InputInfo::Ptr inputInfo = net.getInputsInfo()["input.1"]; - // inputInfo->setLayout(Layout::HW); + inputInfo->setLayout(Layout::NC); inputInfo->setPrecision(Precision::I32); outputName = net.getOutputsInfo().begin()->first; // Initialize runnable object on CPU device @@ -49,8 +49,8 @@ std::string SQuADModel::getAnswer(const std::string& question, const std::string Blob::Ptr input = wrapVecToBlob(indices); req.SetBlob("input.1", input); req.Infer(); - float* output1 = req.GetBlob("Squeeze_437")->buffer(); - float* output2 = req.GetBlob("Squeeze_438")->buffer(); + float* output1 = req.GetBlob("Squeeze_437")->buffer().as(); + float* output2 = req.GetBlob("Squeeze_438")->buffer().as(); float max1 = output1[0], max2 = output2[0]; int indMax1 = 0, indMax2 = 0; for (int i = 0; i < 128; i++) { @@ -65,6 +65,7 @@ std::string SQuADModel::getAnswer(const std::string& question, const std::string } } + std::cout << indMax1 << " " << indMax2 << std::endl; std::string result = ""; CV_CheckLE(indMax1, indMax2, "indMax1 > indMax2"); for (int i = indMax1; i < indMax2 + 1; i++) {