From 210c2e9b8a080df99c0095291f17ecd29bd0b592 Mon Sep 17 00:00:00 2001 From: gilpazintel Date: Tue, 24 Dec 2024 13:38:39 +0200 Subject: [PATCH 01/10] ff --- src/rs.cpp | 93 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 67 insertions(+), 26 deletions(-) diff --git a/src/rs.cpp b/src/rs.cpp index 6add8361f1..67a1756677 100644 --- a/src/rs.cpp +++ b/src/rs.cpp @@ -64,6 +64,10 @@ #include #include +#include +#include +#include // For std::this_thread::sleep_for + //////////////////////// // API implementation // //////////////////////// @@ -4106,6 +4110,19 @@ void rs2_project_point_to_pixel(float pixel[2], const struct rs2_intrinsics* int } NOEXCEPT_RETURN(, pixel) +/* return true in case all intrinsics distortion coefficient are zero. + otherwise, return false.*/ +const int DISTORTION_COEFF = 5; +inline bool is_intrinsics_distortion_zero(const struct rs2_intrinsics* intrin) +{ + for(int i = 0;i < DISTORTION_COEFF;i++) + { + if (abs(intrin->coeffs[i]) > std::numeric_limits::epsilon()) + return false; + } + return true; +} + void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* intrin, const float pixel[2], float depth) BEGIN_API_CALL { assert(intrin->model != RS2_DISTORTION_MODIFIED_BROWN_CONRADY); // Cannot deproject from a forward-distorted image @@ -4117,39 +4134,53 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i float xo = x; float yo = y; - if (intrin->model == RS2_DISTORTION_INVERSE_BROWN_CONRADY) + // Get the starting time point + auto start = std::chrono::high_resolution_clock::now(); + + std::cout << "rs2_deproject_pixel_to_point start" << std::endl; + + if(!is_intrinsics_distortion_zero(intrin)) { - // need to loop until convergence - // 10 iterations determined empirically - for (int i = 0; i < 10; i++) + std::cout << "rs2_deproject_pixel_to_point 1" << std::endl; + + + if (intrin->model == RS2_DISTORTION_INVERSE_BROWN_CONRADY) { - float r2 = x * x + y * y; - float icdist = (float)1 / (float)(1 + ((intrin->coeffs[4] * r2 + intrin->coeffs[1]) * r2 + intrin->coeffs[0]) * r2); - float xq = x / icdist; - float yq = y / icdist; - float delta_x = 2 * intrin->coeffs[2] * xq * yq + intrin->coeffs[3] * (r2 + 2 * xq * xq); - float delta_y = 2 * intrin->coeffs[3] * xq * yq + intrin->coeffs[2] * (r2 + 2 * yq * yq); - x = (xo - delta_x) * icdist; - y = (yo - delta_y) * icdist; + std::cout << "rs2_deproject_pixel_to_point 2" << std::endl; + // need to loop until convergence + // 10 iterations determined empirically + for (int i = 0; i < 10; i++) + { + float r2 = x * x + y * y; + float icdist = (float)1 / (float)(1 + ((intrin->coeffs[4] * r2 + intrin->coeffs[1]) * r2 + intrin->coeffs[0]) * r2); + float xq = x / icdist; + float yq = y / icdist; + float delta_x = 2 * intrin->coeffs[2] * xq * yq + intrin->coeffs[3] * (r2 + 2 * xq * xq); + float delta_y = 2 * intrin->coeffs[3] * xq * yq + intrin->coeffs[2] * (r2 + 2 * yq * yq); + x = (xo - delta_x) * icdist; + y = (yo - delta_y) * icdist; + } + } - } - if (intrin->model == RS2_DISTORTION_BROWN_CONRADY) - { - // need to loop until convergence - // 10 iterations determined empirically - for (int i = 0; i < 10; i++) + if (intrin->model == RS2_DISTORTION_BROWN_CONRADY) { - float r2 = x * x + y * y; - float icdist = (float)1 / (float)(1 + ((intrin->coeffs[4] * r2 + intrin->coeffs[1]) * r2 + intrin->coeffs[0]) * r2); - float delta_x = 2 * intrin->coeffs[2] * x * y + intrin->coeffs[3] * (r2 + 2 * x * x); - float delta_y = 2 * intrin->coeffs[3] * x * y + intrin->coeffs[2] * (r2 + 2 * y * y); - x = (xo - delta_x) * icdist; - y = (yo - delta_y) * icdist; + std::cout << "rs2_deproject_pixel_to_point 3" << std::endl; + // need to loop until convergence + // 10 iterations determined empirically + for (int i = 0; i < 10; i++) + { + float r2 = x * x + y * y; + float icdist = (float)1 / (float)(1 + ((intrin->coeffs[4] * r2 + intrin->coeffs[1]) * r2 + intrin->coeffs[0]) * r2); + float delta_x = 2 * intrin->coeffs[2] * x * y + intrin->coeffs[3] * (r2 + 2 * x * x); + float delta_y = 2 * intrin->coeffs[3] * x * y + intrin->coeffs[2] * (r2 + 2 * y * y); + x = (xo - delta_x) * icdist; + y = (yo - delta_y) * icdist; + } } - - } + } if (intrin->model == RS2_DISTORTION_KANNALA_BRANDT4) { + std::cout << "rs2_deproject_pixel_to_point 4" << std::endl; float rd = sqrtf(x * x + y * y); if (rd < FLT_EPSILON) { @@ -4175,6 +4206,7 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i } if (intrin->model == RS2_DISTORTION_FTHETA) { + std::cout << "rs2_deproject_pixel_to_point 5" << std::endl; float rd = sqrtf(x * x + y * y); if (rd < FLT_EPSILON) { @@ -4188,6 +4220,15 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i point[0] = depth * x; point[1] = depth * y; point[2] = depth; + + // Get the ending time point + auto end = std::chrono::high_resolution_clock::now(); + + // Calculate the elapsed time in milliseconds + auto duration = std::chrono::duration_cast(end - start); + + // Output the elapsed time + std::cout << "rs2_deproject_pixel_to_point Elapsed time: " << duration.count() << " milliseconds" << std::endl; } NOEXCEPT_RETURN(, point) From 89b95f87ec5a98159492e67559de2726fea5e065 Mon Sep 17 00:00:00 2001 From: gilpazintel Date: Tue, 24 Dec 2024 16:04:20 +0200 Subject: [PATCH 02/10] before trying the exact code of the submitter --- examples/measure/rs-measure.cpp | 17 ++++++++++++++ src/rs.cpp | 39 +++++++++++++++++++++------------ 2 files changed, 42 insertions(+), 14 deletions(-) diff --git a/examples/measure/rs-measure.cpp b/examples/measure/rs-measure.cpp index 9109c82fd5..3de67dda88 100644 --- a/examples/measure/rs-measure.cpp +++ b/examples/measure/rs-measure.cpp @@ -15,6 +15,7 @@ #include #include #include +#include using pixel = std::pair; @@ -100,6 +101,7 @@ void render_simple_distance(const rs2::depth_frame& depth, int main(int argc, char * argv[]) try { + rs2::log_to_console(RS2_LOG_SEVERITY_ERROR); auto settings = rs2::cli( "rs-measure example" ) .process( argc, argv ); @@ -291,9 +293,24 @@ float dist_3d(const rs2::depth_frame& frame, pixel u, pixel v) // Deproject from pixel to point in 3D rs2_intrinsics intr = frame.get_profile().as().get_intrinsics(); // Calibration data + // Get the starting time point + auto start = std::chrono::high_resolution_clock::now(); + rs2_deproject_pixel_to_point(upoint, &intr, upixel, udist); + + // Get the ending time point + auto end = std::chrono::high_resolution_clock::now(); + + // Calculate the elapsed time in milliseconds + auto duration = std::chrono::duration_cast(end - start); + + // Output the elapsed time + LOG_ERROR(duration.count()); + rs2_deproject_pixel_to_point(vpoint, &intr, vpixel, vdist); + + // Calculate euclidean distance between the two points return sqrt(pow(upoint[0] - vpoint[0], 2.f) + pow(upoint[1] - vpoint[1], 2.f) + diff --git a/src/rs.cpp b/src/rs.cpp index 67a1756677..f48e061620 100644 --- a/src/rs.cpp +++ b/src/rs.cpp @@ -67,6 +67,7 @@ #include #include #include // For std::this_thread::sleep_for +#include // For random number generation //////////////////////// // API implementation // @@ -4113,14 +4114,22 @@ NOEXCEPT_RETURN(, pixel) /* return true in case all intrinsics distortion coefficient are zero. otherwise, return false.*/ const int DISTORTION_COEFF = 5; -inline bool is_intrinsics_distortion_zero(const struct rs2_intrinsics* intrin) +/*inline bool is_intrinsics_distortion_zero(const struct rs2_intrinsics* intrin) { for(int i = 0;i < DISTORTION_COEFF;i++) { if (abs(intrin->coeffs[i]) > std::numeric_limits::epsilon()) return false; } - return true; + //return true; + return false; +}*/ + +inline bool is_intrinsics_distortion_zero(const struct rs2_intrinsics* intrin) +{ + return (abs(intrin->coeffs[0]) < std::numeric_limits::epsilon() && abs(intrin->coeffs[1]) < std::numeric_limits::epsilon() && + abs(intrin->coeffs[2]) < std::numeric_limits::epsilon() && abs(intrin->coeffs[3]) < std::numeric_limits::epsilon() && + abs(intrin->coeffs[4]) < std::numeric_limits::epsilon()); } void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* intrin, const float pixel[2], float depth) BEGIN_API_CALL @@ -4135,18 +4144,23 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i float yo = y; // Get the starting time point - auto start = std::chrono::high_resolution_clock::now(); + //auto start = std::chrono::high_resolution_clock::now(); - std::cout << "rs2_deproject_pixel_to_point start" << std::endl; + // Use const_cast to remove the const qualifier + //rs2_intrinsics* non_const_intrin = const_cast(intrin); + + //non_const_intrin->coeffs[0] = 0.01; + //non_const_intrin->coeffs[1] = 0.02; + //non_const_intrin->coeffs[2] = 0.03; + //non_const_intrin->coeffs[3] = 0.04; + //non_const_intrin->coeffs[4] = 0.05; + if(!is_intrinsics_distortion_zero(intrin)) - { - std::cout << "rs2_deproject_pixel_to_point 1" << std::endl; - + { if (intrin->model == RS2_DISTORTION_INVERSE_BROWN_CONRADY) { - std::cout << "rs2_deproject_pixel_to_point 2" << std::endl; // need to loop until convergence // 10 iterations determined empirically for (int i = 0; i < 10; i++) @@ -4164,7 +4178,6 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i } if (intrin->model == RS2_DISTORTION_BROWN_CONRADY) { - std::cout << "rs2_deproject_pixel_to_point 3" << std::endl; // need to loop until convergence // 10 iterations determined empirically for (int i = 0; i < 10; i++) @@ -4180,7 +4193,6 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i } if (intrin->model == RS2_DISTORTION_KANNALA_BRANDT4) { - std::cout << "rs2_deproject_pixel_to_point 4" << std::endl; float rd = sqrtf(x * x + y * y); if (rd < FLT_EPSILON) { @@ -4206,7 +4218,6 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i } if (intrin->model == RS2_DISTORTION_FTHETA) { - std::cout << "rs2_deproject_pixel_to_point 5" << std::endl; float rd = sqrtf(x * x + y * y); if (rd < FLT_EPSILON) { @@ -4222,13 +4233,13 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i point[2] = depth; // Get the ending time point - auto end = std::chrono::high_resolution_clock::now(); + //auto end = std::chrono::high_resolution_clock::now(); // Calculate the elapsed time in milliseconds - auto duration = std::chrono::duration_cast(end - start); + //auto duration = std::chrono::duration_cast(end - start); // Output the elapsed time - std::cout << "rs2_deproject_pixel_to_point Elapsed time: " << duration.count() << " milliseconds" << std::endl; + //std::cout << duration.count() << std::endl; } NOEXCEPT_RETURN(, point) From 4183a3f4cb9246e3cb0adce8a901ef15cfa0b36b Mon Sep 17 00:00:00 2001 From: gilpazintel Date: Tue, 24 Dec 2024 16:43:29 +0200 Subject: [PATCH 03/10] after applying the exact code from Wen - still I do not see an improvement rather a decrease in performance. --- src/rs.cpp | 101 +++++++++++++++++++---------------------------------- 1 file changed, 36 insertions(+), 65 deletions(-) diff --git a/src/rs.cpp b/src/rs.cpp index f48e061620..0876d055ed 100644 --- a/src/rs.cpp +++ b/src/rs.cpp @@ -4110,21 +4110,7 @@ void rs2_project_point_to_pixel(float pixel[2], const struct rs2_intrinsics* int pixel[1] = y * intrin->fy + intrin->ppy; } NOEXCEPT_RETURN(, pixel) - -/* return true in case all intrinsics distortion coefficient are zero. - otherwise, return false.*/ -const int DISTORTION_COEFF = 5; -/*inline bool is_intrinsics_distortion_zero(const struct rs2_intrinsics* intrin) -{ - for(int i = 0;i < DISTORTION_COEFF;i++) - { - if (abs(intrin->coeffs[i]) > std::numeric_limits::epsilon()) - return false; - } - //return true; - return false; -}*/ - +/* Helper inner function (not part of the API) */ inline bool is_intrinsics_distortion_zero(const struct rs2_intrinsics* intrin) { return (abs(intrin->coeffs[0]) < std::numeric_limits::epsilon() && abs(intrin->coeffs[1]) < std::numeric_limits::epsilon() && @@ -4135,33 +4121,18 @@ inline bool is_intrinsics_distortion_zero(const struct rs2_intrinsics* intrin) void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* intrin, const float pixel[2], float depth) BEGIN_API_CALL { assert(intrin->model != RS2_DISTORTION_MODIFIED_BROWN_CONRADY); // Cannot deproject from a forward-distorted image - //assert(intrin->model != RS2_DISTORTION_BROWN_CONRADY); // Cannot deproject to an brown conrady model - + float x = (pixel[0] - intrin->ppx) / intrin->fx; float y = (pixel[1] - intrin->ppy) / intrin->fy; float xo = x; float yo = y; - // Get the starting time point - //auto start = std::chrono::high_resolution_clock::now(); - - // Use const_cast to remove the const qualifier - //rs2_intrinsics* non_const_intrin = const_cast(intrin); - - //non_const_intrin->coeffs[0] = 0.01; - //non_const_intrin->coeffs[1] = 0.02; - //non_const_intrin->coeffs[2] = 0.03; - //non_const_intrin->coeffs[3] = 0.04; - //non_const_intrin->coeffs[4] = 0.05; - - - if(!is_intrinsics_distortion_zero(intrin)) - { - + if (!is_intrinsics_distortion_zero(intrin)) + { if (intrin->model == RS2_DISTORTION_INVERSE_BROWN_CONRADY) { - // need to loop until convergence + // need to loop until convergence // 10 iterations determined empirically for (int i = 0; i < 10; i++) { @@ -4174,11 +4145,10 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i x = (xo - delta_x) * icdist; y = (yo - delta_y) * icdist; } - } if (intrin->model == RS2_DISTORTION_BROWN_CONRADY) { - // need to loop until convergence + // need to loop until convergence // 10 iterations determined empirically for (int i = 0; i < 10; i++) { @@ -4189,43 +4159,44 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i x = (xo - delta_x) * icdist; y = (yo - delta_y) * icdist; } + } - } - if (intrin->model == RS2_DISTORTION_KANNALA_BRANDT4) - { - float rd = sqrtf(x * x + y * y); - if (rd < FLT_EPSILON) + if (intrin->model == RS2_DISTORTION_KANNALA_BRANDT4) { - rd = FLT_EPSILON; - } + float rd = sqrtf(x * x + y * y); + if (rd < FLT_EPSILON) + { + rd = FLT_EPSILON; + } - float theta = rd; - float theta2 = rd * rd; - for (int i = 0; i < 4; i++) - { - float f = theta * (1 + theta2 * (intrin->coeffs[0] + theta2 * (intrin->coeffs[1] + theta2 * (intrin->coeffs[2] + theta2 * intrin->coeffs[3])))) - rd; - if (fabs(f) < FLT_EPSILON) + float theta = rd; + float theta2 = rd * rd; + for (int i = 0; i < 4; i++) { - break; + float f = theta * (1 + theta2 * (intrin->coeffs[0] + theta2 * (intrin->coeffs[1] + theta2 * (intrin->coeffs[2] + theta2 * intrin->coeffs[3])))) - rd; + if (fabs(f) < FLT_EPSILON) + { + break; + } + float df = 1 + theta2 * (3 * intrin->coeffs[0] + theta2 * (5 * intrin->coeffs[1] + theta2 * (7 * intrin->coeffs[2] + 9 * theta2 * intrin->coeffs[3]))); + theta -= f / df; + theta2 = theta * theta; } - float df = 1 + theta2 * (3 * intrin->coeffs[0] + theta2 * (5 * intrin->coeffs[1] + theta2 * (7 * intrin->coeffs[2] + 9 * theta2 * intrin->coeffs[3]))); - theta -= f / df; - theta2 = theta * theta; + float r = tan(theta); + x *= r / rd; + y *= r / rd; } - float r = tan(theta); - x *= r / rd; - y *= r / rd; - } - if (intrin->model == RS2_DISTORTION_FTHETA) - { - float rd = sqrtf(x * x + y * y); - if (rd < FLT_EPSILON) + if (intrin->model == RS2_DISTORTION_FTHETA) { - rd = FLT_EPSILON; + float rd = sqrtf(x * x + y * y); + if (rd < FLT_EPSILON) + { + rd = FLT_EPSILON; + } + float r = (float)(tan(intrin->coeffs[0] * rd) / atan(2 * tan(intrin->coeffs[0] / 2.0f))); + x *= r / rd; + y *= r / rd; } - float r = (float)(tan(intrin->coeffs[0] * rd) / atan(2 * tan(intrin->coeffs[0] / 2.0f))); - x *= r / rd; - y *= r / rd; } point[0] = depth * x; From ec6fbb5823801b9b2530156a17a2ddbd92a66bda Mon Sep 17 00:00:00 2001 From: gilpazintel Date: Wed, 1 Jan 2025 10:46:07 +0200 Subject: [PATCH 04/10] forcing the code to go though the loop --- examples/measure/rs-measure.cpp | 2 +- src/rs.cpp | 18 ++++++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/examples/measure/rs-measure.cpp b/examples/measure/rs-measure.cpp index 3de67dda88..f34fcbcf51 100644 --- a/examples/measure/rs-measure.cpp +++ b/examples/measure/rs-measure.cpp @@ -302,7 +302,7 @@ float dist_3d(const rs2::depth_frame& frame, pixel u, pixel v) auto end = std::chrono::high_resolution_clock::now(); // Calculate the elapsed time in milliseconds - auto duration = std::chrono::duration_cast(end - start); + auto duration = std::chrono::duration_cast(end - start); // Output the elapsed time LOG_ERROR(duration.count()); diff --git a/src/rs.cpp b/src/rs.cpp index 0876d055ed..cabc5ef68e 100644 --- a/src/rs.cpp +++ b/src/rs.cpp @@ -4113,9 +4113,10 @@ NOEXCEPT_RETURN(, pixel) /* Helper inner function (not part of the API) */ inline bool is_intrinsics_distortion_zero(const struct rs2_intrinsics* intrin) { - return (abs(intrin->coeffs[0]) < std::numeric_limits::epsilon() && abs(intrin->coeffs[1]) < std::numeric_limits::epsilon() && - abs(intrin->coeffs[2]) < std::numeric_limits::epsilon() && abs(intrin->coeffs[3]) < std::numeric_limits::epsilon() && - abs(intrin->coeffs[4]) < std::numeric_limits::epsilon()); + return true; + //return (abs(intrin->coeffs[0]) < std::numeric_limits::epsilon() && abs(intrin->coeffs[1]) < std::numeric_limits::epsilon() && + // abs(intrin->coeffs[2]) < std::numeric_limits::epsilon() && abs(intrin->coeffs[3]) < std::numeric_limits::epsilon() && + //abs(intrin->coeffs[4]) < std::numeric_limits::epsilon()); } void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* intrin, const float pixel[2], float depth) BEGIN_API_CALL @@ -4127,6 +4128,7 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i float xo = x; float yo = y; + if (!is_intrinsics_distortion_zero(intrin)) { @@ -4148,9 +4150,10 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i } if (intrin->model == RS2_DISTORTION_BROWN_CONRADY) { + int i = 0; // need to loop until convergence // 10 iterations determined empirically - for (int i = 0; i < 10; i++) + for (; i < 10; i++) { float r2 = x * x + y * y; float icdist = (float)1 / (float)(1 + ((intrin->coeffs[4] * r2 + intrin->coeffs[1]) * r2 + intrin->coeffs[0]) * r2); @@ -4159,6 +4162,7 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i x = (xo - delta_x) * icdist; y = (yo - delta_y) * icdist; } + LOG_ERROR("i=" << i); } if (intrin->model == RS2_DISTORTION_KANNALA_BRANDT4) @@ -4198,6 +4202,12 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i y *= r / rd; } } + else + { + + int i = 0; + LOG_ERROR("i=" << i); + } point[0] = depth * x; point[1] = depth * y; From fddfd7c250bef9f15021810bbdcac6cc802ebe60 Mon Sep 17 00:00:00 2001 From: gilpazintel Date: Mon, 6 Jan 2025 13:40:47 +0200 Subject: [PATCH 05/10] disable BUILD_WITH_CPU_EXTENSIONS SSE disable --- CMake/lrs_options.cmake | 2 +- src/image-avx.cpp | 2 +- src/image-avx.h | 2 +- src/proc/align.cpp | 11 +++++++++-- src/proc/color-formats-converter.cpp | 12 ++++++------ src/proc/pointcloud.cpp | 4 ++-- src/proc/sse/sse-align.cpp | 2 +- src/proc/sse/sse-align.h | 4 ++-- src/proc/sse/sse-pointcloud.cpp | 6 +++--- src/proc/y411-converter.cpp | 6 +++--- src/proc/y411-converter.h | 2 +- src/rs.cpp | 13 ++++++------- wrappers/opencv/depth-filter/downsample.cpp | 4 ++-- 13 files changed, 38 insertions(+), 32 deletions(-) diff --git a/CMake/lrs_options.cmake b/CMake/lrs_options.cmake index 94b25621ac..7cb07c905a 100644 --- a/CMake/lrs_options.cmake +++ b/CMake/lrs_options.cmake @@ -42,7 +42,7 @@ if (NOT APPLE) else() option(CHECK_FOR_UPDATES "Checks for versions updates" OFF) endif() -option(BUILD_WITH_CPU_EXTENSIONS "Enable compiler optimizations using CPU extensions (such as AVX)" ON) +option(BUILD_WITH_CPU_EXTENSIONS "Enable compiler optimizations using CPU extensions (such as AVX)" OFF) set(UNIT_TESTS_ARGS "" CACHE STRING "Command-line arguments to pass to unit-tests-config.py, e.g. '-t -r '") #Performance improvement with Ubuntu 18/20 if(UNIX AND (NOT ANDROID_NDK_TOOLCHAIN_INCLUDED)) diff --git a/src/image-avx.cpp b/src/image-avx.cpp index 012c099753..ba3640408a 100644 --- a/src/image-avx.cpp +++ b/src/image-avx.cpp @@ -6,7 +6,7 @@ #include "image-avx.h" #ifndef ANDROID - #if defined(__SSSE3__) && defined(__AVX2__) + #if defined(__SSE4__) && defined(__AVX2__) #include // For SSE3 intrinsic used in unpack_yuy2_sse #include diff --git a/src/image-avx.h b/src/image-avx.h index 9772517bf0..a162681ca3 100644 --- a/src/image-avx.h +++ b/src/image-avx.h @@ -10,7 +10,7 @@ namespace librealsense { #ifndef ANDROID - #if defined(__SSSE3__) && defined(__AVX2__) + #if defined(__SSE4__) && defined(__AVX2__) void unpack_yuy2_avx_y8(uint8_t * const d[], const uint8_t * s, int n); void unpack_yuy2_avx_y16(uint8_t * const d[], const uint8_t * s, int n); void unpack_yuy2_avx_rgb8(uint8_t * const d[], const uint8_t * s, int n); diff --git a/src/proc/align.cpp b/src/proc/align.cpp index a1910290cd..8d8441a525 100644 --- a/src/proc/align.cpp +++ b/src/proc/align.cpp @@ -13,7 +13,7 @@ #if defined(RS2_USE_CUDA) #include "proc/cuda/cuda-align.h" -#elif defined(__SSSE3__) +#elif defined(__SSE4__) #include "proc/sse/sse-align.h" #endif #include "proc/neon/neon-align.h" @@ -26,7 +26,7 @@ namespace librealsense { #if defined(RS2_USE_CUDA) return std::make_shared(align_to); - #elif defined(__SSSE3__) + #elif defined(__SSE4__) return std::make_shared(align_to); #elif defined(__ARM_NEON) && ! defined(ANDROID) return std::make_shared(align_to); @@ -51,7 +51,14 @@ namespace librealsense { // Map the top-left corner of the depth pixel onto the other image float depth_pixel[2] = { depth_x - 0.5f, depth_y - 0.5f }, depth_point[3], other_point[3], other_pixel[2]; + + auto start = std::chrono::high_resolution_clock::now(); rs2_deproject_pixel_to_point(depth_point, &depth_intrin, depth_pixel, depth); + auto end = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(end - start); + // Output the duration + std::cout << duration.count() << "\n"; + rs2_transform_point_to_point(other_point, &depth_to_other, depth_point); rs2_project_point_to_pixel(other_pixel, &other_intrin, other_point); const int other_x0 = static_cast(other_pixel[0] + 0.5f); diff --git a/src/proc/color-formats-converter.cpp b/src/proc/color-formats-converter.cpp index f0a54ba585..d80a4964e3 100644 --- a/src/proc/color-formats-converter.cpp +++ b/src/proc/color-formats-converter.cpp @@ -14,7 +14,7 @@ #ifdef RS2_USE_CUDA #include "cuda/cuda-conversion.cuh" #endif -#ifdef __SSSE3__ +#ifdef __SSE4__ #include // For SSSE3 intrinsics #endif #include "neon/image-neon.h" @@ -60,7 +60,7 @@ namespace librealsense rscuda::unpack_yuy2_cuda(d, s, n); return; #endif -#if defined __SSSE3__ && ! defined ANDROID +#if defined __SSE4__ && ! defined ANDROID static bool do_avx = has_avx(); #ifdef __AVX2__ @@ -477,7 +477,7 @@ namespace librealsense } } -#if defined __SSSE3__ && ! defined ANDROID +#if defined __SSE4__ && ! defined ANDROID // This method receives 1 line of y and one line of uv. // source_chunks_y // yyyyyyyyyyyyyyyy // source_chunks_uv // uvuvuvuvuvuvuvuv @@ -631,7 +631,7 @@ namespace librealsense auto n = width * height; assert(n % 16 == 0); // All currently supported color resolutions are multiples of 16 pixels. Could easily extend support to other resolutions by copying final n<16 pixels into a zero-padded buffer and recursively calling self for final iteration. -#if defined __SSSE3__ && ! defined ANDROID +#if defined __SSE4__ && ! defined ANDROID static bool do_avx = has_avx(); auto src = reinterpret_cast(s); @@ -753,7 +753,7 @@ namespace librealsense m420_parse_one_line(start_of_second_line, start_of_uv, &dst, width); } return; -#endif // __SSSE3__ +#endif // __SSE4__ } void unpack_yuy2(rs2_format dst_format, rs2_stream dst_stream, uint8_t * const d[], const uint8_t * s, int w, int h, int actual_size) @@ -822,7 +822,7 @@ namespace librealsense { auto n = width * height; assert(n % 16 == 0); // All currently supported color resolutions are multiples of 16 pixels. Could easily extend support to other resolutions by copying final n<16 pixels into a zero-padded buffer and recursively calling self for final iteration. -#ifdef __SSSE3__ +#ifdef __SSE4__ auto src = reinterpret_cast(s); auto dst = reinterpret_cast<__m128i *>(d[0]); for (; n; n -= 16) diff --git a/src/proc/pointcloud.cpp b/src/proc/pointcloud.cpp index 5b5db2724a..9712c856a2 100644 --- a/src/proc/pointcloud.cpp +++ b/src/proc/pointcloud.cpp @@ -19,7 +19,7 @@ #ifdef RS2_USE_CUDA #include "proc/cuda/cuda-pointcloud.h" #endif -#ifdef __SSSE3__ +#ifdef __SSE4__ #include "proc/sse/sse-pointcloud.h" #endif #include "proc/neon/neon-pointcloud.h" @@ -397,7 +397,7 @@ namespace librealsense { #ifdef RS2_USE_CUDA return std::make_shared(); - #elif defined(__SSSE3__) + #elif defined(__SSE4__) return std::make_shared(); #elif defined(__ARM_NEON) && ! defined ANDROID return std::make_shared(); diff --git a/src/proc/sse/sse-align.cpp b/src/proc/sse/sse-align.cpp index 904cb8df5c..42b6adeb4a 100644 --- a/src/proc/sse/sse-align.cpp +++ b/src/proc/sse/sse-align.cpp @@ -1,6 +1,6 @@ // License: Apache 2.0. See LICENSE file in root directory. // Copyright(c) 2019 Intel Corporation. All Rights Reserved. -#ifdef __SSSE3__ +#ifdef __SSE4__ #include "sse-align.h" #include // For SSE3 intrinsic used in unpack_yuy2_sse diff --git a/src/proc/sse/sse-align.h b/src/proc/sse/sse-align.h index 133dcde362..d1b4e3107c 100644 --- a/src/proc/sse/sse-align.h +++ b/src/proc/sse/sse-align.h @@ -1,7 +1,7 @@ // License: Apache 2.0. See LICENSE file in root directory. // Copyright(c) 2024 Intel Corporation. All Rights Reserved. #pragma once -#ifdef __SSSE3__ +#ifdef __SSE4__ #include "proc/align.h" #include @@ -87,4 +87,4 @@ namespace librealsense std::shared_ptr _stream_transform; }; } -#endif // __SSSE3__ +#endif // __SSE4__ diff --git a/src/proc/sse/sse-pointcloud.cpp b/src/proc/sse/sse-pointcloud.cpp index f2f6e63e9e..5aa658261d 100644 --- a/src/proc/sse/sse-pointcloud.cpp +++ b/src/proc/sse/sse-pointcloud.cpp @@ -11,7 +11,7 @@ #include -#ifdef __SSSE3__ +#ifdef __SSE4__ #include // For SSSE3 intrinsics @@ -56,7 +56,7 @@ namespace librealsense const rs2_intrinsics &depth_intrinsics, const rs2::depth_frame& depth_frame) { -#ifdef __SSSE3__ +#ifdef __SSE4__ auto depth_image = (const uint16_t*)depth_frame.get_data(); @@ -145,7 +145,7 @@ namespace librealsense { auto tex_ptr = texture_map; -#ifdef __SSSE3__ +#ifdef __SSE4__ auto point = reinterpret_cast(points); auto res = reinterpret_cast(tex_ptr); auto res1 = reinterpret_cast(pixels_ptr); diff --git a/src/proc/y411-converter.cpp b/src/proc/y411-converter.cpp index 725a9149bc..1ace0b1035 100644 --- a/src/proc/y411-converter.cpp +++ b/src/proc/y411-converter.cpp @@ -6,7 +6,7 @@ #ifdef RS2_USE_CUDA #include "cuda/cuda-conversion.cuh" #endif -#ifdef __SSSE3__ +#ifdef __SSE4__ #include // For SSSE3 intrinsics #endif @@ -44,7 +44,7 @@ namespace librealsense // See https://www.fourcc.org/pixel-format/yuv-y411/ // -#if defined __SSSE3__ && ! defined ANDROID +#if defined __SSE4__ && ! defined ANDROID void unpack_y411_sse( uint8_t * const dest, const uint8_t * const s, int w, int h, int actual_size) { auto n = w * h; @@ -297,7 +297,7 @@ namespace librealsense // The size of the frame must be bigger than 4 pixels and product of 32 void unpack_y411( uint8_t * const dest[], const uint8_t * const s, int w, int h, int actual_size ) { -#if defined __SSSE3__ && ! defined ANDROID +#if defined __SSE4__ && ! defined ANDROID unpack_y411_sse(dest[0], s, w, h, actual_size); #else unpack_y411_native(dest[0], s, w, h, actual_size); diff --git a/src/proc/y411-converter.h b/src/proc/y411-converter.h index 0c793f74a0..630b34a8d3 100644 --- a/src/proc/y411-converter.h +++ b/src/proc/y411-converter.h @@ -24,7 +24,7 @@ namespace librealsense void unpack_y411( uint8_t * const dest[], const uint8_t * const s, int w, int h, int actual_size); -#if defined __SSSE3__ && ! defined ANDROID +#if defined __SSE4__ && ! defined ANDROID void unpack_y411_sse( uint8_t * const dest, const uint8_t * const s, int w, int h, int actual_size); #endif diff --git a/src/rs.cpp b/src/rs.cpp index cabc5ef68e..0bb1b1d144 100644 --- a/src/rs.cpp +++ b/src/rs.cpp @@ -4113,10 +4113,10 @@ NOEXCEPT_RETURN(, pixel) /* Helper inner function (not part of the API) */ inline bool is_intrinsics_distortion_zero(const struct rs2_intrinsics* intrin) { - return true; - //return (abs(intrin->coeffs[0]) < std::numeric_limits::epsilon() && abs(intrin->coeffs[1]) < std::numeric_limits::epsilon() && - // abs(intrin->coeffs[2]) < std::numeric_limits::epsilon() && abs(intrin->coeffs[3]) < std::numeric_limits::epsilon() && - //abs(intrin->coeffs[4]) < std::numeric_limits::epsilon()); + //return false; + return (abs(intrin->coeffs[0]) < std::numeric_limits::epsilon() && abs(intrin->coeffs[1]) < std::numeric_limits::epsilon() && + abs(intrin->coeffs[2]) < std::numeric_limits::epsilon() && abs(intrin->coeffs[3]) < std::numeric_limits::epsilon() && + abs(intrin->coeffs[4]) < std::numeric_limits::epsilon()); } void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* intrin, const float pixel[2], float depth) BEGIN_API_CALL @@ -4162,7 +4162,6 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i x = (xo - delta_x) * icdist; y = (yo - delta_y) * icdist; } - LOG_ERROR("i=" << i); } if (intrin->model == RS2_DISTORTION_KANNALA_BRANDT4) @@ -4205,8 +4204,8 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i else { - int i = 0; - LOG_ERROR("i=" << i); + //int i = 0; + //LOG_ERROR("i=" << i); } point[0] = depth * x; diff --git a/wrappers/opencv/depth-filter/downsample.cpp b/wrappers/opencv/depth-filter/downsample.cpp index c4938bf2f2..9b158ccca8 100644 --- a/wrappers/opencv/depth-filter/downsample.cpp +++ b/wrappers/opencv/depth-filter/downsample.cpp @@ -5,7 +5,7 @@ #include -#ifdef __SSSE3__ +#ifdef __SSE4__ #include #include #endif @@ -24,7 +24,7 @@ void downsample_min_4x4(const cv::Mat& source, cv::Mat* pDest) const size_t sizeYresized = source.rows / DOWNSAMPLE_FACTOR; -#ifdef __SSSE3__ +#ifdef __SSE4__ __m128i ones = _mm_set1_epi16(1); // Note on multi-threading here, 2018-08-17 From d26ec5c4cc9b2a20c3e49c2e89660690a0a67580 Mon Sep 17 00:00:00 2001 From: gilpazintel Date: Tue, 7 Jan 2025 09:17:05 +0200 Subject: [PATCH 06/10] clean code --- src/proc/align.cpp | 12 ++++++----- src/rs.cpp | 50 ++++++++++++++++++++-------------------------- 2 files changed, 29 insertions(+), 33 deletions(-) diff --git a/src/proc/align.cpp b/src/proc/align.cpp index 8d8441a525..aa78411a64 100644 --- a/src/proc/align.cpp +++ b/src/proc/align.cpp @@ -39,6 +39,7 @@ namespace librealsense void align_images(const rs2_intrinsics& depth_intrin, const rs2_extrinsics& depth_to_other, const rs2_intrinsics& other_intrin, GET_DEPTH get_depth, TRANSFER_PIXEL transfer_pixel) { + auto start = std::chrono::high_resolution_clock::now(); // Iterate over the pixels of the depth image #pragma omp parallel for schedule(dynamic) for (int depth_y = 0; depth_y < depth_intrin.height; ++depth_y) @@ -52,12 +53,9 @@ namespace librealsense // Map the top-left corner of the depth pixel onto the other image float depth_pixel[2] = { depth_x - 0.5f, depth_y - 0.5f }, depth_point[3], other_point[3], other_pixel[2]; - auto start = std::chrono::high_resolution_clock::now(); + rs2_deproject_pixel_to_point(depth_point, &depth_intrin, depth_pixel, depth); - auto end = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(end - start); - // Output the duration - std::cout << duration.count() << "\n"; + rs2_transform_point_to_point(other_point, &depth_to_other, depth_point); rs2_project_point_to_pixel(other_pixel, &other_intrin, other_point); @@ -86,6 +84,10 @@ namespace librealsense } } } + auto end = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(end - start); + // Output the duration + std::cout << duration.count() << "\n"; } align::align(rs2_stream to_stream) : align(to_stream, "Align") diff --git a/src/rs.cpp b/src/rs.cpp index 0bb1b1d144..8fcead23d8 100644 --- a/src/rs.cpp +++ b/src/rs.cpp @@ -4164,31 +4164,6 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i } } - if (intrin->model == RS2_DISTORTION_KANNALA_BRANDT4) - { - float rd = sqrtf(x * x + y * y); - if (rd < FLT_EPSILON) - { - rd = FLT_EPSILON; - } - - float theta = rd; - float theta2 = rd * rd; - for (int i = 0; i < 4; i++) - { - float f = theta * (1 + theta2 * (intrin->coeffs[0] + theta2 * (intrin->coeffs[1] + theta2 * (intrin->coeffs[2] + theta2 * intrin->coeffs[3])))) - rd; - if (fabs(f) < FLT_EPSILON) - { - break; - } - float df = 1 + theta2 * (3 * intrin->coeffs[0] + theta2 * (5 * intrin->coeffs[1] + theta2 * (7 * intrin->coeffs[2] + 9 * theta2 * intrin->coeffs[3]))); - theta -= f / df; - theta2 = theta * theta; - } - float r = tan(theta); - x *= r / rd; - y *= r / rd; - } if (intrin->model == RS2_DISTORTION_FTHETA) { float rd = sqrtf(x * x + y * y); @@ -4201,11 +4176,30 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i y *= r / rd; } } - else + if (intrin->model == RS2_DISTORTION_KANNALA_BRANDT4) { + float rd = sqrtf(x * x + y * y); + if (rd < FLT_EPSILON) + { + rd = FLT_EPSILON; + } - //int i = 0; - //LOG_ERROR("i=" << i); + float theta = rd; + float theta2 = rd * rd; + for (int i = 0; i < 4; i++) + { + float f = theta * (1 + theta2 * (intrin->coeffs[0] + theta2 * (intrin->coeffs[1] + theta2 * (intrin->coeffs[2] + theta2 * intrin->coeffs[3])))) - rd; + if (fabs(f) < FLT_EPSILON) + { + break; + } + float df = 1 + theta2 * (3 * intrin->coeffs[0] + theta2 * (5 * intrin->coeffs[1] + theta2 * (7 * intrin->coeffs[2] + 9 * theta2 * intrin->coeffs[3]))); + theta -= f / df; + theta2 = theta * theta; + } + float r = tan(theta); + x *= r / rd; + y *= r / rd; } point[0] = depth * x; From e338df276f8c5c250a41c5c4a19f99e74f6e32b0 Mon Sep 17 00:00:00 2001 From: gilpazintel Date: Tue, 7 Jan 2025 09:43:23 +0200 Subject: [PATCH 07/10] avoid loop in case intrinsic coefficient are zero --- CMake/lrs_options.cmake | 2 +- examples/measure/rs-measure.cpp | 17 ----------------- src/image-avx.cpp | 2 +- src/image-avx.h | 2 +- src/proc/align.cpp | 13 ++----------- src/proc/color-formats-converter.cpp | 12 ++++++------ src/proc/pointcloud.cpp | 4 ++-- src/proc/sse/sse-align.cpp | 2 +- src/proc/sse/sse-align.h | 4 ++-- src/proc/sse/sse-pointcloud.cpp | 6 +++--- src/proc/y411-converter.cpp | 6 +++--- src/proc/y411-converter.h | 2 +- src/rs.cpp | 18 +----------------- wrappers/opencv/depth-filter/downsample.cpp | 4 ++-- 14 files changed, 26 insertions(+), 68 deletions(-) diff --git a/CMake/lrs_options.cmake b/CMake/lrs_options.cmake index 7cb07c905a..94b25621ac 100644 --- a/CMake/lrs_options.cmake +++ b/CMake/lrs_options.cmake @@ -42,7 +42,7 @@ if (NOT APPLE) else() option(CHECK_FOR_UPDATES "Checks for versions updates" OFF) endif() -option(BUILD_WITH_CPU_EXTENSIONS "Enable compiler optimizations using CPU extensions (such as AVX)" OFF) +option(BUILD_WITH_CPU_EXTENSIONS "Enable compiler optimizations using CPU extensions (such as AVX)" ON) set(UNIT_TESTS_ARGS "" CACHE STRING "Command-line arguments to pass to unit-tests-config.py, e.g. '-t -r '") #Performance improvement with Ubuntu 18/20 if(UNIX AND (NOT ANDROID_NDK_TOOLCHAIN_INCLUDED)) diff --git a/examples/measure/rs-measure.cpp b/examples/measure/rs-measure.cpp index f34fcbcf51..9109c82fd5 100644 --- a/examples/measure/rs-measure.cpp +++ b/examples/measure/rs-measure.cpp @@ -15,7 +15,6 @@ #include #include #include -#include using pixel = std::pair; @@ -101,7 +100,6 @@ void render_simple_distance(const rs2::depth_frame& depth, int main(int argc, char * argv[]) try { - rs2::log_to_console(RS2_LOG_SEVERITY_ERROR); auto settings = rs2::cli( "rs-measure example" ) .process( argc, argv ); @@ -293,24 +291,9 @@ float dist_3d(const rs2::depth_frame& frame, pixel u, pixel v) // Deproject from pixel to point in 3D rs2_intrinsics intr = frame.get_profile().as().get_intrinsics(); // Calibration data - // Get the starting time point - auto start = std::chrono::high_resolution_clock::now(); - rs2_deproject_pixel_to_point(upoint, &intr, upixel, udist); - - // Get the ending time point - auto end = std::chrono::high_resolution_clock::now(); - - // Calculate the elapsed time in milliseconds - auto duration = std::chrono::duration_cast(end - start); - - // Output the elapsed time - LOG_ERROR(duration.count()); - rs2_deproject_pixel_to_point(vpoint, &intr, vpixel, vdist); - - // Calculate euclidean distance between the two points return sqrt(pow(upoint[0] - vpoint[0], 2.f) + pow(upoint[1] - vpoint[1], 2.f) + diff --git a/src/image-avx.cpp b/src/image-avx.cpp index ba3640408a..012c099753 100644 --- a/src/image-avx.cpp +++ b/src/image-avx.cpp @@ -6,7 +6,7 @@ #include "image-avx.h" #ifndef ANDROID - #if defined(__SSE4__) && defined(__AVX2__) + #if defined(__SSSE3__) && defined(__AVX2__) #include // For SSE3 intrinsic used in unpack_yuy2_sse #include diff --git a/src/image-avx.h b/src/image-avx.h index a162681ca3..9772517bf0 100644 --- a/src/image-avx.h +++ b/src/image-avx.h @@ -10,7 +10,7 @@ namespace librealsense { #ifndef ANDROID - #if defined(__SSE4__) && defined(__AVX2__) + #if defined(__SSSE3__) && defined(__AVX2__) void unpack_yuy2_avx_y8(uint8_t * const d[], const uint8_t * s, int n); void unpack_yuy2_avx_y16(uint8_t * const d[], const uint8_t * s, int n); void unpack_yuy2_avx_rgb8(uint8_t * const d[], const uint8_t * s, int n); diff --git a/src/proc/align.cpp b/src/proc/align.cpp index aa78411a64..a1910290cd 100644 --- a/src/proc/align.cpp +++ b/src/proc/align.cpp @@ -13,7 +13,7 @@ #if defined(RS2_USE_CUDA) #include "proc/cuda/cuda-align.h" -#elif defined(__SSE4__) +#elif defined(__SSSE3__) #include "proc/sse/sse-align.h" #endif #include "proc/neon/neon-align.h" @@ -26,7 +26,7 @@ namespace librealsense { #if defined(RS2_USE_CUDA) return std::make_shared(align_to); - #elif defined(__SSE4__) + #elif defined(__SSSE3__) return std::make_shared(align_to); #elif defined(__ARM_NEON) && ! defined(ANDROID) return std::make_shared(align_to); @@ -39,7 +39,6 @@ namespace librealsense void align_images(const rs2_intrinsics& depth_intrin, const rs2_extrinsics& depth_to_other, const rs2_intrinsics& other_intrin, GET_DEPTH get_depth, TRANSFER_PIXEL transfer_pixel) { - auto start = std::chrono::high_resolution_clock::now(); // Iterate over the pixels of the depth image #pragma omp parallel for schedule(dynamic) for (int depth_y = 0; depth_y < depth_intrin.height; ++depth_y) @@ -52,11 +51,7 @@ namespace librealsense { // Map the top-left corner of the depth pixel onto the other image float depth_pixel[2] = { depth_x - 0.5f, depth_y - 0.5f }, depth_point[3], other_point[3], other_pixel[2]; - - rs2_deproject_pixel_to_point(depth_point, &depth_intrin, depth_pixel, depth); - - rs2_transform_point_to_point(other_point, &depth_to_other, depth_point); rs2_project_point_to_pixel(other_pixel, &other_intrin, other_point); const int other_x0 = static_cast(other_pixel[0] + 0.5f); @@ -84,10 +79,6 @@ namespace librealsense } } } - auto end = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(end - start); - // Output the duration - std::cout << duration.count() << "\n"; } align::align(rs2_stream to_stream) : align(to_stream, "Align") diff --git a/src/proc/color-formats-converter.cpp b/src/proc/color-formats-converter.cpp index d80a4964e3..f0a54ba585 100644 --- a/src/proc/color-formats-converter.cpp +++ b/src/proc/color-formats-converter.cpp @@ -14,7 +14,7 @@ #ifdef RS2_USE_CUDA #include "cuda/cuda-conversion.cuh" #endif -#ifdef __SSE4__ +#ifdef __SSSE3__ #include // For SSSE3 intrinsics #endif #include "neon/image-neon.h" @@ -60,7 +60,7 @@ namespace librealsense rscuda::unpack_yuy2_cuda(d, s, n); return; #endif -#if defined __SSE4__ && ! defined ANDROID +#if defined __SSSE3__ && ! defined ANDROID static bool do_avx = has_avx(); #ifdef __AVX2__ @@ -477,7 +477,7 @@ namespace librealsense } } -#if defined __SSE4__ && ! defined ANDROID +#if defined __SSSE3__ && ! defined ANDROID // This method receives 1 line of y and one line of uv. // source_chunks_y // yyyyyyyyyyyyyyyy // source_chunks_uv // uvuvuvuvuvuvuvuv @@ -631,7 +631,7 @@ namespace librealsense auto n = width * height; assert(n % 16 == 0); // All currently supported color resolutions are multiples of 16 pixels. Could easily extend support to other resolutions by copying final n<16 pixels into a zero-padded buffer and recursively calling self for final iteration. -#if defined __SSE4__ && ! defined ANDROID +#if defined __SSSE3__ && ! defined ANDROID static bool do_avx = has_avx(); auto src = reinterpret_cast(s); @@ -753,7 +753,7 @@ namespace librealsense m420_parse_one_line(start_of_second_line, start_of_uv, &dst, width); } return; -#endif // __SSE4__ +#endif // __SSSE3__ } void unpack_yuy2(rs2_format dst_format, rs2_stream dst_stream, uint8_t * const d[], const uint8_t * s, int w, int h, int actual_size) @@ -822,7 +822,7 @@ namespace librealsense { auto n = width * height; assert(n % 16 == 0); // All currently supported color resolutions are multiples of 16 pixels. Could easily extend support to other resolutions by copying final n<16 pixels into a zero-padded buffer and recursively calling self for final iteration. -#ifdef __SSE4__ +#ifdef __SSSE3__ auto src = reinterpret_cast(s); auto dst = reinterpret_cast<__m128i *>(d[0]); for (; n; n -= 16) diff --git a/src/proc/pointcloud.cpp b/src/proc/pointcloud.cpp index 9712c856a2..5b5db2724a 100644 --- a/src/proc/pointcloud.cpp +++ b/src/proc/pointcloud.cpp @@ -19,7 +19,7 @@ #ifdef RS2_USE_CUDA #include "proc/cuda/cuda-pointcloud.h" #endif -#ifdef __SSE4__ +#ifdef __SSSE3__ #include "proc/sse/sse-pointcloud.h" #endif #include "proc/neon/neon-pointcloud.h" @@ -397,7 +397,7 @@ namespace librealsense { #ifdef RS2_USE_CUDA return std::make_shared(); - #elif defined(__SSE4__) + #elif defined(__SSSE3__) return std::make_shared(); #elif defined(__ARM_NEON) && ! defined ANDROID return std::make_shared(); diff --git a/src/proc/sse/sse-align.cpp b/src/proc/sse/sse-align.cpp index 42b6adeb4a..904cb8df5c 100644 --- a/src/proc/sse/sse-align.cpp +++ b/src/proc/sse/sse-align.cpp @@ -1,6 +1,6 @@ // License: Apache 2.0. See LICENSE file in root directory. // Copyright(c) 2019 Intel Corporation. All Rights Reserved. -#ifdef __SSE4__ +#ifdef __SSSE3__ #include "sse-align.h" #include // For SSE3 intrinsic used in unpack_yuy2_sse diff --git a/src/proc/sse/sse-align.h b/src/proc/sse/sse-align.h index d1b4e3107c..133dcde362 100644 --- a/src/proc/sse/sse-align.h +++ b/src/proc/sse/sse-align.h @@ -1,7 +1,7 @@ // License: Apache 2.0. See LICENSE file in root directory. // Copyright(c) 2024 Intel Corporation. All Rights Reserved. #pragma once -#ifdef __SSE4__ +#ifdef __SSSE3__ #include "proc/align.h" #include @@ -87,4 +87,4 @@ namespace librealsense std::shared_ptr _stream_transform; }; } -#endif // __SSE4__ +#endif // __SSSE3__ diff --git a/src/proc/sse/sse-pointcloud.cpp b/src/proc/sse/sse-pointcloud.cpp index 5aa658261d..f2f6e63e9e 100644 --- a/src/proc/sse/sse-pointcloud.cpp +++ b/src/proc/sse/sse-pointcloud.cpp @@ -11,7 +11,7 @@ #include -#ifdef __SSE4__ +#ifdef __SSSE3__ #include // For SSSE3 intrinsics @@ -56,7 +56,7 @@ namespace librealsense const rs2_intrinsics &depth_intrinsics, const rs2::depth_frame& depth_frame) { -#ifdef __SSE4__ +#ifdef __SSSE3__ auto depth_image = (const uint16_t*)depth_frame.get_data(); @@ -145,7 +145,7 @@ namespace librealsense { auto tex_ptr = texture_map; -#ifdef __SSE4__ +#ifdef __SSSE3__ auto point = reinterpret_cast(points); auto res = reinterpret_cast(tex_ptr); auto res1 = reinterpret_cast(pixels_ptr); diff --git a/src/proc/y411-converter.cpp b/src/proc/y411-converter.cpp index 1ace0b1035..725a9149bc 100644 --- a/src/proc/y411-converter.cpp +++ b/src/proc/y411-converter.cpp @@ -6,7 +6,7 @@ #ifdef RS2_USE_CUDA #include "cuda/cuda-conversion.cuh" #endif -#ifdef __SSE4__ +#ifdef __SSSE3__ #include // For SSSE3 intrinsics #endif @@ -44,7 +44,7 @@ namespace librealsense // See https://www.fourcc.org/pixel-format/yuv-y411/ // -#if defined __SSE4__ && ! defined ANDROID +#if defined __SSSE3__ && ! defined ANDROID void unpack_y411_sse( uint8_t * const dest, const uint8_t * const s, int w, int h, int actual_size) { auto n = w * h; @@ -297,7 +297,7 @@ namespace librealsense // The size of the frame must be bigger than 4 pixels and product of 32 void unpack_y411( uint8_t * const dest[], const uint8_t * const s, int w, int h, int actual_size ) { -#if defined __SSE4__ && ! defined ANDROID +#if defined __SSSE3__ && ! defined ANDROID unpack_y411_sse(dest[0], s, w, h, actual_size); #else unpack_y411_native(dest[0], s, w, h, actual_size); diff --git a/src/proc/y411-converter.h b/src/proc/y411-converter.h index 630b34a8d3..0c793f74a0 100644 --- a/src/proc/y411-converter.h +++ b/src/proc/y411-converter.h @@ -24,7 +24,7 @@ namespace librealsense void unpack_y411( uint8_t * const dest[], const uint8_t * const s, int w, int h, int actual_size); -#if defined __SSE4__ && ! defined ANDROID +#if defined __SSSE3__ && ! defined ANDROID void unpack_y411_sse( uint8_t * const dest, const uint8_t * const s, int w, int h, int actual_size); #endif diff --git a/src/rs.cpp b/src/rs.cpp index 8fcead23d8..07645f7927 100644 --- a/src/rs.cpp +++ b/src/rs.cpp @@ -64,11 +64,6 @@ #include #include -#include -#include -#include // For std::this_thread::sleep_for -#include // For random number generation - //////////////////////// // API implementation // //////////////////////// @@ -4113,7 +4108,6 @@ NOEXCEPT_RETURN(, pixel) /* Helper inner function (not part of the API) */ inline bool is_intrinsics_distortion_zero(const struct rs2_intrinsics* intrin) { - //return false; return (abs(intrin->coeffs[0]) < std::numeric_limits::epsilon() && abs(intrin->coeffs[1]) < std::numeric_limits::epsilon() && abs(intrin->coeffs[2]) < std::numeric_limits::epsilon() && abs(intrin->coeffs[3]) < std::numeric_limits::epsilon() && abs(intrin->coeffs[4]) < std::numeric_limits::epsilon()); @@ -4150,10 +4144,9 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i } if (intrin->model == RS2_DISTORTION_BROWN_CONRADY) { - int i = 0; // need to loop until convergence // 10 iterations determined empirically - for (; i < 10; i++) + for (int i = 0; i < 10; i++) { float r2 = x * x + y * y; float icdist = (float)1 / (float)(1 + ((intrin->coeffs[4] * r2 + intrin->coeffs[1]) * r2 + intrin->coeffs[0]) * r2); @@ -4205,15 +4198,6 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i point[0] = depth * x; point[1] = depth * y; point[2] = depth; - - // Get the ending time point - //auto end = std::chrono::high_resolution_clock::now(); - - // Calculate the elapsed time in milliseconds - //auto duration = std::chrono::duration_cast(end - start); - - // Output the elapsed time - //std::cout << duration.count() << std::endl; } NOEXCEPT_RETURN(, point) diff --git a/wrappers/opencv/depth-filter/downsample.cpp b/wrappers/opencv/depth-filter/downsample.cpp index 9b158ccca8..c4938bf2f2 100644 --- a/wrappers/opencv/depth-filter/downsample.cpp +++ b/wrappers/opencv/depth-filter/downsample.cpp @@ -5,7 +5,7 @@ #include -#ifdef __SSE4__ +#ifdef __SSSE3__ #include #include #endif @@ -24,7 +24,7 @@ void downsample_min_4x4(const cv::Mat& source, cv::Mat* pDest) const size_t sizeYresized = source.rows / DOWNSAMPLE_FACTOR; -#ifdef __SSE4__ +#ifdef __SSSE3__ __m128i ones = _mm_set1_epi16(1); // Note on multi-threading here, 2018-08-17 From 5e1a37d06a7f30e2aee9bc711354f52162b2d6cd Mon Sep 17 00:00:00 2001 From: gilpazintel Date: Tue, 7 Jan 2025 10:55:38 +0200 Subject: [PATCH 08/10] in case coeff[0] is zero, avoid division by zero --- src/rs.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/rs.cpp b/src/rs.cpp index 07645f7927..d28c099be3 100644 --- a/src/rs.cpp +++ b/src/rs.cpp @@ -4157,17 +4157,6 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i } } - if (intrin->model == RS2_DISTORTION_FTHETA) - { - float rd = sqrtf(x * x + y * y); - if (rd < FLT_EPSILON) - { - rd = FLT_EPSILON; - } - float r = (float)(tan(intrin->coeffs[0] * rd) / atan(2 * tan(intrin->coeffs[0] / 2.0f))); - x *= r / rd; - y *= r / rd; - } } if (intrin->model == RS2_DISTORTION_KANNALA_BRANDT4) { @@ -4194,6 +4183,17 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i x *= r / rd; y *= r / rd; } + if (intrin->model == RS2_DISTORTION_FTHETA) + { + float rd = sqrtf(x * x + y * y); + if (rd < FLT_EPSILON) + { + rd = FLT_EPSILON; + } + float r = (intrin->coeffs[0] < FLT_EPSILON) ? 0.0f : (float)(tan(intrin->coeffs[0] * rd) / atan(2 * tan(intrin->coeffs[0] / 2.0f))); + x *= r / rd; + y *= r / rd; + } point[0] = depth * x; point[1] = depth * y; From 2308519eeeb988b7a69943337c26c3000968f7b8 Mon Sep 17 00:00:00 2001 From: gilpazintel Date: Tue, 7 Jan 2025 13:34:32 +0200 Subject: [PATCH 09/10] add fabs to protect from negative numbers --- src/rs.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rs.cpp b/src/rs.cpp index d28c099be3..84898d51c1 100644 --- a/src/rs.cpp +++ b/src/rs.cpp @@ -4190,7 +4190,7 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i { rd = FLT_EPSILON; } - float r = (intrin->coeffs[0] < FLT_EPSILON) ? 0.0f : (float)(tan(intrin->coeffs[0] * rd) / atan(2 * tan(intrin->coeffs[0] / 2.0f))); + float r = (fabs(intrin->coeffs[0]) < FLT_EPSILON) ? 0.0f : (float)(tan(intrin->coeffs[0] * rd) / atan(2 * tan(intrin->coeffs[0] / 2.0f))); x *= r / rd; y *= r / rd; } From e2a18b223c1956d18c95690db736f62ffdfebfab Mon Sep 17 00:00:00 2001 From: gilpazintel Date: Tue, 7 Jan 2025 18:49:13 +0200 Subject: [PATCH 10/10] changed abs to std::fabs, also changed double numeric limit to float numeric limit --- src/rs.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/rs.cpp b/src/rs.cpp index 84898d51c1..d274bdcd7e 100644 --- a/src/rs.cpp +++ b/src/rs.cpp @@ -4108,9 +4108,9 @@ NOEXCEPT_RETURN(, pixel) /* Helper inner function (not part of the API) */ inline bool is_intrinsics_distortion_zero(const struct rs2_intrinsics* intrin) { - return (abs(intrin->coeffs[0]) < std::numeric_limits::epsilon() && abs(intrin->coeffs[1]) < std::numeric_limits::epsilon() && - abs(intrin->coeffs[2]) < std::numeric_limits::epsilon() && abs(intrin->coeffs[3]) < std::numeric_limits::epsilon() && - abs(intrin->coeffs[4]) < std::numeric_limits::epsilon()); + return (std::fabs(intrin->coeffs[0]) < FLT_EPSILON && std::fabs(intrin->coeffs[1]) < FLT_EPSILON && + std::fabs(intrin->coeffs[2]) < FLT_EPSILON && std::fabs(intrin->coeffs[3]) < FLT_EPSILON && + std::fabs(intrin->coeffs[4]) < FLT_EPSILON); } void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* intrin, const float pixel[2], float depth) BEGIN_API_CALL @@ -4171,7 +4171,7 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i for (int i = 0; i < 4; i++) { float f = theta * (1 + theta2 * (intrin->coeffs[0] + theta2 * (intrin->coeffs[1] + theta2 * (intrin->coeffs[2] + theta2 * intrin->coeffs[3])))) - rd; - if (fabs(f) < FLT_EPSILON) + if (std::fabs(f) < FLT_EPSILON) { break; } @@ -4190,7 +4190,7 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i { rd = FLT_EPSILON; } - float r = (fabs(intrin->coeffs[0]) < FLT_EPSILON) ? 0.0f : (float)(tan(intrin->coeffs[0] * rd) / atan(2 * tan(intrin->coeffs[0] / 2.0f))); + float r = (std::fabs(intrin->coeffs[0]) < FLT_EPSILON) ? 0.0f : (float)(tan(intrin->coeffs[0] * rd) / atan(2 * tan(intrin->coeffs[0] / 2.0f))); x *= r / rd; y *= r / rd; }