From 210c2e9b8a080df99c0095291f17ecd29bd0b592 Mon Sep 17 00:00:00 2001
From: gilpazintel <gil.paz@intel.com>
Date: Tue, 24 Dec 2024 13:38:39 +0200
Subject: [PATCH 01/10] ff

---
 src/rs.cpp | 93 +++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 67 insertions(+), 26 deletions(-)
diff --git a/src/rs.cpp b/src/rs.cpp
index 6add8361f1..67a1756677 100644
--- a/src/rs.cpp
+++ b/src/rs.cpp
@@ -64,6 +64,10 @@
 #include <src/core/time-service.h>
 #include <rsutils/string/from.h>
 
+#include <iostream>
+#include <chrono>
+#include <thread> // For std::this_thread::sleep_for
+
 ////////////////////////
 // API implementation //
 ////////////////////////
@@ -4106,6 +4110,19 @@ void rs2_project_point_to_pixel(float pixel[2], const struct rs2_intrinsics* int
 }
 NOEXCEPT_RETURN(, pixel)
 
+/* return true in case all intrinsics distortion coefficient are zero.
+   otherwise, return false.*/
+const int DISTORTION_COEFF = 5;
+inline bool is_intrinsics_distortion_zero(const struct rs2_intrinsics* intrin)
+{
+    for(int i = 0;i < DISTORTION_COEFF;i++)
+    {
+        if (abs(intrin->coeffs[i]) > std::numeric_limits<double>::epsilon())
+            return false;
+    }
+    return true;
+}
+
 void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* intrin, const float pixel[2], float depth) BEGIN_API_CALL
 {
     assert(intrin->model != RS2_DISTORTION_MODIFIED_BROWN_CONRADY); // Cannot deproject from a forward-distorted image
@@ -4117,39 +4134,53 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i
     float xo = x;
     float yo = y;
 
-    if (intrin->model == RS2_DISTORTION_INVERSE_BROWN_CONRADY)
+    // Get the starting time point
+    auto start = std::chrono::high_resolution_clock::now();
+
+    std::cout << "rs2_deproject_pixel_to_point start" << std::endl;
+
+    if(!is_intrinsics_distortion_zero(intrin))
     {
-        // need to loop until convergence 
-        // 10 iterations determined empirically
-        for (int i = 0; i < 10; i++)
+        std::cout << "rs2_deproject_pixel_to_point 1" << std::endl;
+         
+
+        if (intrin->model == RS2_DISTORTION_INVERSE_BROWN_CONRADY)
         {
-            float r2 = x * x + y * y;
-            float icdist = (float)1 / (float)(1 + ((intrin->coeffs[4] * r2 + intrin->coeffs[1]) * r2 + intrin->coeffs[0]) * r2);
-            float xq = x / icdist;
-            float yq = y / icdist;
-            float delta_x = 2 * intrin->coeffs[2] * xq * yq + intrin->coeffs[3] * (r2 + 2 * xq * xq);
-            float delta_y = 2 * intrin->coeffs[3] * xq * yq + intrin->coeffs[2] * (r2 + 2 * yq * yq);
-            x = (xo - delta_x) * icdist;
-            y = (yo - delta_y) * icdist;
+            std::cout << "rs2_deproject_pixel_to_point 2" << std::endl;
+            // need to loop until convergence 
+            // 10 iterations determined empirically
+            for (int i = 0; i < 10; i++)
+            {
+                float r2 = x * x + y * y;
+                float icdist = (float)1 / (float)(1 + ((intrin->coeffs[4] * r2 + intrin->coeffs[1]) * r2 + intrin->coeffs[0]) * r2);
+                float xq = x / icdist;
+                float yq = y / icdist;
+                float delta_x = 2 * intrin->coeffs[2] * xq * yq + intrin->coeffs[3] * (r2 + 2 * xq * xq);
+                float delta_y = 2 * intrin->coeffs[3] * xq * yq + intrin->coeffs[2] * (r2 + 2 * yq * yq);
+                x = (xo - delta_x) * icdist;
+                y = (yo - delta_y) * icdist;
+            }
+
         }
-    }
-    if (intrin->model == RS2_DISTORTION_BROWN_CONRADY)
-    {
-        // need to loop until convergence 
-        // 10 iterations determined empirically
-        for (int i = 0; i < 10; i++)
+        if (intrin->model == RS2_DISTORTION_BROWN_CONRADY)
         {
-            float r2 = x * x + y * y;
-            float icdist = (float)1 / (float)(1 + ((intrin->coeffs[4] * r2 + intrin->coeffs[1]) * r2 + intrin->coeffs[0]) * r2);
-            float delta_x = 2 * intrin->coeffs[2] * x * y + intrin->coeffs[3] * (r2 + 2 * x * x);
-            float delta_y = 2 * intrin->coeffs[3] * x * y + intrin->coeffs[2] * (r2 + 2 * y * y);
-            x = (xo - delta_x) * icdist;
-            y = (yo - delta_y) * icdist;
+            std::cout << "rs2_deproject_pixel_to_point 3" << std::endl;
+            // need to loop until convergence 
+            // 10 iterations determined empirically
+            for (int i = 0; i < 10; i++)
+            {
+                float r2 = x * x + y * y;
+                float icdist = (float)1 / (float)(1 + ((intrin->coeffs[4] * r2 + intrin->coeffs[1]) * r2 + intrin->coeffs[0]) * r2);
+                float delta_x = 2 * intrin->coeffs[2] * x * y + intrin->coeffs[3] * (r2 + 2 * x * x);
+                float delta_y = 2 * intrin->coeffs[3] * x * y + intrin->coeffs[2] * (r2 + 2 * y * y);
+                x = (xo - delta_x) * icdist;
+                y = (yo - delta_y) * icdist;
+            }
         }
-
-    }
+    }    
     if (intrin->model == RS2_DISTORTION_KANNALA_BRANDT4)
     {
+        std::cout << "rs2_deproject_pixel_to_point 4" << std::endl;
         float rd = sqrtf(x * x + y * y);
         if (rd < FLT_EPSILON)
         {
@@ -4175,6 +4206,7 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i
     }
     if (intrin->model == RS2_DISTORTION_FTHETA)
     {
+        std::cout << "rs2_deproject_pixel_to_point 5" << std::endl;
         float rd = sqrtf(x * x + y * y);
         if (rd < FLT_EPSILON)
         {
@@ -4188,6 +4220,15 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i
     point[0] = depth * x;
     point[1] = depth * y;
     point[2] = depth;
+
+    // Get the ending time point
+    auto end = std::chrono::high_resolution_clock::now();
+
+    // Calculate the elapsed time in milliseconds
+    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+
+    // Output the elapsed time
+    std::cout << "rs2_deproject_pixel_to_point Elapsed time: " << duration.count() << " milliseconds" << std::endl;
 }
 NOEXCEPT_RETURN(, point)
 

From 89b95f87ec5a98159492e67559de2726fea5e065 Mon Sep 17 00:00:00 2001
From: gilpazintel <gil.paz@intel.com>
Date: Tue, 24 Dec 2024 16:04:20 +0200
Subject: [PATCH 02/10] before trying the exact code of the submitter

---
 examples/measure/rs-measure.cpp | 17 ++++++++++++++
 src/rs.cpp                      | 39 +++++++++++++++++++++------------
 2 files changed, 42 insertions(+), 14 deletions(-)

diff --git a/examples/measure/rs-measure.cpp b/examples/measure/rs-measure.cpp
index 9109c82fd5..3de67dda88 100644
--- a/examples/measure/rs-measure.cpp
+++ b/examples/measure/rs-measure.cpp
@@ -15,6 +15,7 @@
 #include <thread>
 #include <atomic>
 #include <mutex>
+#include <rsutils/easylogging/easyloggingpp.h>
 
 using pixel = std::pair<int, int>;
 
@@ -100,6 +101,7 @@ void render_simple_distance(const rs2::depth_frame& depth,
 
 int main(int argc, char * argv[]) try
 {
+    rs2::log_to_console(RS2_LOG_SEVERITY_ERROR);
     auto settings = rs2::cli( "rs-measure example" )
         .process( argc, argv );
 
@@ -291,9 +293,24 @@ float dist_3d(const rs2::depth_frame& frame, pixel u, pixel v)
 
     // Deproject from pixel to point in 3D
     rs2_intrinsics intr = frame.get_profile().as<rs2::video_stream_profile>().get_intrinsics(); // Calibration data
+    // Get the starting time point
+    auto start = std::chrono::high_resolution_clock::now();
+
     rs2_deproject_pixel_to_point(upoint, &intr, upixel, udist);
+
+    // Get the ending time point
+    auto end = std::chrono::high_resolution_clock::now();
+
+    // Calculate the elapsed time in milliseconds
+    auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);
+
+    // Output the elapsed time
+    LOG_ERROR(duration.count());
+
     rs2_deproject_pixel_to_point(vpoint, &intr, vpixel, vdist);
 
+    
+
     // Calculate euclidean distance between the two points
     return sqrt(pow(upoint[0] - vpoint[0], 2.f) +
                 pow(upoint[1] - vpoint[1], 2.f) +
diff --git a/src/rs.cpp b/src/rs.cpp
index 67a1756677..f48e061620 100644
--- a/src/rs.cpp
+++ b/src/rs.cpp
@@ -67,6 +67,7 @@
 #include <iostream>
 #include <chrono>
 #include <thread> // For std::this_thread::sleep_for
+#include <random>  // For random number generation
 
 ////////////////////////
 // API implementation //
@@ -4113,14 +4114,22 @@ NOEXCEPT_RETURN(, pixel)
 /* return true in case all intrinsics distortion coefficient are zero.
    otherwise, return false.*/
 const int DISTORTION_COEFF = 5;
-inline bool is_intrinsics_distortion_zero(const struct rs2_intrinsics* intrin)
+/*inline bool is_intrinsics_distortion_zero(const struct rs2_intrinsics* intrin)
 {
     for(int i = 0;i < DISTORTION_COEFF;i++)
     {
         if (abs(intrin->coeffs[i]) > std::numeric_limits<double>::epsilon())
             return false;
     }
-    return true;
+    //return true;
+    return false;
+}*/
+
+inline bool is_intrinsics_distortion_zero(const struct rs2_intrinsics* intrin)
+{
+    return (abs(intrin->coeffs[0]) < std::numeric_limits<double>::epsilon() && abs(intrin->coeffs[1]) < std::numeric_limits<double>::epsilon() &&
+        abs(intrin->coeffs[2]) < std::numeric_limits<double>::epsilon() && abs(intrin->coeffs[3]) < std::numeric_limits<double>::epsilon() &&
+        abs(intrin->coeffs[4]) < std::numeric_limits<double>::epsilon());
 }
 
 void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* intrin, const float pixel[2], float depth) BEGIN_API_CALL
@@ -4135,18 +4144,23 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i
     float yo = y;
 
     // Get the starting time point
-    auto start = std::chrono::high_resolution_clock::now();
+    //auto start = std::chrono::high_resolution_clock::now();
 
-    std::cout << "rs2_deproject_pixel_to_point start" << std::endl;
+    // Use const_cast to remove the const qualifier
+    //rs2_intrinsics* non_const_intrin = const_cast<rs2_intrinsics*>(intrin);
+
+    //non_const_intrin->coeffs[0] = 0.01;
+    //non_const_intrin->coeffs[1] = 0.02;
+    //non_const_intrin->coeffs[2] = 0.03;
+    //non_const_intrin->coeffs[3] = 0.04;
+    //non_const_intrin->coeffs[4] = 0.05;
+        
 
     if(!is_intrinsics_distortion_zero(intrin))
-    {
-        std::cout << "rs2_deproject_pixel_to_point 1" << std::endl;
-         
+    {        
 
         if (intrin->model == RS2_DISTORTION_INVERSE_BROWN_CONRADY)
         {
-            std::cout << "rs2_deproject_pixel_to_point 2" << std::endl;
             // need to loop until convergence 
             // 10 iterations determined empirically
             for (int i = 0; i < 10; i++)
@@ -4164,7 +4178,6 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i
         }
         if (intrin->model == RS2_DISTORTION_BROWN_CONRADY)
         {
-            std::cout << "rs2_deproject_pixel_to_point 3" << std::endl;
             // need to loop until convergence 
             // 10 iterations determined empirically
             for (int i = 0; i < 10; i++)
@@ -4180,7 +4193,6 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i
     }    
     if (intrin->model == RS2_DISTORTION_KANNALA_BRANDT4)
     {
-        std::cout << "rs2_deproject_pixel_to_point 4" << std::endl;
         float rd = sqrtf(x * x + y * y);
         if (rd < FLT_EPSILON)
         {
@@ -4206,7 +4218,6 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i
     }
     if (intrin->model == RS2_DISTORTION_FTHETA)
     {
-        std::cout << "rs2_deproject_pixel_to_point 5" << std::endl;
         float rd = sqrtf(x * x + y * y);
         if (rd < FLT_EPSILON)
         {
@@ -4222,13 +4233,13 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i
     point[2] = depth;
 
     // Get the ending time point
-    auto end = std::chrono::high_resolution_clock::now();
+    //auto end = std::chrono::high_resolution_clock::now();
 
     // Calculate the elapsed time in milliseconds
-    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
+    //auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);
 
     // Output the elapsed time
-    std::cout << "rs2_deproject_pixel_to_point Elapsed time: " << duration.count() << " milliseconds" << std::endl;
+    //std::cout << duration.count()  << std::endl;
 }
 NOEXCEPT_RETURN(, point)
 

From 4183a3f4cb9246e3cb0adce8a901ef15cfa0b36b Mon Sep 17 00:00:00 2001
From: gilpazintel <gil.paz@intel.com>
Date: Tue, 24 Dec 2024 16:43:29 +0200
Subject: [PATCH 03/10] after applying the exact code from Wen - still I do not
 see an improvement rather a decrease in performance.

---
 src/rs.cpp | 101 +++++++++++++++++++----------------------------------
 1 file changed, 36 insertions(+), 65 deletions(-)

diff --git a/src/rs.cpp b/src/rs.cpp
index f48e061620..0876d055ed 100644
--- a/src/rs.cpp
+++ b/src/rs.cpp
@@ -4110,21 +4110,7 @@ void rs2_project_point_to_pixel(float pixel[2], const struct rs2_intrinsics* int
     pixel[1] = y * intrin->fy + intrin->ppy;
 }
 NOEXCEPT_RETURN(, pixel)
-
-/* return true in case all intrinsics distortion coefficient are zero.
-   otherwise, return false.*/
-const int DISTORTION_COEFF = 5;
-/*inline bool is_intrinsics_distortion_zero(const struct rs2_intrinsics* intrin)
-{
-    for(int i = 0;i < DISTORTION_COEFF;i++)
-    {
-        if (abs(intrin->coeffs[i]) > std::numeric_limits<double>::epsilon())
-            return false;
-    }
-    //return true;
-    return false;
-}*/
-
+/* Helper inner function (not part of the API) */
 inline bool is_intrinsics_distortion_zero(const struct rs2_intrinsics* intrin)
 {
     return (abs(intrin->coeffs[0]) < std::numeric_limits<double>::epsilon() && abs(intrin->coeffs[1]) < std::numeric_limits<double>::epsilon() &&
@@ -4135,33 +4121,18 @@ inline bool is_intrinsics_distortion_zero(const struct rs2_intrinsics* intrin)
 void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* intrin, const float pixel[2], float depth) BEGIN_API_CALL
 {
     assert(intrin->model != RS2_DISTORTION_MODIFIED_BROWN_CONRADY); // Cannot deproject from a forward-distorted image
-    //assert(intrin->model != RS2_DISTORTION_BROWN_CONRADY); // Cannot deproject to an brown conrady model
-
+ 
     float x = (pixel[0] - intrin->ppx) / intrin->fx;
     float y = (pixel[1] - intrin->ppy) / intrin->fy;
 
     float xo = x;
     float yo = y;
 
-    // Get the starting time point
-    //auto start = std::chrono::high_resolution_clock::now();
-
-    // Use const_cast to remove the const qualifier
-    //rs2_intrinsics* non_const_intrin = const_cast<rs2_intrinsics*>(intrin);
-
-    //non_const_intrin->coeffs[0] = 0.01;
-    //non_const_intrin->coeffs[1] = 0.02;
-    //non_const_intrin->coeffs[2] = 0.03;
-    //non_const_intrin->coeffs[3] = 0.04;
-    //non_const_intrin->coeffs[4] = 0.05;
-        
-
-    if(!is_intrinsics_distortion_zero(intrin))
-    {        
-
+    if (!is_intrinsics_distortion_zero(intrin))
+    {
         if (intrin->model == RS2_DISTORTION_INVERSE_BROWN_CONRADY)
         {
-            // need to loop until convergence 
+            // need to loop until convergence
             // 10 iterations determined empirically
             for (int i = 0; i < 10; i++)
             {
@@ -4174,11 +4145,10 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i
                 x = (xo - delta_x) * icdist;
                 y = (yo - delta_y) * icdist;
             }
-
         }
         if (intrin->model == RS2_DISTORTION_BROWN_CONRADY)
         {
-            // need to loop until convergence 
+            // need to loop until convergence
             // 10 iterations determined empirically
             for (int i = 0; i < 10; i++)
             {
@@ -4189,43 +4159,44 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i
                 x = (xo - delta_x) * icdist;
                 y = (yo - delta_y) * icdist;
             }
+
         }
-    }    
-    if (intrin->model == RS2_DISTORTION_KANNALA_BRANDT4)
-    {
-        float rd = sqrtf(x * x + y * y);
-        if (rd < FLT_EPSILON)
+        if (intrin->model == RS2_DISTORTION_KANNALA_BRANDT4)
         {
-            rd = FLT_EPSILON;
-        }
+            float rd = sqrtf(x * x + y * y);
+            if (rd < FLT_EPSILON)
+            {
+                rd = FLT_EPSILON;
+            }
 
-        float theta = rd;
-        float theta2 = rd * rd;
-        for (int i = 0; i < 4; i++)
-        {
-            float f = theta * (1 + theta2 * (intrin->coeffs[0] + theta2 * (intrin->coeffs[1] + theta2 * (intrin->coeffs[2] + theta2 * intrin->coeffs[3])))) - rd;
-            if (fabs(f) < FLT_EPSILON)
+            float theta = rd;
+            float theta2 = rd * rd;
+            for (int i = 0; i < 4; i++)
             {
-                break;
+                float f = theta * (1 + theta2 * (intrin->coeffs[0] + theta2 * (intrin->coeffs[1] + theta2 * (intrin->coeffs[2] + theta2 * intrin->coeffs[3])))) - rd;
+                if (fabs(f) < FLT_EPSILON)
+                {
+                    break;
+                }
+                float df = 1 + theta2 * (3 * intrin->coeffs[0] + theta2 * (5 * intrin->coeffs[1] + theta2 * (7 * intrin->coeffs[2] + 9 * theta2 * intrin->coeffs[3])));
+                theta -= f / df;
+                theta2 = theta * theta;
             }
-            float df = 1 + theta2 * (3 * intrin->coeffs[0] + theta2 * (5 * intrin->coeffs[1] + theta2 * (7 * intrin->coeffs[2] + 9 * theta2 * intrin->coeffs[3])));
-            theta -= f / df;
-            theta2 = theta * theta;
+            float r = tan(theta);
+            x *= r / rd;
+            y *= r / rd;
         }
-        float r = tan(theta);
-        x *= r / rd;
-        y *= r / rd;
-    }
-    if (intrin->model == RS2_DISTORTION_FTHETA)
-    {
-        float rd = sqrtf(x * x + y * y);
-        if (rd < FLT_EPSILON)
+        if (intrin->model == RS2_DISTORTION_FTHETA)
         {
-            rd = FLT_EPSILON;
+            float rd = sqrtf(x * x + y * y);
+            if (rd < FLT_EPSILON)
+            {
+                rd = FLT_EPSILON;
+            }
+            float r = (float)(tan(intrin->coeffs[0] * rd) / atan(2 * tan(intrin->coeffs[0] / 2.0f)));
+            x *= r / rd;
+            y *= r / rd;
         }
-        float r = (float)(tan(intrin->coeffs[0] * rd) / atan(2 * tan(intrin->coeffs[0] / 2.0f)));
-        x *= r / rd;
-        y *= r / rd;
     }
 
     point[0] = depth * x;

From ec6fbb5823801b9b2530156a17a2ddbd92a66bda Mon Sep 17 00:00:00 2001
From: gilpazintel <gil.paz@intel.com>
Date: Wed, 1 Jan 2025 10:46:07 +0200
Subject: [PATCH 04/10] forcing the code to go though the loop

---
 examples/measure/rs-measure.cpp |  2 +-
 src/rs.cpp                      | 18 ++++++++++++++----
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/examples/measure/rs-measure.cpp b/examples/measure/rs-measure.cpp
index 3de67dda88..f34fcbcf51 100644
--- a/examples/measure/rs-measure.cpp
+++ b/examples/measure/rs-measure.cpp
@@ -302,7 +302,7 @@ float dist_3d(const rs2::depth_frame& frame, pixel u, pixel v)
     auto end = std::chrono::high_resolution_clock::now();
 
     // Calculate the elapsed time in milliseconds
-    auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);
+    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
 
     // Output the elapsed time
     LOG_ERROR(duration.count());
diff --git a/src/rs.cpp b/src/rs.cpp
index 0876d055ed..cabc5ef68e 100644
--- a/src/rs.cpp
+++ b/src/rs.cpp
@@ -4113,9 +4113,10 @@ NOEXCEPT_RETURN(, pixel)
 /* Helper inner function (not part of the API) */
 inline bool is_intrinsics_distortion_zero(const struct rs2_intrinsics* intrin)
 {
-    return (abs(intrin->coeffs[0]) < std::numeric_limits<double>::epsilon() && abs(intrin->coeffs[1]) < std::numeric_limits<double>::epsilon() &&
-        abs(intrin->coeffs[2]) < std::numeric_limits<double>::epsilon() && abs(intrin->coeffs[3]) < std::numeric_limits<double>::epsilon() &&
-        abs(intrin->coeffs[4]) < std::numeric_limits<double>::epsilon());
+    return true;
+    //return (abs(intrin->coeffs[0]) < std::numeric_limits<double>::epsilon() && abs(intrin->coeffs[1]) < std::numeric_limits<double>::epsilon() &&
+      //  abs(intrin->coeffs[2]) < std::numeric_limits<double>::epsilon() && abs(intrin->coeffs[3]) < std::numeric_limits<double>::epsilon() &&
+        //abs(intrin->coeffs[4]) < std::numeric_limits<double>::epsilon());
 }
 
 void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* intrin, const float pixel[2], float depth) BEGIN_API_CALL
@@ -4127,6 +4128,7 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i
 
     float xo = x;
     float yo = y;
+    
 
     if (!is_intrinsics_distortion_zero(intrin))
     {
@@ -4148,9 +4150,10 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i
         }
         if (intrin->model == RS2_DISTORTION_BROWN_CONRADY)
         {
+            int i = 0;
             // need to loop until convergence
             // 10 iterations determined empirically
-            for (int i = 0; i < 10; i++)
+            for (; i < 10; i++)
             {
                 float r2 = x * x + y * y;
                 float icdist = (float)1 / (float)(1 + ((intrin->coeffs[4] * r2 + intrin->coeffs[1]) * r2 + intrin->coeffs[0]) * r2);
@@ -4159,6 +4162,7 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i
                 x = (xo - delta_x) * icdist;
                 y = (yo - delta_y) * icdist;
             }
+            LOG_ERROR("i=" << i);
 
         }
         if (intrin->model == RS2_DISTORTION_KANNALA_BRANDT4)
@@ -4198,6 +4202,12 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i
             y *= r / rd;
         }
     }
+    else
+    {
+
+        int i = 0;
+        LOG_ERROR("i=" << i);
+    }
 
     point[0] = depth * x;
     point[1] = depth * y;

From fddfd7c250bef9f15021810bbdcac6cc802ebe60 Mon Sep 17 00:00:00 2001
From: gilpazintel <gil.paz@intel.com>
Date: Mon, 6 Jan 2025 13:40:47 +0200
Subject: [PATCH 05/10] disable BUILD_WITH_CPU_EXTENSIONS SSE disable

---
 CMake/lrs_options.cmake                     |  2 +-
 src/image-avx.cpp                           |  2 +-
 src/image-avx.h                             |  2 +-
 src/proc/align.cpp                          | 11 +++++++++--
 src/proc/color-formats-converter.cpp        | 12 ++++++------
 src/proc/pointcloud.cpp                     |  4 ++--
 src/proc/sse/sse-align.cpp                  |  2 +-
 src/proc/sse/sse-align.h                    |  4 ++--
 src/proc/sse/sse-pointcloud.cpp             |  6 +++---
 src/proc/y411-converter.cpp                 |  6 +++---
 src/proc/y411-converter.h                   |  2 +-
 src/rs.cpp                                  | 13 ++++++-------
 wrappers/opencv/depth-filter/downsample.cpp |  4 ++--
 13 files changed, 38 insertions(+), 32 deletions(-)

diff --git a/CMake/lrs_options.cmake b/CMake/lrs_options.cmake
index 94b25621ac..7cb07c905a 100644
--- a/CMake/lrs_options.cmake
+++ b/CMake/lrs_options.cmake
@@ -42,7 +42,7 @@ if (NOT APPLE)
 else()
     option(CHECK_FOR_UPDATES "Checks for versions updates" OFF) 
 endif()
-option(BUILD_WITH_CPU_EXTENSIONS "Enable compiler optimizations using CPU extensions (such as AVX)" ON)
+option(BUILD_WITH_CPU_EXTENSIONS "Enable compiler optimizations using CPU extensions (such as AVX)" OFF)
 set(UNIT_TESTS_ARGS "" CACHE STRING "Command-line arguments to pass to unit-tests-config.py, e.g. '-t <tag> -r <regex>'")
 #Performance improvement with Ubuntu 18/20
 if(UNIX AND (NOT ANDROID_NDK_TOOLCHAIN_INCLUDED))
diff --git a/src/image-avx.cpp b/src/image-avx.cpp
index 012c099753..ba3640408a 100644
--- a/src/image-avx.cpp
+++ b/src/image-avx.cpp
@@ -6,7 +6,7 @@
 #include "image-avx.h"
 
 #ifndef ANDROID
-    #if defined(__SSSE3__) && defined(__AVX2__)
+    #if defined(__SSE4__) && defined(__AVX2__)
     #include <tmmintrin.h> // For SSE3 intrinsic used in unpack_yuy2_sse
     #include <immintrin.h>
 
diff --git a/src/image-avx.h b/src/image-avx.h
index 9772517bf0..a162681ca3 100644
--- a/src/image-avx.h
+++ b/src/image-avx.h
@@ -10,7 +10,7 @@
 namespace librealsense
 {
 #ifndef ANDROID
-    #if defined(__SSSE3__) && defined(__AVX2__)
+    #if defined(__SSE4__) && defined(__AVX2__)
     void unpack_yuy2_avx_y8(uint8_t * const d[], const uint8_t * s, int n);
     void unpack_yuy2_avx_y16(uint8_t * const d[], const uint8_t * s, int n);
     void unpack_yuy2_avx_rgb8(uint8_t * const d[], const uint8_t * s, int n);
diff --git a/src/proc/align.cpp b/src/proc/align.cpp
index a1910290cd..8d8441a525 100644
--- a/src/proc/align.cpp
+++ b/src/proc/align.cpp
@@ -13,7 +13,7 @@
 
 #if defined(RS2_USE_CUDA)
 #include "proc/cuda/cuda-align.h"
-#elif defined(__SSSE3__)
+#elif defined(__SSE4__)
 #include "proc/sse/sse-align.h"
 #endif
 #include "proc/neon/neon-align.h"
@@ -26,7 +26,7 @@ namespace librealsense
     {
         #if defined(RS2_USE_CUDA)
             return std::make_shared<librealsense::align_cuda>(align_to);
-        #elif defined(__SSSE3__)
+        #elif defined(__SSE4__)
             return std::make_shared<librealsense::align_sse>(align_to);
         #elif defined(__ARM_NEON) && ! defined(ANDROID)
             return std::make_shared<librealsense::align_neon>(align_to);
@@ -51,7 +51,14 @@ namespace librealsense
                 {
                     // Map the top-left corner of the depth pixel onto the other image
                     float depth_pixel[2] = { depth_x - 0.5f, depth_y - 0.5f }, depth_point[3], other_point[3], other_pixel[2];
+
+                    auto start = std::chrono::high_resolution_clock::now();
                     rs2_deproject_pixel_to_point(depth_point, &depth_intrin, depth_pixel, depth);
+                    auto end = std::chrono::high_resolution_clock::now();
+                    auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);
+                    // Output the duration
+                    std::cout << duration.count() << "\n";
+
                     rs2_transform_point_to_point(other_point, &depth_to_other, depth_point);
                     rs2_project_point_to_pixel(other_pixel, &other_intrin, other_point);
                     const int other_x0 = static_cast<int>(other_pixel[0] + 0.5f);
diff --git a/src/proc/color-formats-converter.cpp b/src/proc/color-formats-converter.cpp
index f0a54ba585..d80a4964e3 100644
--- a/src/proc/color-formats-converter.cpp
+++ b/src/proc/color-formats-converter.cpp
@@ -14,7 +14,7 @@
 #ifdef RS2_USE_CUDA
 #include "cuda/cuda-conversion.cuh"
 #endif
-#ifdef __SSSE3__
+#ifdef __SSE4__
 #include <tmmintrin.h> // For SSSE3 intrinsics
 #endif
 #include "neon/image-neon.h"
@@ -60,7 +60,7 @@ namespace librealsense
         rscuda::unpack_yuy2_cuda<FORMAT>(d, s, n);
         return;
 #endif
-#if defined __SSSE3__ && ! defined ANDROID
+#if defined __SSE4__ && ! defined ANDROID
         static bool do_avx = has_avx();
 #ifdef __AVX2__
 
@@ -477,7 +477,7 @@ namespace librealsense
         }
     }
 
-#if defined __SSSE3__ && ! defined ANDROID
+#if defined __SSE4__ && ! defined ANDROID
     // This method receives 1 line of y and one line of uv.
     // source_chunks_y  // yyyyyyyyyyyyyyyy
     // source_chunks_uv // uvuvuvuvuvuvuvuv
@@ -631,7 +631,7 @@ namespace librealsense
         auto n = width * height;
         assert(n % 16 == 0); // All currently supported color resolutions are multiples of 16 pixels. Could easily extend support to other resolutions by copying final n<16 pixels into a zero-padded buffer and recursively calling self for final iteration.
 
-#if defined __SSSE3__ && ! defined ANDROID
+#if defined __SSE4__ && ! defined ANDROID
         static bool do_avx = has_avx();
 
         auto src = reinterpret_cast<const __m128i*>(s);
@@ -753,7 +753,7 @@ namespace librealsense
             m420_parse_one_line<FORMAT>(start_of_second_line, start_of_uv, &dst, width);
         }
         return;
-#endif // __SSSE3__
+#endif // __SSE4__
     }
 
     void unpack_yuy2(rs2_format dst_format, rs2_stream dst_stream, uint8_t * const d[], const uint8_t * s, int w, int h, int actual_size)
@@ -822,7 +822,7 @@ namespace librealsense
     {
         auto n = width * height;
         assert(n % 16 == 0); // All currently supported color resolutions are multiples of 16 pixels. Could easily extend support to other resolutions by copying final n<16 pixels into a zero-padded buffer and recursively calling self for final iteration.
-#ifdef __SSSE3__
+#ifdef __SSE4__
         auto src = reinterpret_cast<const __m128i *>(s);
         auto dst = reinterpret_cast<__m128i *>(d[0]);
         for (; n; n -= 16)
diff --git a/src/proc/pointcloud.cpp b/src/proc/pointcloud.cpp
index 5b5db2724a..9712c856a2 100644
--- a/src/proc/pointcloud.cpp
+++ b/src/proc/pointcloud.cpp
@@ -19,7 +19,7 @@
 #ifdef RS2_USE_CUDA
 #include "proc/cuda/cuda-pointcloud.h"
 #endif
-#ifdef __SSSE3__
+#ifdef __SSE4__
 #include "proc/sse/sse-pointcloud.h"
 #endif
 #include "proc/neon/neon-pointcloud.h"
@@ -397,7 +397,7 @@ namespace librealsense
     {
         #ifdef RS2_USE_CUDA
             return std::make_shared<librealsense::pointcloud_cuda>();
-        #elif defined(__SSSE3__)
+        #elif defined(__SSE4__)
             return std::make_shared<librealsense::pointcloud_sse>();
         #elif defined(__ARM_NEON)  && ! defined ANDROID
             return std::make_shared<librealsense::pointcloud_neon>();
diff --git a/src/proc/sse/sse-align.cpp b/src/proc/sse/sse-align.cpp
index 904cb8df5c..42b6adeb4a 100644
--- a/src/proc/sse/sse-align.cpp
+++ b/src/proc/sse/sse-align.cpp
@@ -1,6 +1,6 @@
 // License: Apache 2.0. See LICENSE file in root directory.
 // Copyright(c) 2019 Intel Corporation. All Rights Reserved.
-#ifdef __SSSE3__
+#ifdef __SSE4__
 
 #include "sse-align.h"
 #include <tmmintrin.h> // For SSE3 intrinsic used in unpack_yuy2_sse
diff --git a/src/proc/sse/sse-align.h b/src/proc/sse/sse-align.h
index 133dcde362..d1b4e3107c 100644
--- a/src/proc/sse/sse-align.h
+++ b/src/proc/sse/sse-align.h
@@ -1,7 +1,7 @@
 // License: Apache 2.0. See LICENSE file in root directory.
 // Copyright(c) 2024 Intel Corporation. All Rights Reserved.
 #pragma once
-#ifdef __SSSE3__
+#ifdef __SSE4__
 
 #include "proc/align.h"
 #include <src/float3.h>
@@ -87,4 +87,4 @@ namespace librealsense
         std::shared_ptr<image_transform> _stream_transform;
     };
 }
-#endif // __SSSE3__
+#endif // __SSE4__
diff --git a/src/proc/sse/sse-pointcloud.cpp b/src/proc/sse/sse-pointcloud.cpp
index f2f6e63e9e..5aa658261d 100644
--- a/src/proc/sse/sse-pointcloud.cpp
+++ b/src/proc/sse/sse-pointcloud.cpp
@@ -11,7 +11,7 @@
 
 #include <iostream>
 
-#ifdef __SSSE3__
+#ifdef __SSE4__
 
 #include <tmmintrin.h> // For SSSE3 intrinsics
 
@@ -56,7 +56,7 @@ namespace librealsense
             const rs2_intrinsics &depth_intrinsics, 
             const rs2::depth_frame& depth_frame)
     {
-#ifdef __SSSE3__
+#ifdef __SSE4__
 
         auto depth_image = (const uint16_t*)depth_frame.get_data();
 
@@ -145,7 +145,7 @@ namespace librealsense
     {
         auto tex_ptr = texture_map;
 
-#ifdef __SSSE3__
+#ifdef __SSE4__
         auto point = reinterpret_cast<const float*>(points);
         auto res = reinterpret_cast<float*>(tex_ptr);
         auto res1 = reinterpret_cast<float*>(pixels_ptr);
diff --git a/src/proc/y411-converter.cpp b/src/proc/y411-converter.cpp
index 725a9149bc..1ace0b1035 100644
--- a/src/proc/y411-converter.cpp
+++ b/src/proc/y411-converter.cpp
@@ -6,7 +6,7 @@
 #ifdef RS2_USE_CUDA
 #include "cuda/cuda-conversion.cuh"
 #endif
-#ifdef __SSSE3__
+#ifdef __SSE4__
 #include <tmmintrin.h> // For SSSE3 intrinsics
 #endif
 
@@ -44,7 +44,7 @@ namespace librealsense
     // See https://www.fourcc.org/pixel-format/yuv-y411/ 
     //
 
-#if defined __SSSE3__ && ! defined ANDROID
+#if defined __SSE4__ && ! defined ANDROID
     void unpack_y411_sse( uint8_t * const dest, const uint8_t * const s, int w, int h, int actual_size)
     {
         auto n = w * h;
@@ -297,7 +297,7 @@ namespace librealsense
     // The size of the frame must be bigger than 4 pixels and product of 32
     void unpack_y411( uint8_t * const dest[], const uint8_t * const s, int w, int h, int actual_size )
     {
-#if defined __SSSE3__ && ! defined ANDROID
+#if defined __SSE4__ && ! defined ANDROID
         unpack_y411_sse(dest[0], s, w, h, actual_size);
 #else
         unpack_y411_native(dest[0], s, w, h, actual_size);
diff --git a/src/proc/y411-converter.h b/src/proc/y411-converter.h
index 0c793f74a0..630b34a8d3 100644
--- a/src/proc/y411-converter.h
+++ b/src/proc/y411-converter.h
@@ -24,7 +24,7 @@ namespace librealsense
 
     void unpack_y411( uint8_t * const dest[], const uint8_t * const s, int w, int h, int actual_size);
 
-#if defined __SSSE3__ && ! defined ANDROID
+#if defined __SSE4__ && ! defined ANDROID
     void unpack_y411_sse( uint8_t * const dest, const uint8_t * const s, int w, int h, int actual_size);
 #endif
 
diff --git a/src/rs.cpp b/src/rs.cpp
index cabc5ef68e..0bb1b1d144 100644
--- a/src/rs.cpp
+++ b/src/rs.cpp
@@ -4113,10 +4113,10 @@ NOEXCEPT_RETURN(, pixel)
 /* Helper inner function (not part of the API) */
 inline bool is_intrinsics_distortion_zero(const struct rs2_intrinsics* intrin)
 {
-    return true;
-    //return (abs(intrin->coeffs[0]) < std::numeric_limits<double>::epsilon() && abs(intrin->coeffs[1]) < std::numeric_limits<double>::epsilon() &&
-      //  abs(intrin->coeffs[2]) < std::numeric_limits<double>::epsilon() && abs(intrin->coeffs[3]) < std::numeric_limits<double>::epsilon() &&
-        //abs(intrin->coeffs[4]) < std::numeric_limits<double>::epsilon());
+    //return false;
+    return (abs(intrin->coeffs[0]) < std::numeric_limits<double>::epsilon() && abs(intrin->coeffs[1]) < std::numeric_limits<double>::epsilon() &&
+        abs(intrin->coeffs[2]) < std::numeric_limits<double>::epsilon() && abs(intrin->coeffs[3]) < std::numeric_limits<double>::epsilon() &&
+       abs(intrin->coeffs[4]) < std::numeric_limits<double>::epsilon());
 }
 
 void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* intrin, const float pixel[2], float depth) BEGIN_API_CALL
@@ -4162,7 +4162,6 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i
                 x = (xo - delta_x) * icdist;
                 y = (yo - delta_y) * icdist;
             }
-            LOG_ERROR("i=" << i);
 
         }
         if (intrin->model == RS2_DISTORTION_KANNALA_BRANDT4)
@@ -4205,8 +4204,8 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i
     else
     {
 
-        int i = 0;
-        LOG_ERROR("i=" << i);
+        //int i = 0;
+        //LOG_ERROR("i=" << i);
     }
 
     point[0] = depth * x;
diff --git a/wrappers/opencv/depth-filter/downsample.cpp b/wrappers/opencv/depth-filter/downsample.cpp
index c4938bf2f2..9b158ccca8 100644
--- a/wrappers/opencv/depth-filter/downsample.cpp
+++ b/wrappers/opencv/depth-filter/downsample.cpp
@@ -5,7 +5,7 @@
 
 #include <assert.h>
 
-#ifdef __SSSE3__
+#ifdef __SSE4__
 #include <emmintrin.h>
 #include <smmintrin.h>
 #endif
@@ -24,7 +24,7 @@ void downsample_min_4x4(const cv::Mat& source, cv::Mat* pDest)
 
     const size_t sizeYresized = source.rows / DOWNSAMPLE_FACTOR;
 
-#ifdef __SSSE3__
+#ifdef __SSE4__
     __m128i ones = _mm_set1_epi16(1);
 
     // Note on multi-threading here, 2018-08-17

From d26ec5c4cc9b2a20c3e49c2e89660690a0a67580 Mon Sep 17 00:00:00 2001
From: gilpazintel <gil.paz@intel.com>
Date: Tue, 7 Jan 2025 09:17:05 +0200
Subject: [PATCH 06/10] clean code

---
 src/proc/align.cpp | 12 ++++++-----
 src/rs.cpp         | 50 ++++++++++++++++++++--------------------------
 2 files changed, 29 insertions(+), 33 deletions(-)

diff --git a/src/proc/align.cpp b/src/proc/align.cpp
index 8d8441a525..aa78411a64 100644
--- a/src/proc/align.cpp
+++ b/src/proc/align.cpp
@@ -39,6 +39,7 @@ namespace librealsense
     void align_images(const rs2_intrinsics& depth_intrin, const rs2_extrinsics& depth_to_other,
         const rs2_intrinsics& other_intrin, GET_DEPTH get_depth, TRANSFER_PIXEL transfer_pixel)
     {
+        auto start = std::chrono::high_resolution_clock::now();
         // Iterate over the pixels of the depth image
 #pragma omp parallel for schedule(dynamic)
         for (int depth_y = 0; depth_y < depth_intrin.height; ++depth_y)
@@ -52,12 +53,9 @@ namespace librealsense
                     // Map the top-left corner of the depth pixel onto the other image
                     float depth_pixel[2] = { depth_x - 0.5f, depth_y - 0.5f }, depth_point[3], other_point[3], other_pixel[2];
 
-                    auto start = std::chrono::high_resolution_clock::now();
+                   
                     rs2_deproject_pixel_to_point(depth_point, &depth_intrin, depth_pixel, depth);
-                    auto end = std::chrono::high_resolution_clock::now();
-                    auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);
-                    // Output the duration
-                    std::cout << duration.count() << "\n";
+                    
 
                     rs2_transform_point_to_point(other_point, &depth_to_other, depth_point);
                     rs2_project_point_to_pixel(other_pixel, &other_intrin, other_point);
@@ -86,6 +84,10 @@ namespace librealsense
                 }
             }
         }
+        auto end = std::chrono::high_resolution_clock::now();
+        auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);
+        // Output the duration
+        std::cout << duration.count() << "\n";
     }
 
     align::align(rs2_stream to_stream) : align(to_stream, "Align")
diff --git a/src/rs.cpp b/src/rs.cpp
index 0bb1b1d144..8fcead23d8 100644
--- a/src/rs.cpp
+++ b/src/rs.cpp
@@ -4164,31 +4164,6 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i
             }
 
         }
-        if (intrin->model == RS2_DISTORTION_KANNALA_BRANDT4)
-        {
-            float rd = sqrtf(x * x + y * y);
-            if (rd < FLT_EPSILON)
-            {
-                rd = FLT_EPSILON;
-            }
-
-            float theta = rd;
-            float theta2 = rd * rd;
-            for (int i = 0; i < 4; i++)
-            {
-                float f = theta * (1 + theta2 * (intrin->coeffs[0] + theta2 * (intrin->coeffs[1] + theta2 * (intrin->coeffs[2] + theta2 * intrin->coeffs[3])))) - rd;
-                if (fabs(f) < FLT_EPSILON)
-                {
-                    break;
-                }
-                float df = 1 + theta2 * (3 * intrin->coeffs[0] + theta2 * (5 * intrin->coeffs[1] + theta2 * (7 * intrin->coeffs[2] + 9 * theta2 * intrin->coeffs[3])));
-                theta -= f / df;
-                theta2 = theta * theta;
-            }
-            float r = tan(theta);
-            x *= r / rd;
-            y *= r / rd;
-        }
         if (intrin->model == RS2_DISTORTION_FTHETA)
         {
             float rd = sqrtf(x * x + y * y);
@@ -4201,11 +4176,30 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i
             y *= r / rd;
         }
     }
-    else
+    if (intrin->model == RS2_DISTORTION_KANNALA_BRANDT4)
     {
+        float rd = sqrtf(x * x + y * y);
+        if (rd < FLT_EPSILON)
+        {
+            rd = FLT_EPSILON;
+        }
 
-        //int i = 0;
-        //LOG_ERROR("i=" << i);
+        float theta = rd;
+        float theta2 = rd * rd;
+        for (int i = 0; i < 4; i++)
+        {
+            float f = theta * (1 + theta2 * (intrin->coeffs[0] + theta2 * (intrin->coeffs[1] + theta2 * (intrin->coeffs[2] + theta2 * intrin->coeffs[3])))) - rd;
+            if (fabs(f) < FLT_EPSILON)
+            {
+                break;
+            }
+            float df = 1 + theta2 * (3 * intrin->coeffs[0] + theta2 * (5 * intrin->coeffs[1] + theta2 * (7 * intrin->coeffs[2] + 9 * theta2 * intrin->coeffs[3])));
+            theta -= f / df;
+            theta2 = theta * theta;
+        }
+        float r = tan(theta);
+        x *= r / rd;
+        y *= r / rd;
     }
 
     point[0] = depth * x;

From e338df276f8c5c250a41c5c4a19f99e74f6e32b0 Mon Sep 17 00:00:00 2001
From: gilpazintel <gil.paz@intel.com>
Date: Tue, 7 Jan 2025 09:43:23 +0200
Subject: [PATCH 07/10] avoid loop in case intrinsic coefficient are zero

---
 CMake/lrs_options.cmake                     |  2 +-
 examples/measure/rs-measure.cpp             | 17 -----------------
 src/image-avx.cpp                           |  2 +-
 src/image-avx.h                             |  2 +-
 src/proc/align.cpp                          | 13 ++-----------
 src/proc/color-formats-converter.cpp        | 12 ++++++------
 src/proc/pointcloud.cpp                     |  4 ++--
 src/proc/sse/sse-align.cpp                  |  2 +-
 src/proc/sse/sse-align.h                    |  4 ++--
 src/proc/sse/sse-pointcloud.cpp             |  6 +++---
 src/proc/y411-converter.cpp                 |  6 +++---
 src/proc/y411-converter.h                   |  2 +-
 src/rs.cpp                                  | 18 +-----------------
 wrappers/opencv/depth-filter/downsample.cpp |  4 ++--
 14 files changed, 26 insertions(+), 68 deletions(-)

diff --git a/CMake/lrs_options.cmake b/CMake/lrs_options.cmake
index 7cb07c905a..94b25621ac 100644
--- a/CMake/lrs_options.cmake
+++ b/CMake/lrs_options.cmake
@@ -42,7 +42,7 @@ if (NOT APPLE)
 else()
     option(CHECK_FOR_UPDATES "Checks for versions updates" OFF) 
 endif()
-option(BUILD_WITH_CPU_EXTENSIONS "Enable compiler optimizations using CPU extensions (such as AVX)" OFF)
+option(BUILD_WITH_CPU_EXTENSIONS "Enable compiler optimizations using CPU extensions (such as AVX)" ON)
 set(UNIT_TESTS_ARGS "" CACHE STRING "Command-line arguments to pass to unit-tests-config.py, e.g. '-t <tag> -r <regex>'")
 #Performance improvement with Ubuntu 18/20
 if(UNIX AND (NOT ANDROID_NDK_TOOLCHAIN_INCLUDED))
diff --git a/examples/measure/rs-measure.cpp b/examples/measure/rs-measure.cpp
index f34fcbcf51..9109c82fd5 100644
--- a/examples/measure/rs-measure.cpp
+++ b/examples/measure/rs-measure.cpp
@@ -15,7 +15,6 @@
 #include <thread>
 #include <atomic>
 #include <mutex>
-#include <rsutils/easylogging/easyloggingpp.h>
 
 using pixel = std::pair<int, int>;
 
@@ -101,7 +100,6 @@ void render_simple_distance(const rs2::depth_frame& depth,
 
 int main(int argc, char * argv[]) try
 {
-    rs2::log_to_console(RS2_LOG_SEVERITY_ERROR);
     auto settings = rs2::cli( "rs-measure example" )
         .process( argc, argv );
 
@@ -293,24 +291,9 @@ float dist_3d(const rs2::depth_frame& frame, pixel u, pixel v)
 
     // Deproject from pixel to point in 3D
     rs2_intrinsics intr = frame.get_profile().as<rs2::video_stream_profile>().get_intrinsics(); // Calibration data
-    // Get the starting time point
-    auto start = std::chrono::high_resolution_clock::now();
-
     rs2_deproject_pixel_to_point(upoint, &intr, upixel, udist);
-
-    // Get the ending time point
-    auto end = std::chrono::high_resolution_clock::now();
-
-    // Calculate the elapsed time in milliseconds
-    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
-
-    // Output the elapsed time
-    LOG_ERROR(duration.count());
-
     rs2_deproject_pixel_to_point(vpoint, &intr, vpixel, vdist);
 
-    
-
     // Calculate euclidean distance between the two points
     return sqrt(pow(upoint[0] - vpoint[0], 2.f) +
                 pow(upoint[1] - vpoint[1], 2.f) +
diff --git a/src/image-avx.cpp b/src/image-avx.cpp
index ba3640408a..012c099753 100644
--- a/src/image-avx.cpp
+++ b/src/image-avx.cpp
@@ -6,7 +6,7 @@
 #include "image-avx.h"
 
 #ifndef ANDROID
-    #if defined(__SSE4__) && defined(__AVX2__)
+    #if defined(__SSSE3__) && defined(__AVX2__)
     #include <tmmintrin.h> // For SSE3 intrinsic used in unpack_yuy2_sse
     #include <immintrin.h>
 
diff --git a/src/image-avx.h b/src/image-avx.h
index a162681ca3..9772517bf0 100644
--- a/src/image-avx.h
+++ b/src/image-avx.h
@@ -10,7 +10,7 @@
 namespace librealsense
 {
 #ifndef ANDROID
-    #if defined(__SSE4__) && defined(__AVX2__)
+    #if defined(__SSSE3__) && defined(__AVX2__)
     void unpack_yuy2_avx_y8(uint8_t * const d[], const uint8_t * s, int n);
     void unpack_yuy2_avx_y16(uint8_t * const d[], const uint8_t * s, int n);
     void unpack_yuy2_avx_rgb8(uint8_t * const d[], const uint8_t * s, int n);
diff --git a/src/proc/align.cpp b/src/proc/align.cpp
index aa78411a64..a1910290cd 100644
--- a/src/proc/align.cpp
+++ b/src/proc/align.cpp
@@ -13,7 +13,7 @@
 
 #if defined(RS2_USE_CUDA)
 #include "proc/cuda/cuda-align.h"
-#elif defined(__SSE4__)
+#elif defined(__SSSE3__)
 #include "proc/sse/sse-align.h"
 #endif
 #include "proc/neon/neon-align.h"
@@ -26,7 +26,7 @@ namespace librealsense
     {
         #if defined(RS2_USE_CUDA)
             return std::make_shared<librealsense::align_cuda>(align_to);
-        #elif defined(__SSE4__)
+        #elif defined(__SSSE3__)
             return std::make_shared<librealsense::align_sse>(align_to);
         #elif defined(__ARM_NEON) && ! defined(ANDROID)
             return std::make_shared<librealsense::align_neon>(align_to);
@@ -39,7 +39,6 @@ namespace librealsense
     void align_images(const rs2_intrinsics& depth_intrin, const rs2_extrinsics& depth_to_other,
         const rs2_intrinsics& other_intrin, GET_DEPTH get_depth, TRANSFER_PIXEL transfer_pixel)
     {
-        auto start = std::chrono::high_resolution_clock::now();
         // Iterate over the pixels of the depth image
 #pragma omp parallel for schedule(dynamic)
         for (int depth_y = 0; depth_y < depth_intrin.height; ++depth_y)
@@ -52,11 +51,7 @@ namespace librealsense
                 {
                     // Map the top-left corner of the depth pixel onto the other image
                     float depth_pixel[2] = { depth_x - 0.5f, depth_y - 0.5f }, depth_point[3], other_point[3], other_pixel[2];
-
-                   
                     rs2_deproject_pixel_to_point(depth_point, &depth_intrin, depth_pixel, depth);
-                    
-
                     rs2_transform_point_to_point(other_point, &depth_to_other, depth_point);
                     rs2_project_point_to_pixel(other_pixel, &other_intrin, other_point);
                     const int other_x0 = static_cast<int>(other_pixel[0] + 0.5f);
@@ -84,10 +79,6 @@ namespace librealsense
                 }
             }
         }
-        auto end = std::chrono::high_resolution_clock::now();
-        auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);
-        // Output the duration
-        std::cout << duration.count() << "\n";
     }
 
     align::align(rs2_stream to_stream) : align(to_stream, "Align")
diff --git a/src/proc/color-formats-converter.cpp b/src/proc/color-formats-converter.cpp
index d80a4964e3..f0a54ba585 100644
--- a/src/proc/color-formats-converter.cpp
+++ b/src/proc/color-formats-converter.cpp
@@ -14,7 +14,7 @@
 #ifdef RS2_USE_CUDA
 #include "cuda/cuda-conversion.cuh"
 #endif
-#ifdef __SSE4__
+#ifdef __SSSE3__
 #include <tmmintrin.h> // For SSSE3 intrinsics
 #endif
 #include "neon/image-neon.h"
@@ -60,7 +60,7 @@ namespace librealsense
         rscuda::unpack_yuy2_cuda<FORMAT>(d, s, n);
         return;
 #endif
-#if defined __SSE4__ && ! defined ANDROID
+#if defined __SSSE3__ && ! defined ANDROID
         static bool do_avx = has_avx();
 #ifdef __AVX2__
 
@@ -477,7 +477,7 @@ namespace librealsense
         }
     }
 
-#if defined __SSE4__ && ! defined ANDROID
+#if defined __SSSE3__ && ! defined ANDROID
     // This method receives 1 line of y and one line of uv.
     // source_chunks_y  // yyyyyyyyyyyyyyyy
     // source_chunks_uv // uvuvuvuvuvuvuvuv
@@ -631,7 +631,7 @@ namespace librealsense
         auto n = width * height;
         assert(n % 16 == 0); // All currently supported color resolutions are multiples of 16 pixels. Could easily extend support to other resolutions by copying final n<16 pixels into a zero-padded buffer and recursively calling self for final iteration.
 
-#if defined __SSE4__ && ! defined ANDROID
+#if defined __SSSE3__ && ! defined ANDROID
         static bool do_avx = has_avx();
 
         auto src = reinterpret_cast<const __m128i*>(s);
@@ -753,7 +753,7 @@ namespace librealsense
             m420_parse_one_line<FORMAT>(start_of_second_line, start_of_uv, &dst, width);
         }
         return;
-#endif // __SSE4__
+#endif // __SSSE3__
     }
 
     void unpack_yuy2(rs2_format dst_format, rs2_stream dst_stream, uint8_t * const d[], const uint8_t * s, int w, int h, int actual_size)
@@ -822,7 +822,7 @@ namespace librealsense
     {
         auto n = width * height;
         assert(n % 16 == 0); // All currently supported color resolutions are multiples of 16 pixels. Could easily extend support to other resolutions by copying final n<16 pixels into a zero-padded buffer and recursively calling self for final iteration.
-#ifdef __SSE4__
+#ifdef __SSSE3__
         auto src = reinterpret_cast<const __m128i *>(s);
         auto dst = reinterpret_cast<__m128i *>(d[0]);
         for (; n; n -= 16)
diff --git a/src/proc/pointcloud.cpp b/src/proc/pointcloud.cpp
index 9712c856a2..5b5db2724a 100644
--- a/src/proc/pointcloud.cpp
+++ b/src/proc/pointcloud.cpp
@@ -19,7 +19,7 @@
 #ifdef RS2_USE_CUDA
 #include "proc/cuda/cuda-pointcloud.h"
 #endif
-#ifdef __SSE4__
+#ifdef __SSSE3__
 #include "proc/sse/sse-pointcloud.h"
 #endif
 #include "proc/neon/neon-pointcloud.h"
@@ -397,7 +397,7 @@ namespace librealsense
     {
         #ifdef RS2_USE_CUDA
             return std::make_shared<librealsense::pointcloud_cuda>();
-        #elif defined(__SSE4__)
+        #elif defined(__SSSE3__)
             return std::make_shared<librealsense::pointcloud_sse>();
         #elif defined(__ARM_NEON)  && ! defined ANDROID
             return std::make_shared<librealsense::pointcloud_neon>();
diff --git a/src/proc/sse/sse-align.cpp b/src/proc/sse/sse-align.cpp
index 42b6adeb4a..904cb8df5c 100644
--- a/src/proc/sse/sse-align.cpp
+++ b/src/proc/sse/sse-align.cpp
@@ -1,6 +1,6 @@
 // License: Apache 2.0. See LICENSE file in root directory.
 // Copyright(c) 2019 Intel Corporation. All Rights Reserved.
-#ifdef __SSE4__
+#ifdef __SSSE3__
 
 #include "sse-align.h"
 #include <tmmintrin.h> // For SSE3 intrinsic used in unpack_yuy2_sse
diff --git a/src/proc/sse/sse-align.h b/src/proc/sse/sse-align.h
index d1b4e3107c..133dcde362 100644
--- a/src/proc/sse/sse-align.h
+++ b/src/proc/sse/sse-align.h
@@ -1,7 +1,7 @@
 // License: Apache 2.0. See LICENSE file in root directory.
 // Copyright(c) 2024 Intel Corporation. All Rights Reserved.
 #pragma once
-#ifdef __SSE4__
+#ifdef __SSSE3__
 
 #include "proc/align.h"
 #include <src/float3.h>
@@ -87,4 +87,4 @@ namespace librealsense
         std::shared_ptr<image_transform> _stream_transform;
     };
 }
-#endif // __SSE4__
+#endif // __SSSE3__
diff --git a/src/proc/sse/sse-pointcloud.cpp b/src/proc/sse/sse-pointcloud.cpp
index 5aa658261d..f2f6e63e9e 100644
--- a/src/proc/sse/sse-pointcloud.cpp
+++ b/src/proc/sse/sse-pointcloud.cpp
@@ -11,7 +11,7 @@
 
 #include <iostream>
 
-#ifdef __SSE4__
+#ifdef __SSSE3__
 
 #include <tmmintrin.h> // For SSSE3 intrinsics
 
@@ -56,7 +56,7 @@ namespace librealsense
             const rs2_intrinsics &depth_intrinsics, 
             const rs2::depth_frame& depth_frame)
     {
-#ifdef __SSE4__
+#ifdef __SSSE3__
 
         auto depth_image = (const uint16_t*)depth_frame.get_data();
 
@@ -145,7 +145,7 @@ namespace librealsense
     {
         auto tex_ptr = texture_map;
 
-#ifdef __SSE4__
+#ifdef __SSSE3__
         auto point = reinterpret_cast<const float*>(points);
         auto res = reinterpret_cast<float*>(tex_ptr);
         auto res1 = reinterpret_cast<float*>(pixels_ptr);
diff --git a/src/proc/y411-converter.cpp b/src/proc/y411-converter.cpp
index 1ace0b1035..725a9149bc 100644
--- a/src/proc/y411-converter.cpp
+++ b/src/proc/y411-converter.cpp
@@ -6,7 +6,7 @@
 #ifdef RS2_USE_CUDA
 #include "cuda/cuda-conversion.cuh"
 #endif
-#ifdef __SSE4__
+#ifdef __SSSE3__
 #include <tmmintrin.h> // For SSSE3 intrinsics
 #endif
 
@@ -44,7 +44,7 @@ namespace librealsense
     // See https://www.fourcc.org/pixel-format/yuv-y411/ 
     //
 
-#if defined __SSE4__ && ! defined ANDROID
+#if defined __SSSE3__ && ! defined ANDROID
     void unpack_y411_sse( uint8_t * const dest, const uint8_t * const s, int w, int h, int actual_size)
     {
         auto n = w * h;
@@ -297,7 +297,7 @@ namespace librealsense
     // The size of the frame must be bigger than 4 pixels and product of 32
     void unpack_y411( uint8_t * const dest[], const uint8_t * const s, int w, int h, int actual_size )
     {
-#if defined __SSE4__ && ! defined ANDROID
+#if defined __SSSE3__ && ! defined ANDROID
         unpack_y411_sse(dest[0], s, w, h, actual_size);
 #else
         unpack_y411_native(dest[0], s, w, h, actual_size);
diff --git a/src/proc/y411-converter.h b/src/proc/y411-converter.h
index 630b34a8d3..0c793f74a0 100644
--- a/src/proc/y411-converter.h
+++ b/src/proc/y411-converter.h
@@ -24,7 +24,7 @@ namespace librealsense
 
     void unpack_y411( uint8_t * const dest[], const uint8_t * const s, int w, int h, int actual_size);
 
-#if defined __SSE4__ && ! defined ANDROID
+#if defined __SSSE3__ && ! defined ANDROID
     void unpack_y411_sse( uint8_t * const dest, const uint8_t * const s, int w, int h, int actual_size);
 #endif
 
diff --git a/src/rs.cpp b/src/rs.cpp
index 8fcead23d8..07645f7927 100644
--- a/src/rs.cpp
+++ b/src/rs.cpp
@@ -64,11 +64,6 @@
 #include <src/core/time-service.h>
 #include <rsutils/string/from.h>
 
-#include <iostream>
-#include <chrono>
-#include <thread> // For std::this_thread::sleep_for
-#include <random>  // For random number generation
-
 ////////////////////////
 // API implementation //
 ////////////////////////
@@ -4113,7 +4108,6 @@ NOEXCEPT_RETURN(, pixel)
 /* Helper inner function (not part of the API) */
 inline bool is_intrinsics_distortion_zero(const struct rs2_intrinsics* intrin)
 {
-    //return false;
     return (abs(intrin->coeffs[0]) < std::numeric_limits<double>::epsilon() && abs(intrin->coeffs[1]) < std::numeric_limits<double>::epsilon() &&
         abs(intrin->coeffs[2]) < std::numeric_limits<double>::epsilon() && abs(intrin->coeffs[3]) < std::numeric_limits<double>::epsilon() &&
        abs(intrin->coeffs[4]) < std::numeric_limits<double>::epsilon());
@@ -4150,10 +4144,9 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i
         }
         if (intrin->model == RS2_DISTORTION_BROWN_CONRADY)
         {
-            int i = 0;
             // need to loop until convergence
             // 10 iterations determined empirically
-            for (; i < 10; i++)
+            for (int i = 0; i < 10; i++)
             {
                 float r2 = x * x + y * y;
                 float icdist = (float)1 / (float)(1 + ((intrin->coeffs[4] * r2 + intrin->coeffs[1]) * r2 + intrin->coeffs[0]) * r2);
@@ -4205,15 +4198,6 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i
     point[0] = depth * x;
     point[1] = depth * y;
     point[2] = depth;
-
-    // Get the ending time point
-    //auto end = std::chrono::high_resolution_clock::now();
-
-    // Calculate the elapsed time in milliseconds
-    //auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);
-
-    // Output the elapsed time
-    //std::cout << duration.count()  << std::endl;
 }
 NOEXCEPT_RETURN(, point)
 
diff --git a/wrappers/opencv/depth-filter/downsample.cpp b/wrappers/opencv/depth-filter/downsample.cpp
index 9b158ccca8..c4938bf2f2 100644
--- a/wrappers/opencv/depth-filter/downsample.cpp
+++ b/wrappers/opencv/depth-filter/downsample.cpp
@@ -5,7 +5,7 @@
 
 #include <assert.h>
 
-#ifdef __SSE4__
+#ifdef __SSSE3__
 #include <emmintrin.h>
 #include <smmintrin.h>
 #endif
@@ -24,7 +24,7 @@ void downsample_min_4x4(const cv::Mat& source, cv::Mat* pDest)
 
     const size_t sizeYresized = source.rows / DOWNSAMPLE_FACTOR;
 
-#ifdef __SSE4__
+#ifdef __SSSE3__
     __m128i ones = _mm_set1_epi16(1);
 
     // Note on multi-threading here, 2018-08-17

From 5e1a37d06a7f30e2aee9bc711354f52162b2d6cd Mon Sep 17 00:00:00 2001
From: gilpazintel <gil.paz@intel.com>
Date: Tue, 7 Jan 2025 10:55:38 +0200
Subject: [PATCH 08/10] in case coeff[0] is zero, avoid division by zero

---
 src/rs.cpp | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/rs.cpp b/src/rs.cpp
index 07645f7927..d28c099be3 100644
--- a/src/rs.cpp
+++ b/src/rs.cpp
@@ -4157,17 +4157,6 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i
             }
 
         }
-        if (intrin->model == RS2_DISTORTION_FTHETA)
-        {
-            float rd = sqrtf(x * x + y * y);
-            if (rd < FLT_EPSILON)
-            {
-                rd = FLT_EPSILON;
-            }
-            float r = (float)(tan(intrin->coeffs[0] * rd) / atan(2 * tan(intrin->coeffs[0] / 2.0f)));
-            x *= r / rd;
-            y *= r / rd;
-        }
     }
     if (intrin->model == RS2_DISTORTION_KANNALA_BRANDT4)
     {
@@ -4194,6 +4183,17 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i
         x *= r / rd;
         y *= r / rd;
     }
+    if (intrin->model == RS2_DISTORTION_FTHETA)
+    {
+        float rd = sqrtf(x * x + y * y);
+        if (rd < FLT_EPSILON)
+        {
+            rd = FLT_EPSILON;
+        }
+        float r = (intrin->coeffs[0] < FLT_EPSILON) ? 0.0f : (float)(tan(intrin->coeffs[0] * rd) / atan(2 * tan(intrin->coeffs[0] / 2.0f)));
+        x *= r / rd;
+        y *= r / rd;
+    }
 
     point[0] = depth * x;
     point[1] = depth * y;

From 2308519eeeb988b7a69943337c26c3000968f7b8 Mon Sep 17 00:00:00 2001
From: gilpazintel <gil.paz@intel.com>
Date: Tue, 7 Jan 2025 13:34:32 +0200
Subject: [PATCH 09/10] add fabs to protect from negative numbers

---
 src/rs.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/rs.cpp b/src/rs.cpp
index d28c099be3..84898d51c1 100644
--- a/src/rs.cpp
+++ b/src/rs.cpp
@@ -4190,7 +4190,7 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i
         {
             rd = FLT_EPSILON;
         }
-        float r = (intrin->coeffs[0] < FLT_EPSILON) ? 0.0f : (float)(tan(intrin->coeffs[0] * rd) / atan(2 * tan(intrin->coeffs[0] / 2.0f)));
+        float r = (fabs(intrin->coeffs[0]) < FLT_EPSILON) ? 0.0f : (float)(tan(intrin->coeffs[0] * rd) / atan(2 * tan(intrin->coeffs[0] / 2.0f)));
         x *= r / rd;
         y *= r / rd;
     }

From e2a18b223c1956d18c95690db736f62ffdfebfab Mon Sep 17 00:00:00 2001
From: gilpazintel <gil.paz@intel.com>
Date: Tue, 7 Jan 2025 18:49:13 +0200
Subject: [PATCH 10/10] changed abs to std::fabs, also changed double numeric
 limit to float numeric limit

---
 src/rs.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/rs.cpp b/src/rs.cpp
index 84898d51c1..d274bdcd7e 100644
--- a/src/rs.cpp
+++ b/src/rs.cpp
@@ -4108,9 +4108,9 @@ NOEXCEPT_RETURN(, pixel)
 /* Helper inner function (not part of the API) */
 inline bool is_intrinsics_distortion_zero(const struct rs2_intrinsics* intrin)
 {
-    return (abs(intrin->coeffs[0]) < std::numeric_limits<double>::epsilon() && abs(intrin->coeffs[1]) < std::numeric_limits<double>::epsilon() &&
-        abs(intrin->coeffs[2]) < std::numeric_limits<double>::epsilon() && abs(intrin->coeffs[3]) < std::numeric_limits<double>::epsilon() &&
-       abs(intrin->coeffs[4]) < std::numeric_limits<double>::epsilon());
+    return (std::fabs(intrin->coeffs[0]) < FLT_EPSILON && std::fabs(intrin->coeffs[1]) < FLT_EPSILON &&
+            std::fabs(intrin->coeffs[2]) < FLT_EPSILON && std::fabs(intrin->coeffs[3]) < FLT_EPSILON &&
+            std::fabs(intrin->coeffs[4]) < FLT_EPSILON);
 }
 
 void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* intrin, const float pixel[2], float depth) BEGIN_API_CALL
@@ -4171,7 +4171,7 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i
         for (int i = 0; i < 4; i++)
         {
             float f = theta * (1 + theta2 * (intrin->coeffs[0] + theta2 * (intrin->coeffs[1] + theta2 * (intrin->coeffs[2] + theta2 * intrin->coeffs[3])))) - rd;
-            if (fabs(f) < FLT_EPSILON)
+            if (std::fabs(f) < FLT_EPSILON)
             {
                 break;
             }
@@ -4190,7 +4190,7 @@ void rs2_deproject_pixel_to_point(float point[3], const struct rs2_intrinsics* i
         {
             rd = FLT_EPSILON;
         }
-        float r = (fabs(intrin->coeffs[0]) < FLT_EPSILON) ? 0.0f : (float)(tan(intrin->coeffs[0] * rd) / atan(2 * tan(intrin->coeffs[0] / 2.0f)));
+        float r = (std::fabs(intrin->coeffs[0]) < FLT_EPSILON) ? 0.0f : (float)(tan(intrin->coeffs[0] * rd) / atan(2 * tan(intrin->coeffs[0] / 2.0f)));
         x *= r / rd;
         y *= r / rd;
     }