Merge pull request #1256 from mabel-dev/#1254

#1254
mabel-dev · Nov 11, 2023 · 80a94f7 · 80a94f7
2 parents 23115af + 99a47c0
commit 80a94f7
Show file tree

Hide file tree

Showing 11 changed files with 59 additions and 25 deletions.
diff --git a/opteryx/cursor.py b/opteryx/cursor.py
@@ -163,12 +163,16 @@ def _inner_execute(self, operation: str, params: Optional[Iterable] = None) -> A
             raise MissingSqlStatement("SQL provided was empty.")
 
         self._connection.context.history.append((operation, True, datetime.datetime.utcnow()))
+
+        start = time.time_ns()
         plans = query_planner(
             operation=operation, parameters=params, connection=self._connection, qid=self.id
         )
 
         try:
+            start = time.time_ns()
             first_item = next(plans)
+            self._statistics.time_planning += time.time_ns() - start
         except RuntimeError:
             raise MissingSqlStatement(
                 "SQL statement provided had no executable part, this may mean the statement was commented out."
@@ -180,8 +184,11 @@ def _inner_execute(self, operation: str, params: Optional[Iterable] = None) -> A
             ROLLING_LOG.append(operation)
 
         results = None
+        start = time.time_ns()
         for plan in plans:
+            self._statistics.time_planning += time.time_ns() - start
             results = plan.execute()
+            start = time.time_ns()
 
         if results is not None:
             # we can't update tuples directly
@@ -204,6 +211,8 @@ def _execute_statements(self, operation, params: Optional[Iterable] = None):
         Returns:
             Results of the query execution, if any.
         """
+        self._statistics.start_time = time.time_ns()
+
         statements = sql.remove_comments(operation)
         statements = sql.clean_statement(statements)
         statements = sql.split_sql_statements(statements)

diff --git a/opteryx/functions/__init__.py b/opteryx/functions/__init__.py
@@ -319,7 +319,7 @@ def _coalesce(*arrays):
     "TODAY": _repeat_no_parameters(date_functions.get_today),
     "TIME": _repeat_no_parameters(date_functions.get_time),
     "YESTERDAY": _repeat_no_parameters(date_functions.get_yesterday),
-    "DATE": _iterate_single_parameter(date_functions.get_date),
+    "DATE": lambda x: compute.cast(x, "date32"), #_iterate_single_parameter(date_functions.get_date),
     "YEAR": compute.year,
     "MONTH": compute.month,
     "DAY": compute.day,

diff --git a/opteryx/operators/exit_node.py b/opteryx/operators/exit_node.py
@@ -23,6 +23,7 @@
 
 This node doesn't do any calculations, it is a pure Projection.
 """
+import time
 from typing import Iterable
 
 from opteryx.exceptions import AmbiguousIdentifierError
@@ -45,6 +46,7 @@ def name(self):  # pragma: no cover
         return "Exit"
 
     def execute(self) -> Iterable:
+        start = time.monotonic_ns()
         morsels = self._producers[0]  # type:ignore
 
         final_columns = []
@@ -62,7 +64,9 @@ def execute(self) -> Iterable:
                 message=f"Query result contains multiple instances of the same column(s) - `{'`, `'.join(matches)}`"
             )
 
+        self.statistics.time_exiting += time.monotonic_ns() - start
         for morsel in morsels.execute():
+            start = time.monotonic_ns()
             if not set(final_columns).issubset(morsel.column_names):
                 mapping = {int_name: name for name, int_name in zip(final_columns, final_names)}
                 missing_references = [
@@ -76,4 +80,6 @@ def execute(self) -> Iterable:
             morsel = morsel.select(final_columns)
             morsel = morsel.rename_columns(final_names)
 
+            self.statistics.time_exiting += time.monotonic_ns() - start
             yield morsel
+            start = time.monotonic_ns()
diff --git a/opteryx/operators/scanner_node.py b/opteryx/operators/scanner_node.py
@@ -79,13 +79,15 @@ def config(self):
 
     def execute(self) -> Iterable:
         """Perform this step, time how long is spent doing work"""
+        morsel = None
         schema = self.parameters["schema"]
         start_clock = time.monotonic_ns()
         reader = self.parameters.get("connector").read_dataset()
         for morsel in reader:
+            self.statistics.blobs_read += 1
             self.statistics.rows_read += morsel.num_rows
             self.statistics.bytes_processed += morsel.nbytes
-            self.execution_time += time.monotonic_ns() - start_clock
+            self.statistics.time_reading_blobs += time.monotonic_ns() - start_clock
             yield normalize_morsel(schema, morsel)
             start_clock = time.monotonic_ns()
         if morsel:

diff --git a/opteryx/shared/query_statistics.py b/opteryx/shared/query_statistics.py
@@ -54,7 +54,7 @@ def as_dict(self):
             if k.startswith("time_"):
                 stats_dict[k] = self._ns_to_s(v)
         stats_dict["time_total"] = self._ns_to_s(
-            stats_dict.get("end_time", 0) - stats_dict.get("start_time", 0)
+            stats_dict.pop("end_time", 0) - stats_dict.pop("start_time", 0)
         )
         stats_dict["messages"] = stats_dict.get("messages", [])
         return stats_dict

diff --git a/testdata/flat/formats/misnamed_parquet/tweets.jsonl.parquet b/testdata/flat/formats/misnamed_parquet/tweets.jsonl.parquet
diff --git a/tests/misc/test_documentation.py b/tests/misc/test_documentation.py
@@ -101,12 +101,14 @@ def test_python_client():
     conn = opteryx.connect()
     cursor = conn.cursor()
     cursor.execute("SELECT * FROM $planets;")
-    rows_first = list(cursor.fetchall())
+    # if we convert to bytes we're comparing the values only
+    rows_first = b"".join([r.as_bytes for r in cursor.fetchall()])
 
     import opteryx
 
     cursor = opteryx.query("SELECT * FROM $planets;")
-    rows_second = list(cursor.fetchall())
+    # if we convert to bytes we're comparing the values only
+    rows_second = b"".join([r.as_bytes for r in cursor.fetchall()])
 
     assert rows_first == rows_second
 
@@ -204,6 +206,4 @@ def get_user_permissions(user_roles):
 if __name__ == "__main__":  # pragma: no cover
     from tests.tools import run_tests
 
-    test_readme_3()
-
     run_tests()
diff --git a/tests/sql_battery/test_battery_formats.py b/tests/sql_battery/test_battery_formats.py
@@ -48,6 +48,10 @@
         # tsv - has the same file as csv
         ("SELECT * FROM testdata.flat.formats.tsv WITH (NO_PARTITION)", 33529, 10, False),
         ("SELECT username, user_verified FROM testdata.flat.formats.tsv WITH(NO_PARTITION) WHERE username ILIKE '%cve%'", 2532, 2, False),
+
+        # .json.parquet - appears to be handled incorrectly
+        ("SELECT * FROM testdata.flat.formats.misnamed_parquet WITH (NO_PARTITION)", 100000, 13, False),
+        ("SELECT user_name, user_verified FROM testdata.flat.formats.misnamed_parquet WITH(NO_PARTITION) WHERE user_name ILIKE '%news%'", 122, 2, False),
     ]
 # fmt:on
 

diff --git a/tests/sql_battery/test_null_semantics.py b/tests/sql_battery/test_null_semantics.py
@@ -10,6 +10,7 @@
 
 import pytest
 import opteryx
+import numpy
 
 # fmt:off
 STATEMENTS = [
@@ -80,47 +81,47 @@
 SELECT * FROM (VALUES (True), (False), (NULL)) AS tristatebooleans(bool) WHERE bool IS NOT FALSE;
 """, {True, None}),(
 """
--- Query 1: Expected rows: 1 ("true")
+-- Query 14: Expected rows: 1 ("true")
 SELECT * FROM (VALUES ('true'), ('false'), (NULL)) AS tristatebooleans(bool) WHERE bool = 'true';
 """, {'true'}),(
 """
--- Query 2: Expected rows: 1 (NULL)
+-- Query 15: Expected rows: 1 (NULL)
 SELECT * FROM (VALUES ('true'), ('false'), (NULL)) AS tristatebooleans(bool) WHERE bool IS NULL;
 """, {None}),(
 """
--- Query 3: Expected rows: 2 ("true", "false")
+-- Query 16: Expected rows: 2 ("true", "false")
 SELECT * FROM (VALUES ('true'), ('false'), (NULL)) AS tristatebooleans(bool) WHERE bool IS NOT NULL;
 """, {'true', 'false'}),(
 """
--- Query 4: Expected rows: 0
+-- Query 17: Expected rows: 0
 SELECT * FROM (VALUES ('true'), ('false'), (NULL)) AS tristatebooleans(bool) WHERE bool = NULL;
 """, {}),(
 """
--- Query 5: Expected rows: 2 ("true", "false")
+-- Query 18: Expected rows: 2 ("true", "false")
 SELECT * FROM (VALUES ('true'), ('false'), (NULL)) AS tristatebooleans(bool) WHERE NOT bool IS NULL;
 """, {'true', 'false'}),(
 """
--- Query 6: Expected rows: 1 ("false")
+-- Query 19: Expected rows: 1 ("false")
 SELECT * FROM (VALUES ('true'), ('false'), (NULL)) AS tristatebooleans(bool) WHERE NOT bool = "true";
 """, {'false'}),(
 """
--- Query 1: Expected rows: 1 (1)
+-- Query 20: Expected rows: 1 (1)
 SELECT * FROM (VALUES (1), (-1), (NULL)) AS tristatebooleans(bool) WHERE bool = 1;
 """, {1}),(
 """
--- Query 2: Expected rows: 1 (NULL)
+-- Query 21: Expected rows: 1 (NULL)
 SELECT * FROM (VALUES (1), (-1), (NULL)) AS tristatebooleans(bool) WHERE bool IS NULL;
-""", {None}),(
+""", {numpy.nan}),(
 """
--- Query 3: Expected rows: 2 (1, -1)
+-- Query 22: Expected rows: 2 (1, -1)
 SELECT * FROM (VALUES (1), (-1), (NULL)) AS tristatebooleans(bool) WHERE bool IS NOT NULL;
 """, {1, -1}),(
 """
--- Query 4: Expected rows: 0
+-- Query 23: Expected rows: 0
 SELECT * FROM (VALUES (1), (-1), (NULL)) AS tristatebooleans(bool) WHERE bool = NULL;
 """, {}),(
 """
--- Query 5: Expected rows: 3 (1, -1, NULL)
+-- Query 24: Expected rows: 3 (1, -1, NULL)
 SELECT * FROM (VALUES (1), (-1), (NULL)) AS tristatebooleans(bool) WHERE NOT bool IS NULL;
 """, {1, -1})
 
@@ -130,10 +131,22 @@
 # fmt:on
 
 
+def process_set(set_with_nan):
+    has_nan = any(item != item for item in set_with_nan)  # Check for NaN using NaN's property
+    set_without_nan = {
+        item for item in set_with_nan if item == item
+    }  # Create a new set without NaNs
+    return has_nan, set_without_nan
+
+
 def compare_sets(set1, set2):
     if not set1 and not set2:
         return True
-    return set1 == set2
+
+    s1_nan, s1_no_nan = process_set(set1)
+    s2_nan, s2_no_nan = process_set(set2)
+
+    return s1_nan == s2_nan and s1_no_nan == s2_no_nan
 
 
 @pytest.mark.parametrize("statement, expected_result", STATEMENTS)

diff --git a/tests/sql_battery/test_results_battery.py b/tests/sql_battery/test_results_battery.py
@@ -51,11 +51,11 @@ def test_results_tests(test):
     cursor.execute(sql)
     result = cursor.arrow().to_pydict()
 
-    printable_result = orjson.dumps(result, default=str).decode()
-    printable_expected = orjson.dumps(test["result"]).decode()
+    printable_result = orjson.dumps(result, default=str, option=orjson.OPT_SORT_KEYS).decode()
+    printable_expected = orjson.dumps(test["result"], option=orjson.OPT_SORT_KEYS).decode()
 
     assert (
-        result == test["result"]
+        printable_result == printable_expected
     ), f"Outcome:\n{printable_result}\nExpected:\n{printable_expected}"
 
 

diff --git a/tests/sql_battery/tests/results/complex_003.results_tests b/tests/sql_battery/tests/results/complex_003.results_tests
@@ -3,8 +3,8 @@
   "statement": "SELECT pl.name AS planet_name, pl.orbital_period, pl.diameter, dense_moons_stats.total_dense_moons, dense_moons_stats.avg_density, bright_moons_stats.avg_magnitude, bright_moons_stats.total_bright_moons FROM $planets pl LEFT JOIN (SELECT planetId, COUNT(*) AS total_dense_moons, AVG(density) AS avg_density FROM $satellites WHERE density > 2 GROUP BY planetId) dense_moons_stats ON pl.id = dense_moons_stats.planetId LEFT JOIN (SELECT planetId, AVG(magnitude) AS avg_magnitude, COUNT(*) AS total_bright_moons FROM $satellites WHERE magnitude < 5 GROUP BY planetId) bright_moons_stats ON pl.id = bright_moons_stats.planetId WHERE pl.distance_from_sun BETWEEN 100 AND 200 AND pl.orbital_eccentricity < 0.1 ORDER BY dense_moons_stats.total_dense_moons DESC, bright_moons_stats.avg_magnitude ASC LIMIT 10;",
   "result": {
     "bright_moons_stats.avg_magnitude": [-12.74, null],
-    "bright_moons_stats.total_bright_moons": [1, null],
-    "dense_moons_stats.total_dense_moons": [1, null],
+    "bright_moons_stats.total_bright_moons": [1.0, null],
+    "dense_moons_stats.total_dense_moons": [1.0, null],
     "dense_moons_stats.avg_density": [3.344, null],
     "planet_name": ["Earth", "Venus"],
     "pl.diameter": [12756, 12104],