Skip to content

Commit

Permalink
Merge pull request #1256 from mabel-dev/#1254
Browse files Browse the repository at this point in the history
  • Loading branch information
joocer authored Nov 11, 2023
2 parents 23115af + 99a47c0 commit 80a94f7
Show file tree
Hide file tree
Showing 11 changed files with 59 additions and 25 deletions.
9 changes: 9 additions & 0 deletions opteryx/cursor.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,12 +163,16 @@ def _inner_execute(self, operation: str, params: Optional[Iterable] = None) -> A
raise MissingSqlStatement("SQL provided was empty.")

self._connection.context.history.append((operation, True, datetime.datetime.utcnow()))

start = time.time_ns()
plans = query_planner(
operation=operation, parameters=params, connection=self._connection, qid=self.id
)

try:
start = time.time_ns()
first_item = next(plans)
self._statistics.time_planning += time.time_ns() - start
except RuntimeError:
raise MissingSqlStatement(
"SQL statement provided had no executable part, this may mean the statement was commented out."
Expand All @@ -180,8 +184,11 @@ def _inner_execute(self, operation: str, params: Optional[Iterable] = None) -> A
ROLLING_LOG.append(operation)

results = None
start = time.time_ns()
for plan in plans:
self._statistics.time_planning += time.time_ns() - start
results = plan.execute()
start = time.time_ns()

if results is not None:
# we can't update tuples directly
Expand All @@ -204,6 +211,8 @@ def _execute_statements(self, operation, params: Optional[Iterable] = None):
Returns:
Results of the query execution, if any.
"""
self._statistics.start_time = time.time_ns()

statements = sql.remove_comments(operation)
statements = sql.clean_statement(statements)
statements = sql.split_sql_statements(statements)
Expand Down
2 changes: 1 addition & 1 deletion opteryx/functions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@ def _coalesce(*arrays):
"TODAY": _repeat_no_parameters(date_functions.get_today),
"TIME": _repeat_no_parameters(date_functions.get_time),
"YESTERDAY": _repeat_no_parameters(date_functions.get_yesterday),
"DATE": _iterate_single_parameter(date_functions.get_date),
"DATE": lambda x: compute.cast(x, "date32"), #_iterate_single_parameter(date_functions.get_date),
"YEAR": compute.year,
"MONTH": compute.month,
"DAY": compute.day,
Expand Down
6 changes: 6 additions & 0 deletions opteryx/operators/exit_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
This node doesn't do any calculations, it is a pure Projection.
"""
import time
from typing import Iterable

from opteryx.exceptions import AmbiguousIdentifierError
Expand All @@ -45,6 +46,7 @@ def name(self): # pragma: no cover
return "Exit"

def execute(self) -> Iterable:
start = time.monotonic_ns()
morsels = self._producers[0] # type:ignore

final_columns = []
Expand All @@ -62,7 +64,9 @@ def execute(self) -> Iterable:
message=f"Query result contains multiple instances of the same column(s) - `{'`, `'.join(matches)}`"
)

self.statistics.time_exiting += time.monotonic_ns() - start
for morsel in morsels.execute():
start = time.monotonic_ns()
if not set(final_columns).issubset(morsel.column_names):
mapping = {int_name: name for name, int_name in zip(final_columns, final_names)}
missing_references = [
Expand All @@ -76,4 +80,6 @@ def execute(self) -> Iterable:
morsel = morsel.select(final_columns)
morsel = morsel.rename_columns(final_names)

self.statistics.time_exiting += time.monotonic_ns() - start
yield morsel
start = time.monotonic_ns()
4 changes: 3 additions & 1 deletion opteryx/operators/scanner_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,13 +79,15 @@ def config(self):

def execute(self) -> Iterable:
"""Perform this step, time how long is spent doing work"""
morsel = None
schema = self.parameters["schema"]
start_clock = time.monotonic_ns()
reader = self.parameters.get("connector").read_dataset()
for morsel in reader:
self.statistics.blobs_read += 1
self.statistics.rows_read += morsel.num_rows
self.statistics.bytes_processed += morsel.nbytes
self.execution_time += time.monotonic_ns() - start_clock
self.statistics.time_reading_blobs += time.monotonic_ns() - start_clock
yield normalize_morsel(schema, morsel)
start_clock = time.monotonic_ns()
if morsel:
Expand Down
2 changes: 1 addition & 1 deletion opteryx/shared/query_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def as_dict(self):
if k.startswith("time_"):
stats_dict[k] = self._ns_to_s(v)
stats_dict["time_total"] = self._ns_to_s(
stats_dict.get("end_time", 0) - stats_dict.get("start_time", 0)
stats_dict.pop("end_time", 0) - stats_dict.pop("start_time", 0)
)
stats_dict["messages"] = stats_dict.get("messages", [])
return stats_dict
Expand Down
Binary file not shown.
8 changes: 4 additions & 4 deletions tests/misc/test_documentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,12 +101,14 @@ def test_python_client():
conn = opteryx.connect()
cursor = conn.cursor()
cursor.execute("SELECT * FROM $planets;")
rows_first = list(cursor.fetchall())
# if we convert to bytes we're comparing the values only
rows_first = b"".join([r.as_bytes for r in cursor.fetchall()])

import opteryx

cursor = opteryx.query("SELECT * FROM $planets;")
rows_second = list(cursor.fetchall())
# if we convert to bytes we're comparing the values only
rows_second = b"".join([r.as_bytes for r in cursor.fetchall()])

assert rows_first == rows_second

Expand Down Expand Up @@ -204,6 +206,4 @@ def get_user_permissions(user_roles):
if __name__ == "__main__": # pragma: no cover
from tests.tools import run_tests

test_readme_3()

run_tests()
4 changes: 4 additions & 0 deletions tests/sql_battery/test_battery_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@
# tsv - has the same file as csv
("SELECT * FROM testdata.flat.formats.tsv WITH (NO_PARTITION)", 33529, 10, False),
("SELECT username, user_verified FROM testdata.flat.formats.tsv WITH(NO_PARTITION) WHERE username ILIKE '%cve%'", 2532, 2, False),

# .json.parquet - appears to be handled incorrectly
("SELECT * FROM testdata.flat.formats.misnamed_parquet WITH (NO_PARTITION)", 100000, 13, False),
("SELECT user_name, user_verified FROM testdata.flat.formats.misnamed_parquet WITH(NO_PARTITION) WHERE user_name ILIKE '%news%'", 122, 2, False),
]
# fmt:on

Expand Down
39 changes: 26 additions & 13 deletions tests/sql_battery/test_null_semantics.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import pytest
import opteryx
import numpy

# fmt:off
STATEMENTS = [
Expand Down Expand Up @@ -80,47 +81,47 @@
SELECT * FROM (VALUES (True), (False), (NULL)) AS tristatebooleans(bool) WHERE bool IS NOT FALSE;
""", {True, None}),(
"""
-- Query 1: Expected rows: 1 ("true")
-- Query 14: Expected rows: 1 ("true")
SELECT * FROM (VALUES ('true'), ('false'), (NULL)) AS tristatebooleans(bool) WHERE bool = 'true';
""", {'true'}),(
"""
-- Query 2: Expected rows: 1 (NULL)
-- Query 15: Expected rows: 1 (NULL)
SELECT * FROM (VALUES ('true'), ('false'), (NULL)) AS tristatebooleans(bool) WHERE bool IS NULL;
""", {None}),(
"""
-- Query 3: Expected rows: 2 ("true", "false")
-- Query 16: Expected rows: 2 ("true", "false")
SELECT * FROM (VALUES ('true'), ('false'), (NULL)) AS tristatebooleans(bool) WHERE bool IS NOT NULL;
""", {'true', 'false'}),(
"""
-- Query 4: Expected rows: 0
-- Query 17: Expected rows: 0
SELECT * FROM (VALUES ('true'), ('false'), (NULL)) AS tristatebooleans(bool) WHERE bool = NULL;
""", {}),(
"""
-- Query 5: Expected rows: 2 ("true", "false")
-- Query 18: Expected rows: 2 ("true", "false")
SELECT * FROM (VALUES ('true'), ('false'), (NULL)) AS tristatebooleans(bool) WHERE NOT bool IS NULL;
""", {'true', 'false'}),(
"""
-- Query 6: Expected rows: 1 ("false")
-- Query 19: Expected rows: 1 ("false")
SELECT * FROM (VALUES ('true'), ('false'), (NULL)) AS tristatebooleans(bool) WHERE NOT bool = "true";
""", {'false'}),(
"""
-- Query 1: Expected rows: 1 (1)
-- Query 20: Expected rows: 1 (1)
SELECT * FROM (VALUES (1), (-1), (NULL)) AS tristatebooleans(bool) WHERE bool = 1;
""", {1}),(
"""
-- Query 2: Expected rows: 1 (NULL)
-- Query 21: Expected rows: 1 (NULL)
SELECT * FROM (VALUES (1), (-1), (NULL)) AS tristatebooleans(bool) WHERE bool IS NULL;
""", {None}),(
""", {numpy.nan}),(
"""
-- Query 3: Expected rows: 2 (1, -1)
-- Query 22: Expected rows: 2 (1, -1)
SELECT * FROM (VALUES (1), (-1), (NULL)) AS tristatebooleans(bool) WHERE bool IS NOT NULL;
""", {1, -1}),(
"""
-- Query 4: Expected rows: 0
-- Query 23: Expected rows: 0
SELECT * FROM (VALUES (1), (-1), (NULL)) AS tristatebooleans(bool) WHERE bool = NULL;
""", {}),(
"""
-- Query 5: Expected rows: 3 (1, -1, NULL)
-- Query 24: Expected rows: 3 (1, -1, NULL)
SELECT * FROM (VALUES (1), (-1), (NULL)) AS tristatebooleans(bool) WHERE NOT bool IS NULL;
""", {1, -1})

Expand All @@ -130,10 +131,22 @@
# fmt:on


def process_set(set_with_nan):
has_nan = any(item != item for item in set_with_nan) # Check for NaN using NaN's property
set_without_nan = {
item for item in set_with_nan if item == item
} # Create a new set without NaNs
return has_nan, set_without_nan


def compare_sets(set1, set2):
if not set1 and not set2:
return True
return set1 == set2

s1_nan, s1_no_nan = process_set(set1)
s2_nan, s2_no_nan = process_set(set2)

return s1_nan == s2_nan and s1_no_nan == s2_no_nan


@pytest.mark.parametrize("statement, expected_result", STATEMENTS)
Expand Down
6 changes: 3 additions & 3 deletions tests/sql_battery/test_results_battery.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,11 @@ def test_results_tests(test):
cursor.execute(sql)
result = cursor.arrow().to_pydict()

printable_result = orjson.dumps(result, default=str).decode()
printable_expected = orjson.dumps(test["result"]).decode()
printable_result = orjson.dumps(result, default=str, option=orjson.OPT_SORT_KEYS).decode()
printable_expected = orjson.dumps(test["result"], option=orjson.OPT_SORT_KEYS).decode()

assert (
result == test["result"]
printable_result == printable_expected
), f"Outcome:\n{printable_result}\nExpected:\n{printable_expected}"


Expand Down
4 changes: 2 additions & 2 deletions tests/sql_battery/tests/results/complex_003.results_tests
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
"statement": "SELECT pl.name AS planet_name, pl.orbital_period, pl.diameter, dense_moons_stats.total_dense_moons, dense_moons_stats.avg_density, bright_moons_stats.avg_magnitude, bright_moons_stats.total_bright_moons FROM $planets pl LEFT JOIN (SELECT planetId, COUNT(*) AS total_dense_moons, AVG(density) AS avg_density FROM $satellites WHERE density > 2 GROUP BY planetId) dense_moons_stats ON pl.id = dense_moons_stats.planetId LEFT JOIN (SELECT planetId, AVG(magnitude) AS avg_magnitude, COUNT(*) AS total_bright_moons FROM $satellites WHERE magnitude < 5 GROUP BY planetId) bright_moons_stats ON pl.id = bright_moons_stats.planetId WHERE pl.distance_from_sun BETWEEN 100 AND 200 AND pl.orbital_eccentricity < 0.1 ORDER BY dense_moons_stats.total_dense_moons DESC, bright_moons_stats.avg_magnitude ASC LIMIT 10;",
"result": {
"bright_moons_stats.avg_magnitude": [-12.74, null],
"bright_moons_stats.total_bright_moons": [1, null],
"dense_moons_stats.total_dense_moons": [1, null],
"bright_moons_stats.total_bright_moons": [1.0, null],
"dense_moons_stats.total_dense_moons": [1.0, null],
"dense_moons_stats.avg_density": [3.344, null],
"planet_name": ["Earth", "Venus"],
"pl.diameter": [12756, 12104],
Expand Down

0 comments on commit 80a94f7

Please sign in to comment.