Skip to content

Commit

Permalink
Merge pull request #2224 from mabel-dev/#2223
Browse files Browse the repository at this point in the history
  • Loading branch information
joocer authored Jan 8, 2025
2 parents f4a7a5b + be4b356 commit 6afd792
Show file tree
Hide file tree
Showing 6 changed files with 25 additions and 5 deletions.
2 changes: 1 addition & 1 deletion opteryx/__version__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__build__ = 971
__build__ = 973

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down
13 changes: 10 additions & 3 deletions opteryx/operators/aggregate_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,18 +64,25 @@ def _is_count_star(aggregates):


def _count_star(morsel_promise, column_name):
count = sum(morsel.num_rows for morsel in morsel_promise)
count = 0
for morsel in morsel_promise:
if "$COUNT(*)" in morsel.column_names:
count += morsel["$COUNT(*)"].to_numpy()[0]
else:
count += morsel.num_rows
table = pyarrow.Table.from_pylist([{column_name: count}])
return table


def project(table: pyarrow.Table, column_names: list) -> pyarrow.Table:
row_count = table.num_rows
if "$COUNT(*)" in table.column_names:
column_names.append("$COUNT(*)")
if len(column_names) > 0:
return table.select(dict.fromkeys(column_names))
else:
# if we can't find the column, add a placeholder column
return pyarrow.Table.from_pydict({"*": numpy.full(row_count, 1, dtype=numpy.int8)})
row_count = table.num_rows
return pyarrow.Table.from_pydict({"*": numpy.ones(row_count, dtype=numpy.int8)})


def build_aggregations(aggregators):
Expand Down
3 changes: 2 additions & 1 deletion opteryx/operators/base_plan_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,10 +100,11 @@ def __call__(self, morsel: pyarrow.Table, join_leg: str) -> Optional[pyarrow.Tab

# if we get empty sets, don't yield them unless they're the only one
if result.num_rows > 0:
self.statistics.avoided_empty_morsels += 1
at_least_one = True
yield result
continue
else:
self.statistics.avoided_empty_morsels += 1

yield result

Expand Down
2 changes: 2 additions & 0 deletions opteryx/operators/read_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ def struct_to_jsonb(table: pyarrow.Table) -> pyarrow.Table:


def normalize_morsel(schema: RelationSchema, morsel: pyarrow.Table) -> pyarrow.Table:
if morsel.column_names == ["$COUNT(*)"]:
return morsel
if len(schema.columns) == 0 and morsel.column_names != ["*"]:
one_column = pyarrow.array([True] * morsel.num_rows, type=pyarrow.bool_())
morsel = morsel.append_column("*", one_column)
Expand Down
6 changes: 6 additions & 0 deletions opteryx/utils/file_decoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,12 @@ def parquet_decoder(
if not selected_columns and not force_read:
selected_columns = []

# If it's COUNT(*), we don't need to create a full dataset
# We have a handler later to sum up the $COUNT(*) column
if projection == [] and selection == []:
table = pyarrow.Table.from_arrays([[parquet_file.metadata.num_rows]], names=["$COUNT(*)"])
return (parquet_file.metadata.num_rows, parquet_file.metadata.num_columns, table)

# Read the parquet table with the optimized column list and selection filters
table = parquet.read_table(
stream,
Expand Down
4 changes: 4 additions & 0 deletions tests/sql_battery/test_shapes_and_errors_battery.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@
("SELECT * FROM testdata.satellites", 177, 8, None),
("SELECT * FROM testdata.planets", 9, 20, None),

("SELECT COUNT(*) FROM testdata.missions", 1, 1, None),
("SELECT COUNT(*) FROM testdata.satellites", 1, 1, None),
("SELECT COUNT(*) FROM testdata.planets", 1, 1, None),

# Does the error tester work
("THIS IS NOT VALID SQL", None, None, SqlError),

Expand Down

0 comments on commit 6afd792

Please sign in to comment.