Skip to content

Commit

Permalink
FIX/#505
Browse files Browse the repository at this point in the history
  • Loading branch information
joocer committed Sep 11, 2022
1 parent 7275c84 commit 2f33f45
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 12 deletions.
14 changes: 13 additions & 1 deletion blog/20220901 Lessons Learnt so Far.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,16 @@
- Unit testing is fine, but write hundreds of tests cases which run real SQL queries
- You can't fabricate test data for all your test scenarios
- Storage read speed will kill any performance boosts from algorithmic improvements
- If you don't control the writing of the data - assume the worst
- If you don't control the writing of the data - assume the worst


PyArrow is awesome, but it has bugs, odd limitations and some parts are so slow it hurts.

bugs
- date diff just doesn't work for dates

off limitations
- can't join on tables with arrays or structs

so slow it hurts
- the file system abstractions are 4x slower than the next slowest ways to access S3 I've tried
1 change: 0 additions & 1 deletion opteryx/connectors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
from opteryx.connectors.gcp_cloudstorage_connector import GcpCloudStorageConnector
from opteryx.connectors.mongodb_connector import MongoDbConnector


WELL_KNOWN_ADAPTERS = {
"disk": DiskConnector,
"gcs": GcpCloudStorageConnector,
Expand Down
2 changes: 1 addition & 1 deletion opteryx/models/columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ def get_column_from_alias(self, column, only_one: bool = False):
matches = []
for col, att in self._column_metadata.items():
matches.extend([col for alias in att.get("aliases", []) if alias == column])
matches = list(set(matches))
matches = list(set(matches))
if only_one:
if len(matches) == 0:

Expand Down
2 changes: 1 addition & 1 deletion opteryx/operators/aggregate_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def _build_aggs(aggregators, columns):
f"{aggregator.value.upper()}({display_field})"
] = f"{field_name}_{function}".replace("_hash_", "_")

return column_map, aggs
return column_map, list(set(aggs))


def _non_group_aggregates(aggregates, table, columns):
Expand Down
21 changes: 13 additions & 8 deletions opteryx/operators/sort_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

from pyarrow import Table, concat_tables

from opteryx.exceptions import SqlError
from opteryx.exceptions import ColumnNotFoundError, SqlError
from opteryx.managers.expression import format_expression
from opteryx.managers.expression import NodeType
from opteryx.models import Columns, QueryDirectives, QueryStatistics
Expand Down Expand Up @@ -109,14 +109,19 @@ def execute(self) -> Iterable:
)
)
else:
self._mapped_order.append(
(
columns.get_column_from_alias(
format_expression(column), only_one=True
),
direction,
try:
self._mapped_order.append(
(
columns.get_column_from_alias(
format_expression(column), only_one=True
),
direction,
)
)
except ColumnNotFoundError as cnfe:
raise ColumnNotFoundError(
f"`ORDER BY` must reference columns as they appear in the `SELECT` clause. {cnfe}"
)
)

table = table.sort_by(self._mapped_order)
self._statistics.time_ordering = time.time_ns() - start_time
Expand Down

0 comments on commit 2f33f45

Please sign in to comment.