diff --git a/blog/20220901 Lessons Learnt so Far.md b/blog/20220901 Lessons Learnt so Far.md index 98c134280..b13ad5548 100644 --- a/blog/20220901 Lessons Learnt so Far.md +++ b/blog/20220901 Lessons Learnt so Far.md @@ -4,4 +4,16 @@ - Unit testing is fine, but write hundreds of tests cases which run real SQL queries - You can't fabricate test data for all your test scenarios - Storage read speed will kill any performance boosts from algorithmic improvements -- If you don't control the writing of the data - assume the worst \ No newline at end of file +- If you don't control the writing of the data - assume the worst + + +PyArrow is awesome, but it has bugs, odd limitations and some parts are so slow it hurts. + +bugs +- date diff just doesn't work for dates + +odd limitations +- can't join on tables with arrays or structs + +so slow it hurts +- the file system abstractions are 4x slower than the next slowest ways to access S3 I've tried \ No newline at end of file diff --git a/opteryx/connectors/__init__.py b/opteryx/connectors/__init__.py index 6e0793e08..990181cc4 100644 --- a/opteryx/connectors/__init__.py +++ b/opteryx/connectors/__init__.py @@ -21,7 +21,6 @@ from opteryx.connectors.gcp_cloudstorage_connector import GcpCloudStorageConnector from opteryx.connectors.mongodb_connector import MongoDbConnector - WELL_KNOWN_ADAPTERS = { "disk": DiskConnector, "gcs": GcpCloudStorageConnector, diff --git a/opteryx/models/columns.py b/opteryx/models/columns.py index a529512ed..7ca790912 100644 --- a/opteryx/models/columns.py +++ b/opteryx/models/columns.py @@ -164,7 +164,7 @@ def get_column_from_alias(self, column, only_one: bool = False): matches = [] for col, att in self._column_metadata.items(): matches.extend([col for alias in att.get("aliases", []) if alias == column]) - matches = list(set(matches)) + matches = list(set(matches)) if only_one: if len(matches) == 0: diff --git a/opteryx/operators/aggregate_node.py b/opteryx/operators/aggregate_node.py index 
678a872f4..7166ad1ab 100644 --- a/opteryx/operators/aggregate_node.py +++ b/opteryx/operators/aggregate_node.py @@ -150,7 +150,7 @@ def _build_aggs(aggregators, columns): f"{aggregator.value.upper()}({display_field})" ] = f"{field_name}_{function}".replace("_hash_", "_") - return column_map, aggs + return column_map, list(set(aggs)) def _non_group_aggregates(aggregates, table, columns): diff --git a/opteryx/operators/sort_node.py b/opteryx/operators/sort_node.py index e35ab4c2a..5b1fbca8f 100644 --- a/opteryx/operators/sort_node.py +++ b/opteryx/operators/sort_node.py @@ -25,7 +25,7 @@ from pyarrow import Table, concat_tables -from opteryx.exceptions import SqlError +from opteryx.exceptions import ColumnNotFoundError, SqlError from opteryx.managers.expression import format_expression from opteryx.managers.expression import NodeType from opteryx.models import Columns, QueryDirectives, QueryStatistics @@ -109,14 +109,19 @@ def execute(self) -> Iterable: ) ) else: - self._mapped_order.append( - ( - columns.get_column_from_alias( - format_expression(column), only_one=True - ), - direction, + try: + self._mapped_order.append( + ( + columns.get_column_from_alias( + format_expression(column), only_one=True + ), + direction, + ) + ) + except ColumnNotFoundError as cnfe: + raise ColumnNotFoundError( + f"`ORDER BY` must reference columns as they appear in the `SELECT` clause. {cnfe}" ) - ) table = table.sort_by(self._mapped_order) self._statistics.time_ordering = time.time_ns() - start_time