Merge pull request #508 from mabel-dev/FIX/#505
FIX/#505
joocer authored Sep 11, 2022
2 parents 7275c84 + 2f33f45 commit 17e49ed
Showing 5 changed files with 28 additions and 12 deletions.
14 changes: 13 additions & 1 deletion blog/20220901 Lessons Learnt so Far.md
@@ -4,4 +4,16 @@
- Unit testing is fine, but write hundreds of test cases which run real SQL queries
- You can't fabricate test data for all your test scenarios
- Storage read speed will kill any performance boosts from algorithmic improvements
- If you don't control the writing of the data - assume the worst
- If you don't control the writing of the data - assume the worst


PyArrow is awesome, but it has bugs, odd limitations, and some parts that are so slow it hurts.

bugs
- date diff just doesn't work for dates

odd limitations
- can't join on tables with arrays or structs (see the sketch after this list)

so slow it hurts
- the file system abstractions are 4x slower than the next slowest way I've tried to access S3
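
As an aside on the struct/array join limitation above, here is a minimal sketch (my own illustration, not code from this repository) of what PyArrow does when a joined table carries a struct column; it assumes a PyArrow version that ships `Table.join` (7.0 or later):

```python
import pyarrow as pa

# Two small tables; `payload` is inferred as a struct column.
left = pa.table({"id": [1, 2], "payload": [{"a": 1}, {"a": 2}]})
right = pa.table({"id": [1, 2], "value": ["x", "y"]})

try:
    left.join(right, keys="id")
except pa.ArrowInvalid as err:
    # At the time of writing, the hash join rejects nested (struct/list) columns.
    print(f"join refused: {err}")
```
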
1 change: 0 additions & 1 deletion opteryx/connectors/__init__.py
@@ -21,7 +21,6 @@
from opteryx.connectors.gcp_cloudstorage_connector import GcpCloudStorageConnector
from opteryx.connectors.mongodb_connector import MongoDbConnector


WELL_KNOWN_ADAPTERS = {
"disk": DiskConnector,
"gcs": GcpCloudStorageConnector,
2 changes: 1 addition & 1 deletion opteryx/models/columns.py
@@ -164,7 +164,7 @@ def get_column_from_alias(self, column, only_one: bool = False):
matches = []
for col, att in self._column_metadata.items():
matches.extend([col for alias in att.get("aliases", []) if alias == column])
matches = list(set(matches))
matches = list(set(matches))
if only_one:
if len(matches) == 0:

2 changes: 1 addition & 1 deletion opteryx/operators/aggregate_node.py
@@ -150,7 +150,7 @@ def _build_aggs(aggregators, columns):
f"{aggregator.value.upper()}({display_field})"
] = f"{field_name}_{function}".replace("_hash_", "_")

return column_map, aggs
return column_map, list(set(aggs))


def _non_group_aggregates(aggregates, table, columns):
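
The change above returns `list(set(aggs))` instead of `aggs`, so duplicate aggregation requests are only carried forward once. A minimal sketch of the effect, assuming `aggs` is a list of hashable (column, function) pairs; the real structure in `aggregate_node.py` may differ:

```python
# Hypothetical aggregation specs with a duplicate entry.
aggs = [("value", "sum"), ("value", "sum"), ("value", "count")]

deduplicated = list(set(aggs))
# Duplicates are gone, but note that `set` does not preserve the original order.
print(sorted(deduplicated))  # [('value', 'count'), ('value', 'sum')]
```
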
21 changes: 13 additions & 8 deletions opteryx/operators/sort_node.py
@@ -25,7 +25,7 @@

from pyarrow import Table, concat_tables

from opteryx.exceptions import SqlError
from opteryx.exceptions import ColumnNotFoundError, SqlError
from opteryx.managers.expression import format_expression
from opteryx.managers.expression import NodeType
from opteryx.models import Columns, QueryDirectives, QueryStatistics
@@ -109,14 +109,19 @@ def execute(self) -> Iterable:
)
)
else:
self._mapped_order.append(
(
columns.get_column_from_alias(
format_expression(column), only_one=True
),
direction,
try:
self._mapped_order.append(
(
columns.get_column_from_alias(
format_expression(column), only_one=True
),
direction,
)
)
except ColumnNotFoundError as cnfe:
raise ColumnNotFoundError(
f"`ORDER BY` must reference columns as they appear in the `SELECT` clause. {cnfe}"
)
)

table = table.sort_by(self._mapped_order)
self._statistics.time_ordering = time.time_ns() - start_time
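
The new `try`/`except` above catches the column lookup failure and re-raises it with a message that states the user-facing rule. A standalone sketch of that error-wrapping pattern (illustrative names, not Opteryx code):

```python
class ColumnNotFoundError(Exception):
    """Raised when a referenced column cannot be resolved."""

def resolve_order_by_column(lookup, expression):
    # Catch the low-level lookup failure and re-raise it with a message that
    # explains the rule, keeping the original detail appended to the text.
    try:
        return lookup(expression)
    except ColumnNotFoundError as cnfe:
        raise ColumnNotFoundError(
            f"`ORDER BY` must reference columns as they appear in the "
            f"`SELECT` clause. {cnfe}"
        ) from cnfe
```

Using `from cnfe` is optional; the committed code simply re-raises with the combined message.
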
