diff --git a/Presentations/202204 Introduction to Query Engines.md b/blog/202204 Introduction to Query Engines.md similarity index 100% rename from Presentations/202204 Introduction to Query Engines.md rename to blog/202204 Introduction to Query Engines.md diff --git a/Presentations/202204 Introduction to Query Engines.pdf b/blog/202204 Introduction to Query Engines.pdf similarity index 100% rename from Presentations/202204 Introduction to Query Engines.pdf rename to blog/202204 Introduction to Query Engines.pdf diff --git a/docs/Contributor Guide/01 Guide.md b/docs/Contributor Guide/01 Guide.md index 9d9181173..1a2313a83 100644 --- a/docs/Contributor Guide/01 Guide.md +++ b/docs/Contributor Guide/01 Guide.md @@ -2,7 +2,7 @@ **Welcome to the Opteryx Contributor Guide.** -In this section you will find information to help you to bring your unique skills and experience and join us in building +In this section you will find information to help you to bring your unique skills and experience and join us in building Opteryx. Opteryx is primarily written in Python, but you don't need to be a Python developer to contribute. All contributions, [bug reports](https://github.com/mabel-dev/opteryx/issues/new/choose), documentation improvements, and [ideas](https://github.com/mabel-dev/opteryx/discussions) are welcome. 
diff --git a/docs/Contributor Guide/95 Project Structure.md b/docs/Contributor Guide/95 Project Structure.md index 31e63bea8..650051fac 100644 --- a/docs/Contributor Guide/95 Project Structure.md +++ b/docs/Contributor Guide/95 Project Structure.md @@ -9,8 +9,9 @@ Opteryx's repository folder structure is described below: ├── connectors/ <- modules to connect to data sources ├── functions/ <- modules to execute functions within SQL statements ├── managers/ <- libraries responsible for key functional units - │ ├── cache/ <- modules implementing the caching mechanism +mechanism │ ├── expression/ <- modules implementing expression evaluation +│ ├── kvstore/ <- modules implementing interfacing with KV Stores (internal usage) │ ├── process/ <- modules implementing process management │ ├── query/ │ │ └── planner/ <- modules implementing query planning diff --git a/docs/Deployment/Internals/Query Engine.md b/docs/Deployment/Internals/Query Engine.md index aad90bdae..12d0ea21f 100644 --- a/docs/Deployment/Internals/Query Engine.md +++ b/docs/Deployment/Internals/Query Engine.md @@ -2,7 +2,6 @@ > If you are interested in how databases work, I recommend the resources from [The CMU Database Group](https://db.cs.cmu.edu/) and the collection of resources at [Awesome Database Learning](https://github.com/pingcap/awesome-database-learning). - The Opteryx query engine has the following key components and general process: 1) The Parser & Lexer, which recieves the user SQL and builds an Abstract Syntax Tree (AST). diff --git a/docs/Features/Command Line.md b/docs/Features/Command Line.md new file mode 100644 index 000000000..6e5b568bc --- /dev/null +++ b/docs/Features/Command Line.md @@ -0,0 +1,12 @@ +# Command Line Interface + +Opteryx supports being used on the command line to process files using SQL to filter and process files. + +The full SQL feature set (e.g. time travel, caching) is available using the CLI. 
+ +The CLI can be used to save the results to output files for onward processing: + +- direct to the console +- CSV +- PARQUET +- JSONL diff --git a/docs/Release Notes/Change Log.md b/docs/Release Notes/Change Log.md index 69b827745..5b0fa0542 100644 --- a/docs/Release Notes/Change Log.md +++ b/docs/Release Notes/Change Log.md @@ -4,7 +4,7 @@ All notable changes to this project will be documented in this file, where appro The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased] +## [0.4.0] - 2022-09-12 **Added** diff --git a/opteryx/config.py b/opteryx/config.py index ddfa3f857..cbb681feb 100644 --- a/opteryx/config.py +++ b/opteryx/config.py @@ -48,4 +48,8 @@ MAX_SIZE_SINGLE_CACHE_ITEM: int = _config.get("MAX_SIZE_SINGLE_CACHE_ITEM", 1024 * 1024) # Approximate Page Size PAGE_SIZE: int = _config.get("PAGE_SIZE", 64 * 1024 * 1024) + + +# The number of metadata cache records to hold +LOCAL_METADATA_CACHE: int = int(_config.get("LOCAL_METADATA_CACHE", 512)) # fmt:on diff --git a/opteryx/connection.py b/opteryx/connection.py index c13f6ed43..b83a86721 100644 --- a/opteryx/connection.py +++ b/opteryx/connection.py @@ -24,7 +24,7 @@ from pyarrow import Table from opteryx.exceptions import CursorInvalidStateError, ProgrammingError, SqlError -from opteryx.managers.cache import BaseBufferCache +from opteryx.managers.kvstores import BaseKeyValueStore from opteryx.models import QueryStatistics from opteryx.utils import arrow @@ -39,7 +39,7 @@ class Connection: def __init__( self, *, - cache: Optional[BaseBufferCache] = None, + cache: Optional[BaseKeyValueStore] = None, **kwargs, ): self._results = None diff --git a/opteryx/managers/cache/__init__.py b/opteryx/managers/kvstores/__init__.py similarity index 68% rename from opteryx/managers/cache/__init__.py rename to opteryx/managers/kvstores/__init__.py index dc79c8d08..b5ecf829e 100644 
--- a/opteryx/managers/cache/__init__.py +++ b/opteryx/managers/kvstores/__init__.py @@ -10,6 +10,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .base_buffer_cache import BaseBufferCache -from .memcached_cache import MemcachedCache -from .memory_cache import InMemoryCache +from .base_kv_store import BaseKeyValueStore + +from .kv_firestore import FireStoreKVStore +from .kv_memory import InMemoryKVStore +from .kv_local import LocalKVStore +from .kv_memcached import MemcachedKVStore +from .kv_mongodb import MongoDbKVStore diff --git a/opteryx/managers/cache/base_buffer_cache.py b/opteryx/managers/kvstores/base_kv_store.py similarity index 83% rename from opteryx/managers/cache/base_buffer_cache.py rename to opteryx/managers/kvstores/base_kv_store.py index b2c283bb0..394f0c153 100644 --- a/opteryx/managers/cache/base_buffer_cache.py +++ b/opteryx/managers/kvstores/base_kv_store.py @@ -10,16 +10,15 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -This is a Base class for a caching layer. +This is a Base class for a Key-Value Storage adapter. -It's used by injecting an instantiated object into the Reader. If the object in None -we skip any caching, if it's set up, we use it as an aside cache. +This is used by the metadata store and in-memory buffer cache. 
""" import abc from typing import Optional -class BaseBufferCache(abc.ABC): +class BaseKeyValueStore(abc.ABC): """ Base class for cache objects """ diff --git a/opteryx/managers/kvstores/kv_firestore.py b/opteryx/managers/kvstores/kv_firestore.py new file mode 100644 index 000000000..c1b92a949 --- /dev/null +++ b/opteryx/managers/kvstores/kv_firestore.py @@ -0,0 +1,7 @@ +# TODO: rocks db + +from opteryx.managers.kvstores import BaseKeyValueStore + + +class FireStoreKVStore(BaseKeyValueStore): + pass diff --git a/opteryx/managers/kvstores/kv_local.py b/opteryx/managers/kvstores/kv_local.py new file mode 100644 index 000000000..3496e183f --- /dev/null +++ b/opteryx/managers/kvstores/kv_local.py @@ -0,0 +1,7 @@ +# TODO: rocks db + +from opteryx.managers.kvstores import BaseKeyValueStore + + +class LocalKVStore(BaseKeyValueStore): + pass diff --git a/opteryx/managers/cache/memcached_cache.py b/opteryx/managers/kvstores/kv_memcached.py similarity index 96% rename from opteryx/managers/cache/memcached_cache.py rename to opteryx/managers/kvstores/kv_memcached.py index d45c486d6..e8ac09f6d 100644 --- a/opteryx/managers/cache/memcached_cache.py +++ b/opteryx/managers/kvstores/kv_memcached.py @@ -16,7 +16,7 @@ import io import os from opteryx.exceptions import MissingDependencyError -from opteryx.managers.cache import BaseBufferCache +from opteryx.managers.kvstores import BaseKeyValueStore try: # added 3.9 @@ -65,7 +65,7 @@ def _memcached_server(**kwargs): ) -class MemcachedCache(BaseBufferCache): +class MemcachedKVStore(BaseKeyValueStore): """ Cache object """ diff --git a/opteryx/managers/cache/memory_cache.py b/opteryx/managers/kvstores/kv_memory.py similarity index 95% rename from opteryx/managers/cache/memory_cache.py rename to opteryx/managers/kvstores/kv_memory.py index 25c70267d..fd63d5b8e 100644 --- a/opteryx/managers/cache/memory_cache.py +++ b/opteryx/managers/kvstores/kv_memory.py @@ -21,10 +21,10 @@ import io -from opteryx.managers.cache import BaseBufferCache 
+from opteryx.managers.kvstores import BaseKeyValueStore -class InMemoryCache(BaseBufferCache): +class InMemoryKVStore(BaseKeyValueStore): def __init__(self, **kwargs): """ Parameters: diff --git a/opteryx/managers/kvstores/kv_mongodb.py b/opteryx/managers/kvstores/kv_mongodb.py new file mode 100644 index 000000000..29d7a78c1 --- /dev/null +++ b/opteryx/managers/kvstores/kv_mongodb.py @@ -0,0 +1,7 @@ +# TODO: rocks db + +from opteryx.managers.kvstores import BaseKeyValueStore + + +class MongoDbKVStore(BaseKeyValueStore): + pass diff --git a/opteryx/managers/query/planner/planner.py b/opteryx/managers/query/planner/planner.py index 729194ae0..2b379f3ea 100644 --- a/opteryx/managers/query/planner/planner.py +++ b/opteryx/managers/query/planner/planner.py @@ -55,7 +55,7 @@ from opteryx.managers.expression import NodeType from opteryx.managers.query.planner.temporal import extract_temporal_filters from opteryx.managers.query.planner import builders -from opteryx.models import Columns, ExecutionTree, QueryDirectives +from opteryx.models import Columns, ExecutionTree, QueryProperties from opteryx.utils import fuzzy_search @@ -71,7 +71,7 @@ class QueryPlanner(ExecutionTree): - def __init__(self, statistics, cache=None, directives=None): + def __init__(self, statistics, cache=None, properties=None): """ Planner creates a plan (Execution Tree or DAG) which presents the plan to respond to the query. 
@@ -81,10 +81,10 @@ def __init__(self, statistics, cache=None, directives=None): self._ast = None self._statistics = statistics - self._directives = ( - QueryDirectives() - if not isinstance(directives, QueryDirectives) - else directives + self._properties = ( + QueryProperties() + if not isinstance(properties, QueryProperties) + else properties ) self._cache = cache @@ -125,16 +125,22 @@ def create_plan(self, sql: str = None, ast: dict = None): else: self._ast = ast + # step one to allowing multiple plans - don't reference + # the array of ASTs in the plan + ast = self._ast[0] + # build a plan for the query - if "Query" in self._ast[0]: + if "Query" in ast: self._naive_select_planner(self._ast, self._statistics) - elif "Explain" in self._ast[0]: + elif "Explain" in ast: self._explain_planner(self._ast, self._statistics) - elif "ShowColumns" in self._ast[0]: + # elif "SetVariable" is self._ast[0]: + # self._set_variable_planner(self._ast, self._statistics) + elif "ShowColumns" in ast: self._show_columns_planner(self._ast, self._statistics) - elif "ShowVariable" in self._ast[0]: + elif "ShowVariable" in ast: self._show_variable_planner(self._ast, self._statistics) - elif "ShowCreate" in self._ast[0]: + elif "ShowCreate" in ast: self._show_create_planner(self._ast, self._statistics) else: # pragma: no cover raise SqlError("Unknown or unsupported Query type.") @@ -638,21 +644,21 @@ def _extract_having(self, ast): having = ast[0]["Query"]["body"]["Select"]["having"] return self._filter_extract(having) - def _extract_directives(self, ast): - return QueryDirectives() + def _extract_properties(self, ast): + return QueryProperties() def _explain_planner(self, ast, statistics): - directives = self._extract_directives(ast) + properties = self._extract_properties(ast) explain_plan = self.copy() explain_plan.create_plan(ast=[ast[0]["Explain"]["statement"]]) explain_node = operators.ExplainNode( - directives, statistics, query_plan=explain_plan + properties, statistics, 
query_plan=explain_plan ) self.add_operator("explain", explain_node) def _show_columns_planner(self, ast, statistics): - directives = self._extract_directives(ast) + properties = self._extract_properties(ast) dataset = ".".join( [part["value"] for part in ast[0]["ShowColumns"]["table_name"]] @@ -668,7 +674,7 @@ def _show_columns_planner(self, ast, statistics): self.add_operator( "reader", operators.reader_factory(mode)( - directives=directives, + properties=properties, statistics=statistics, dataset=dataset, alias=None, @@ -685,7 +691,7 @@ def _show_columns_planner(self, ast, statistics): self.add_operator( "filter", operators.ColumnSelectionNode( - directives=directives, statistics=statistics, filter=filters + properties=properties, statistics=statistics, filter=filters ), ) self.link_operators(last_node, "filter") @@ -694,7 +700,7 @@ def _show_columns_planner(self, ast, statistics): self.add_operator( "columns", operators.ShowColumnsNode( - directives=directives, + properties=properties, statistics=statistics, full=ast[0]["ShowColumns"]["full"], extended=ast[0]["ShowColumns"]["extended"], @@ -705,7 +711,7 @@ def _show_columns_planner(self, ast, statistics): def _show_create_planner(self, ast, statistics): - directives = self._extract_directives(ast) + properties = self._extract_properties(ast) if ast[0]["ShowCreate"]["obj_type"] != "Table": raise SqlError("SHOW CREATE only supports tables") @@ -722,7 +728,7 @@ def _show_create_planner(self, ast, statistics): self.add_operator( "reader", operators.reader_factory(mode)( - directives=directives, + properties=properties, statistics=statistics, dataset=dataset, alias=None, @@ -737,7 +743,7 @@ def _show_create_planner(self, ast, statistics): self.add_operator( "show_create", operators.ShowCreateNode( - directives=directives, statistics=statistics, table=dataset + properties=properties, statistics=statistics, table=dataset ), ) self.link_operators(last_node, "show_create") @@ -772,7 +778,7 @@ def 
_show_variable_planner(self, ast, statistics): The last word is the variable, preceeding words are modifiers. """ - directives = self._extract_directives(ast) + properties = self._extract_properties(ast) keywords = [ value["value"].upper() for value in ast[0]["ShowVariable"]["variable"] @@ -780,7 +786,7 @@ def _show_variable_planner(self, ast, statistics): if keywords[-1] == "FUNCTIONS": show_node = "show_functions" node = operators.ShowFunctionsNode( - directives=directives, + properties=properties, statistics=statistics, ) self.add_operator(show_node, operator=node) @@ -790,7 +796,7 @@ def _show_variable_planner(self, ast, statistics): name_column = ExpressionTreeNode(NodeType.IDENTIFIER, value="name") order_by_node = operators.SortNode( - directives=directives, + properties=properties, statistics=statistics, order=[([name_column], "ascending")], ) @@ -804,7 +810,7 @@ def _naive_select_planner(self, ast, statistics): The goal here is to create a plan to respond to the user, it creates has no clever tricks to improve performance. 
""" - directives = self._extract_directives(ast) + properties = self._extract_properties(ast) all_identifiers = self._extract_identifiers(ast) _relations = [r for r in self._extract_relations(ast)] @@ -823,7 +829,7 @@ def _naive_select_planner(self, ast, statistics): self.add_operator( "from", operators.reader_factory(mode)( - directives=directives, + properties=properties, statistics=statistics, alias=alias, dataset=dataset, @@ -868,7 +874,7 @@ def _naive_select_planner(self, ast, statistics): # Otherwise, the right table needs to come from the Reader right = operators.reader_factory(mode)( - directives=directives, + properties=properties, statistics=statistics, dataset=dataset, alias=right[0], @@ -886,7 +892,7 @@ def _naive_select_planner(self, ast, statistics): self.add_operator( f"join-{join_id}", join_node( - directives=directives, + properties=properties, statistics=statistics, join_type=join_type, join_on=join_on, @@ -904,7 +910,7 @@ def _naive_select_planner(self, ast, statistics): if _selection: self.add_operator( "where", - operators.SelectionNode(directives, statistics, filter=_selection), + operators.SelectionNode(properties, statistics, filter=_selection), ) self.link_operators(last_node, "where") last_node = "where" @@ -935,7 +941,7 @@ def _naive_select_planner(self, ast, statistics): self.add_operator( "agg", operators.AggregateNode( - directives, statistics, aggregates=_aggregates, groups=_groups + properties, statistics, aggregates=_aggregates, groups=_groups ), ) self.link_operators(last_node, "agg") @@ -945,7 +951,7 @@ def _naive_select_planner(self, ast, statistics): if _having: self.add_operator( "having", - operators.SelectionNode(directives, statistics, filter=_having), + operators.SelectionNode(properties, statistics, filter=_having), ) self.link_operators(last_node, "having") last_node = "having" @@ -961,7 +967,7 @@ def _naive_select_planner(self, ast, statistics): self.add_operator( "select", operators.ProjectionNode( - directives, 
statistics, projection=_projection + properties, statistics, projection=_projection ), ) self.link_operators(last_node, "select") @@ -970,7 +976,7 @@ def _naive_select_planner(self, ast, statistics): _distinct = self._extract_distinct(ast) if _distinct: self.add_operator( - "distinct", operators.DistinctNode(directives, statistics) + "distinct", operators.DistinctNode(properties, statistics) ) self.link_operators(last_node, "distinct") last_node = "distinct" @@ -978,7 +984,7 @@ def _naive_select_planner(self, ast, statistics): _order = self._extract_order(ast) if _order: self.add_operator( - "order", operators.SortNode(directives, statistics, order=_order) + "order", operators.SortNode(properties, statistics, order=_order) ) self.link_operators(last_node, "order") last_node = "order" @@ -986,7 +992,7 @@ def _naive_select_planner(self, ast, statistics): _offset = self._extract_offset(ast) if _offset: self.add_operator( - "offset", operators.OffsetNode(directives, statistics, offset=_offset) + "offset", operators.OffsetNode(properties, statistics, offset=_offset) ) self.link_operators(last_node, "offset") last_node = "offset" @@ -995,7 +1001,7 @@ def _naive_select_planner(self, ast, statistics): # 0 limit is valid if _limit is not None: self.add_operator( - "limit", operators.LimitNode(directives, statistics, limit=_limit) + "limit", operators.LimitNode(properties, statistics, limit=_limit) ) self.link_operators(last_node, "limit") last_node = "limit" diff --git a/opteryx/models/__init__.py b/opteryx/models/__init__.py index 74fd6d2d5..4e3e7db10 100644 --- a/opteryx/models/__init__.py +++ b/opteryx/models/__init__.py @@ -12,5 +12,5 @@ from .columns import Columns from .execution_tree import ExecutionTree -from .query_directives import QueryDirectives +from .query_properties import QueryProperties from .query_statistics import QueryStatistics diff --git a/opteryx/models/query_directives.py b/opteryx/models/query_properties.py similarity index 83% rename from 
opteryx/models/query_directives.py rename to opteryx/models/query_properties.py index 9cec48db3..a46582b39 100644 --- a/opteryx/models/query_directives.py +++ b/opteryx/models/query_properties.py @@ -11,11 +11,11 @@ # limitations under the License. -class QueryDirectives: +class QueryProperties: """ - Hints and directives to use when executing queries. + Hints and properties to use when executing queries. """ def __init__(self): - self.disable_cache: bool = False + self.variables = {} diff --git a/opteryx/operators/aggregate_node.py b/opteryx/operators/aggregate_node.py index 7166ad1ab..97a5c9e83 100644 --- a/opteryx/operators/aggregate_node.py +++ b/opteryx/operators/aggregate_node.py @@ -32,7 +32,7 @@ from opteryx.managers.expression import NodeType from opteryx.managers.expression import evaluate_and_append from opteryx.managers.expression import format_expression -from opteryx.models import QueryDirectives, QueryStatistics +from opteryx.models import QueryProperties, QueryStatistics from opteryx.operators import BasePlanNode from opteryx.models.columns import Columns @@ -212,9 +212,9 @@ def _extract_functions(aggregates): class AggregateNode(BasePlanNode): def __init__( - self, directives: QueryDirectives, statistics: QueryStatistics, **config + self, properties: QueryProperties, statistics: QueryStatistics, **config ): - super().__init__(directives=directives, statistics=statistics) + super().__init__(properties=properties, statistics=statistics) self._aggregates = config.get("aggregates", []) diff --git a/opteryx/operators/base_plan_node.py b/opteryx/operators/base_plan_node.py index 1c6298de7..3a80477b6 100644 --- a/opteryx/operators/base_plan_node.py +++ b/opteryx/operators/base_plan_node.py @@ -13,7 +13,7 @@ import abc -from opteryx.models import QueryDirectives, QueryStatistics +from opteryx.models import QueryProperties, QueryStatistics class BasePlanNode(abc.ABC): @@ -21,7 +21,7 @@ class BasePlanNode(abc.ABC): _producers = None def __init__( - self, 
directives: QueryDirectives, statistics: QueryStatistics, **config + self, properties: QueryProperties, statistics: QueryStatistics, **config ): """ This is the base class for nodes in the execution plan. @@ -30,7 +30,7 @@ def __init__( different nodes differently to record what happened during the query execution. """ - self._directives = directives + self._properties = properties self._statistics = statistics def __call__(self): diff --git a/opteryx/operators/blob_reader_node.py b/opteryx/operators/blob_reader_node.py index f5998f315..a756fd968 100644 --- a/opteryx/operators/blob_reader_node.py +++ b/opteryx/operators/blob_reader_node.py @@ -30,7 +30,7 @@ from opteryx.exceptions import DatabaseError from opteryx.managers.schemes import MabelPartitionScheme from opteryx.managers.schemes import DefaultPartitionScheme -from opteryx.models import Columns, QueryDirectives, QueryStatistics +from opteryx.models import Columns, QueryProperties, QueryStatistics from opteryx.operators import BasePlanNode from opteryx.utils import file_decoders @@ -97,13 +97,13 @@ class BlobReaderNode(BasePlanNode): _disable_cache = False def __init__( - self, directives: QueryDirectives, statistics: QueryStatistics, **config + self, properties: QueryProperties, statistics: QueryStatistics, **config ): """ The Blob Reader Node is responsible for reading the relevant blobs and returning a Table/Relation. 
""" - super().__init__(directives=directives, statistics=statistics) + super().__init__(properties=properties, statistics=statistics) today = datetime.datetime.utcnow().date() diff --git a/opteryx/operators/collection_reader_node.py b/opteryx/operators/collection_reader_node.py index b54dc8ce1..293da16c3 100644 --- a/opteryx/operators/collection_reader_node.py +++ b/opteryx/operators/collection_reader_node.py @@ -23,20 +23,20 @@ import pyarrow -from opteryx.models import QueryDirectives, QueryStatistics +from opteryx.models import QueryProperties, QueryStatistics from opteryx.operators import BasePlanNode from opteryx.models.columns import Columns class CollectionReaderNode(BasePlanNode): def __init__( - self, directives: QueryDirectives, statistics: QueryStatistics, **config + self, properties: QueryProperties, statistics: QueryStatistics, **config ): """ The Collection Reader Node is responsible for reading the relevant documents from a NoSQL document store and returning a Table/Relation. 
""" - super().__init__(directives=directives, statistics=statistics) + super().__init__(properties=properties, statistics=statistics) self._statistics = statistics self._alias = config.get("alias") diff --git a/opteryx/operators/column_selection_node.py b/opteryx/operators/column_selection_node.py index d2fd08c1b..7ee418a37 100644 --- a/opteryx/operators/column_selection_node.py +++ b/opteryx/operators/column_selection_node.py @@ -19,16 +19,16 @@ """ from typing import Iterable -from opteryx.models import Columns, QueryDirectives, QueryStatistics +from opteryx.models import Columns, QueryProperties, QueryStatistics from opteryx.operators import BasePlanNode from opteryx.exceptions import SqlError class ColumnSelectionNode(BasePlanNode): def __init__( - self, directives: QueryDirectives, statistics: QueryStatistics, **config + self, properties: QueryProperties, statistics: QueryStatistics, **config ): - super().__init__(directives=directives, statistics=statistics) + super().__init__(properties=properties, statistics=statistics) self._filter = config.get("filter", True) @property diff --git a/opteryx/operators/cross_join_node.py b/opteryx/operators/cross_join_node.py index 28a54b7b2..1223f494a 100644 --- a/opteryx/operators/cross_join_node.py +++ b/opteryx/operators/cross_join_node.py @@ -26,7 +26,7 @@ from opteryx import config from opteryx.exceptions import SqlError from opteryx.managers.expression import NodeType -from opteryx.models import Columns, QueryDirectives, QueryStatistics +from opteryx.models import Columns, QueryProperties, QueryStatistics from opteryx.operators import BasePlanNode @@ -188,9 +188,9 @@ class CrossJoinNode(BasePlanNode): """ def __init__( - self, directives: QueryDirectives, statistics: QueryStatistics, **config + self, properties: QueryProperties, statistics: QueryStatistics, **config ): - super().__init__(directives=directives, statistics=statistics) + super().__init__(properties=properties, statistics=statistics) self._right_table = 
config.get("right_table") self._join_type = config.get("join_type", "CrossJoin") diff --git a/opteryx/operators/distinct_node.py b/opteryx/operators/distinct_node.py index 3b0636c0a..71ecc2046 100644 --- a/opteryx/operators/distinct_node.py +++ b/opteryx/operators/distinct_node.py @@ -22,16 +22,16 @@ from pyarrow import Table, concat_tables from opteryx.exceptions import SqlError -from opteryx.models import QueryDirectives, QueryStatistics +from opteryx.models import QueryProperties, QueryStatistics from opteryx.operators import BasePlanNode from opteryx.third_party.pyarrow_ops import drop_duplicates class DistinctNode(BasePlanNode): def __init__( - self, directives: QueryDirectives, statistics: QueryStatistics, **config + self, properties: QueryProperties, statistics: QueryStatistics, **config ): - super().__init__(directives=directives, statistics=statistics) + super().__init__(properties=properties, statistics=statistics) self._distinct = config.get("distinct", True) @property diff --git a/opteryx/operators/explain_node.py b/opteryx/operators/explain_node.py index a1a1779e5..fb58914a5 100644 --- a/opteryx/operators/explain_node.py +++ b/opteryx/operators/explain_node.py @@ -19,15 +19,15 @@ """ from typing import Iterable -from opteryx.models import QueryDirectives, QueryStatistics +from opteryx.models import QueryProperties, QueryStatistics from opteryx.operators import BasePlanNode class ExplainNode(BasePlanNode): def __init__( - self, directives: QueryDirectives, statistics: QueryStatistics, **config + self, properties: QueryProperties, statistics: QueryStatistics, **config ): - super().__init__(directives=directives, statistics=statistics) + super().__init__(properties=properties, statistics=statistics) self._query_plan = config.get("query_plan") @property diff --git a/opteryx/operators/function_dataset_node.py b/opteryx/operators/function_dataset_node.py index 652aabc20..2a415716b 100644 --- a/opteryx/operators/function_dataset_node.py +++ 
b/opteryx/operators/function_dataset_node.py @@ -25,7 +25,7 @@ import pyarrow -from opteryx.models import Columns, QueryDirectives, QueryStatistics +from opteryx.models import Columns, QueryProperties, QueryStatistics from opteryx.managers.expression import NodeType, evaluate from opteryx.operators import BasePlanNode from opteryx.exceptions import SqlError @@ -77,13 +77,13 @@ def _inner(rows, columns): class FunctionDatasetNode(BasePlanNode): def __init__( - self, directives: QueryDirectives, statistics: QueryStatistics, **config + self, properties: QueryProperties, statistics: QueryStatistics, **config ): """ The Blob Reader Node is responsible for reading the relevant blobs and returning a Table/Relation. """ - super().__init__(directives=directives, statistics=statistics) + super().__init__(properties=properties, statistics=statistics) self._statistics = statistics self._alias = config["alias"] self._function = config["dataset"]["function"] diff --git a/opteryx/operators/inner_join_node.py b/opteryx/operators/inner_join_node.py index 455f3e73c..593531cea 100644 --- a/opteryx/operators/inner_join_node.py +++ b/opteryx/operators/inner_join_node.py @@ -25,7 +25,7 @@ from opteryx import config from opteryx.operators import BasePlanNode -from opteryx.models import Columns, QueryDirectives, QueryStatistics +from opteryx.models import Columns, QueryProperties, QueryStatistics from opteryx.exceptions import SqlError from opteryx.third_party import pyarrow_ops from opteryx.utils import arrow @@ -49,9 +49,9 @@ def calculate_batch_size(cardinality): class InnerJoinNode(BasePlanNode): def __init__( - self, directives: QueryDirectives, statistics: QueryStatistics, **config + self, properties: QueryProperties, statistics: QueryStatistics, **config ): - super().__init__(directives=directives, statistics=statistics) + super().__init__(properties=properties, statistics=statistics) self._right_table = config.get("right_table") self._join_type = config.get("join_type", 
"CrossJoin") self._on = config.get("join_on") diff --git a/opteryx/operators/internal_dataset_node.py b/opteryx/operators/internal_dataset_node.py index a3145218d..749b3c92b 100644 --- a/opteryx/operators/internal_dataset_node.py +++ b/opteryx/operators/internal_dataset_node.py @@ -23,7 +23,7 @@ from opteryx import samples from opteryx.exceptions import DatabaseError -from opteryx.models import Columns, QueryDirectives, QueryStatistics +from opteryx.models import Columns, QueryProperties, QueryStatistics from opteryx.operators import BasePlanNode @@ -68,13 +68,13 @@ def _get_sample_dataset(dataset, alias): class InternalDatasetNode(BasePlanNode): def __init__( - self, directives: QueryDirectives, statistics: QueryStatistics, **config + self, properties: QueryProperties, statistics: QueryStatistics, **config ): """ The Blob Reader Node is responsible for reading the relevant blobs and returning a Table/Relation. """ - super().__init__(directives=directives, statistics=statistics) + super().__init__(properties=properties, statistics=statistics) self._statistics = statistics self._alias = config["alias"] diff --git a/opteryx/operators/limit_node.py b/opteryx/operators/limit_node.py index c4ccd37ae..04c90f0c0 100644 --- a/opteryx/operators/limit_node.py +++ b/opteryx/operators/limit_node.py @@ -22,15 +22,15 @@ from pyarrow import Table, concat_tables from opteryx.exceptions import SqlError -from opteryx.models import QueryDirectives, QueryStatistics +from opteryx.models import QueryProperties, QueryStatistics from opteryx.operators import BasePlanNode class LimitNode(BasePlanNode): def __init__( - self, directives: QueryDirectives, statistics: QueryStatistics, **config + self, properties: QueryProperties, statistics: QueryStatistics, **config ): - super().__init__(directives=directives, statistics=statistics) + super().__init__(properties=properties, statistics=statistics) self._limit = config.get("limit") @property diff --git a/opteryx/operators/offset_node.py 
b/opteryx/operators/offset_node.py index 16b3b62d1..3dd20485a 100644 --- a/opteryx/operators/offset_node.py +++ b/opteryx/operators/offset_node.py @@ -22,15 +22,15 @@ import pyarrow from opteryx.exceptions import SqlError -from opteryx.models import QueryDirectives, QueryStatistics +from opteryx.models import QueryProperties, QueryStatistics from opteryx.operators import BasePlanNode class OffsetNode(BasePlanNode): def __init__( - self, directives: QueryDirectives, statistics: QueryStatistics, **config + self, properties: QueryProperties, statistics: QueryStatistics, **config ): - super().__init__(directives=directives, statistics=statistics) + super().__init__(properties=properties, statistics=statistics) self._offset = config.get("offset") @property diff --git a/opteryx/operators/outer_join_node.py b/opteryx/operators/outer_join_node.py index 098f35162..bfb38edb5 100644 --- a/opteryx/operators/outer_join_node.py +++ b/opteryx/operators/outer_join_node.py @@ -22,7 +22,7 @@ import pyarrow from opteryx.exceptions import SqlError -from opteryx.models import Columns, QueryDirectives, QueryStatistics +from opteryx.models import Columns, QueryProperties, QueryStatistics from opteryx.operators import BasePlanNode from opteryx.utils import arrow @@ -35,9 +35,9 @@ class OuterJoinNode(BasePlanNode): def __init__( - self, directives: QueryDirectives, statistics: QueryStatistics, **config + self, properties: QueryProperties, statistics: QueryStatistics, **config ): - super().__init__(directives=directives, statistics=statistics) + super().__init__(properties=properties, statistics=statistics) self._join_type = OUTER_JOINS[config.get("join_type")] self._on = config.get("join_on") self._using = config.get("join_using") diff --git a/opteryx/operators/projection_node.py b/opteryx/operators/projection_node.py index 55d8989f7..bc3c86fb2 100644 --- a/opteryx/operators/projection_node.py +++ b/opteryx/operators/projection_node.py @@ -28,19 +28,19 @@ from opteryx.managers.expression 
import evaluate_and_append from opteryx.managers.expression import format_expression from opteryx.managers.expression import NodeType, LITERAL_TYPE -from opteryx.models import QueryDirectives, QueryStatistics +from opteryx.models import QueryProperties, QueryStatistics from opteryx.operators import BasePlanNode from opteryx.utils import random_int class ProjectionNode(BasePlanNode): def __init__( - self, directives: QueryDirectives, statistics: QueryStatistics, **config + self, properties: QueryProperties, statistics: QueryStatistics, **config ): """ Attribute Projection, remove unwanted columns and performs column renames. """ - super().__init__(directives=directives, statistics=statistics) + super().__init__(properties=properties, statistics=statistics) self._projection: dict = {} self._expressions = [] diff --git a/opteryx/operators/selection_node.py b/opteryx/operators/selection_node.py index c4efcb2b0..1b63b7d72 100644 --- a/opteryx/operators/selection_node.py +++ b/opteryx/operators/selection_node.py @@ -27,16 +27,16 @@ from opteryx.attribute_types import TOKEN_TYPES from opteryx.exceptions import SqlError from opteryx.managers.expression import evaluate -from opteryx.models import QueryDirectives, QueryStatistics +from opteryx.models import QueryProperties, QueryStatistics from opteryx.operators import BasePlanNode from opteryx.utils.arrow import consolidate_pages class SelectionNode(BasePlanNode): def __init__( - self, directives: QueryDirectives, statistics: QueryStatistics, **config + self, properties: QueryProperties, statistics: QueryStatistics, **config ): - super().__init__(directives=directives, statistics=statistics) + super().__init__(properties=properties, statistics=statistics) self._filter = config.get("filter") self._unfurled_filter = None self._mapped_filter = None diff --git a/opteryx/operators/show_columns_node.py b/opteryx/operators/show_columns_node.py index 648c30c80..d07029b8a 100644 --- a/opteryx/operators/show_columns_node.py +++ 
b/opteryx/operators/show_columns_node.py @@ -30,7 +30,7 @@ from opteryx.attribute_types import OPTERYX_TYPES, determine_type from opteryx.exceptions import SqlError -from opteryx.models import Columns, QueryDirectives, QueryStatistics +from opteryx.models import Columns, QueryProperties, QueryStatistics from opteryx.operators import BasePlanNode MAX_COLLECTOR: int = 17 @@ -360,9 +360,9 @@ def _extended_collector(pages): class ShowColumnsNode(BasePlanNode): def __init__( - self, directives: QueryDirectives, statistics: QueryStatistics, **config + self, properties: QueryProperties, statistics: QueryStatistics, **config ): - super().__init__(directives=directives, statistics=statistics) + super().__init__(properties=properties, statistics=statistics) self._full = config.get("full") self._extended = config.get("extended") diff --git a/opteryx/operators/show_create_node.py b/opteryx/operators/show_create_node.py index b0ffb146b..aeb7ac503 100644 --- a/opteryx/operators/show_create_node.py +++ b/opteryx/operators/show_create_node.py @@ -22,16 +22,16 @@ import pyarrow from opteryx.exceptions import SqlError -from opteryx.models import QueryDirectives, QueryStatistics +from opteryx.models import QueryProperties, QueryStatistics from opteryx.operators import BasePlanNode from opteryx.models import Columns class ShowCreateNode(BasePlanNode): def __init__( - self, directives: QueryDirectives, statistics: QueryStatistics, **config + self, properties: QueryProperties, statistics: QueryStatistics, **config ): - super().__init__(directives=directives, statistics=statistics) + super().__init__(properties=properties, statistics=statistics) self._table = config.get("table") @property diff --git a/opteryx/operators/show_functions_node.py b/opteryx/operators/show_functions_node.py index 3e16f5edf..c7aa53492 100644 --- a/opteryx/operators/show_functions_node.py +++ b/opteryx/operators/show_functions_node.py @@ -19,15 +19,15 @@ import pyarrow from opteryx import operators, functions 
-from opteryx.models import Columns, QueryDirectives, QueryStatistics +from opteryx.models import Columns, QueryProperties, QueryStatistics from opteryx.operators import BasePlanNode class ShowFunctionsNode(BasePlanNode): def __init__( - self, directives: QueryDirectives, statistics: QueryStatistics, **config + self, properties: QueryProperties, statistics: QueryStatistics, **config ): - super().__init__(directives=directives, statistics=statistics) + super().__init__(properties=properties, statistics=statistics) self._full = config.get("full") self._extended = config.get("extended") diff --git a/opteryx/operators/sort_node.py b/opteryx/operators/sort_node.py index 5b1fbca8f..8ce73ef96 100644 --- a/opteryx/operators/sort_node.py +++ b/opteryx/operators/sort_node.py @@ -28,15 +28,15 @@ from opteryx.exceptions import ColumnNotFoundError, SqlError from opteryx.managers.expression import format_expression from opteryx.managers.expression import NodeType -from opteryx.models import Columns, QueryDirectives, QueryStatistics +from opteryx.models import Columns, QueryProperties, QueryStatistics from opteryx.operators import BasePlanNode class SortNode(BasePlanNode): def __init__( - self, directives: QueryDirectives, statistics: QueryStatistics, **config + self, properties: QueryProperties, statistics: QueryStatistics, **config ): - super().__init__(directives=directives, statistics=statistics) + super().__init__(properties=properties, statistics=statistics) self._order = config.get("order", []) self._mapped_order: List = [] @@ -93,7 +93,7 @@ def execute(self) -> Iterable: return raise SqlError( - "ORDER BY only supports RAND() as a functional sort order." + "`ORDER BY` only supports `RAND()` as a functional sort order." 
) elif column.token_type == NodeType.LITERAL_NUMERIC: diff --git a/opteryx/version.py b/opteryx/version.py index b9e552928..8919c6018 100644 --- a/opteryx/version.py +++ b/opteryx/version.py @@ -16,4 +16,5 @@ 2) we can import it in setup.py for the same reason """ -__version__ = "0.4.0-alpha.6" +# __version__ = "0.4.0-alpha.6" +__version__ = "0.4.0" diff --git a/testdata/formats/csv/tweets.csv b/testdata/formats/csv/tweets.csv index f840daf23..b2e293a70 100644 --- a/testdata/formats/csv/tweets.csv +++ b/testdata/formats/csv/tweets.csv @@ -57755,7 +57755,7 @@ FreeBSD Security Advisory FreeBSD-SA-19:17.fd https://t.co/tw2K7FRK1B",False,9.413894967713997e+17,VulmonFeeds 5d38a16e7f4a9d0198283a02,,560,,,2019-07-24 18:20:25,"CVE-2019-0220 -A vulnerability was found in Apache HTTP Server 2.4.0 to 2.4.38. When the path component of a request URL contains multiple consecutive slashes ('/'), directives such as LocationMatch and RewriteRule ... +A vulnerability was found in Apache HTTP Server 2.4.0 to 2.4.38. When the path component of a request URL contains multiple consecutive slashes ('/'), directives such as LocationMatch and RewriteRule ... https://t.co/0tSuyOOraQ",False,9.413894967713997e+17,VulmonFeeds 5d38a2d07f4a9d01982849b6,,93,,,2019-07-24 18:26:18,"If you haven't patched CVE-2019-0708 - please make it a priority - this is only a matter of time now...
diff --git a/tests/performance/performance_storage_types.py b/tests/performance/performance_storage_types.py index 546c1989a..8cf9684e6 100644 --- a/tests/performance/performance_storage_types.py +++ b/tests/performance/performance_storage_types.py @@ -16,7 +16,7 @@ import opteryx from opteryx.connectors import DiskConnector -from opteryx.managers.cache import InMemoryCache +from opteryx.managers.kvstores import InMemoryKVStore as InMemoryCache import time diff --git a/tests/storage/test_in_memory_cache.py b/tests/storage/test_in_memory_cache.py index e2c4ac1e1..23fe41b76 100644 --- a/tests/storage/test_in_memory_cache.py +++ b/tests/storage/test_in_memory_cache.py @@ -8,12 +8,12 @@ sys.path.insert(1, os.path.join(sys.path[0], "../..")) import opteryx -from opteryx.managers.cache import InMemoryCache +from opteryx.managers.kvstores import InMemoryKVStore def test_in_memory_cache(): - cache = InMemoryCache(size=5) + cache = InMemoryKVStore(size=5) # read the data once, this should populate the cache conn = opteryx.connect(cache=cache) diff --git a/tests/storage/test_memcached_cache.py b/tests/storage/test_memcached_cache.py index 9f59ddec1..256744722 100644 --- a/tests/storage/test_memcached_cache.py +++ b/tests/storage/test_memcached_cache.py @@ -12,9 +12,9 @@ def test_memcached_cache(): import opteryx - from opteryx.managers.cache import MemcachedCache + from opteryx.managers.kvstores import MemcachedKVStore - cache = MemcachedCache(server="localhost:11211") + cache = MemcachedKVStore(server="localhost:11211") # read the data once, this should populate the cache if it hasn't already conn = opteryx.connect(cache=cache)