From a7d8abdc840f12358fce26611b15bda0511e52ab Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 13 Feb 2020 16:16:33 -0500 Subject: [PATCH 01/27] Bump pytest in requirements --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 32fda0a..6dc0f03 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ codecov==2.0.15 Flask>=1.1,<2.0 jsonschema==3.2.0 psycopg2-binary==2.8.4 -pytest==5.3.4 +pytest==5.3.5 pytest-cov==2.8.1 pytest-django==3.8.0 python-dateutil==2.8.1 From 3fb09a02baf350b71f39c860708e5660b06f049c Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 13 Feb 2020 16:40:33 -0500 Subject: [PATCH 02/27] Refactor search to allow queries on the same [item] --- chord_lib/search/data_structure.py | 205 ++++++++++++++++++++--------- tests/test_search.py | 74 ++++++++--- 2 files changed, 197 insertions(+), 82 deletions(-) diff --git a/chord_lib/search/data_structure.py b/chord_lib/search/data_structure.py index b9d3a58..a180f69 100644 --- a/chord_lib/search/data_structure.py +++ b/chord_lib/search/data_structure.py @@ -1,8 +1,6 @@ import jsonschema -from collections.abc import Iterable -from itertools import chain from operator import and_, or_, not_, lt, le, eq, gt, ge, contains -from typing import Callable, Dict, List, Tuple, Union +from typing import Callable, Dict, List, Tuple, Union, Optional from .queries import * @@ -14,55 +12,89 @@ QueryableStructure = Union[BaseQueryableStructure, Tuple["QueryableStructure"]] BBOperator = Callable[[BaseQueryableStructure, BaseQueryableStructure], bool] +ResolveDict = Dict[str, Tuple[int, "ResolveDict"]] +IndexCombination = Dict[str, int] -def _is_gen_flatten_compatible(t): + +def _validate_data_structure_against_schema(data_structure: QueryableStructure, schema: dict): try: - iter(t) - return not isinstance(t, str) and not isinstance(t, list) and not isinstance(t, dict) - except TypeError: - return False + jsonschema.validate(data_structure, schema) + except jsonschema.ValidationError: + raise ValueError("Invalid data structure: \n{}\nFor Schema: \n{}".format(data_structure, schema)) -def generator_flatten(t) -> Iterable: - """ - Flattens a possibly nested tuple or base queryable structure into a 1-dimensional generator. - :param t: The value to flatten and or expand into a 1-dimensional generator. - :return: A generator of flattened values. - """ - return chain.from_iterable(generator_flatten(v) for v in t) if _is_gen_flatten_compatible(t) else (t,) +def _validate_not_wc(e: Expression): + if e.fn == FUNCTION_HELPER_WC: + raise NotImplementedError("Cannot use wildcard helper here") -def evaluate(ast: AST, data_structure: QueryableStructure, schema: dict, internal: bool = False, - validate: bool = True) -> QueryableStructure: +def evaluate(ast: AST, data_structure: QueryableStructure, schema: dict, + index_combination: Optional[IndexCombination], internal: bool = False, validate: bool = True)\ + -> QueryableStructure: """ Evaluates a query expression into a value, populated by a passed data structure. :param ast: A query expression. :param data_structure: A data structure from which to resolve values. :param schema: The JSON schema for data objects being queried. + :param index_combination: The combination of array indices being evaluated upon. :param internal: Whether internal-only fields are allowed to be resolved. :param validate: Whether to validate the structure against the schema. Typically called only once per evaluate. 
:return: A value (string, int, float, bool, array, or dict.) """ if validate: - try: - jsonschema.validate(data_structure, schema) - except jsonschema.ValidationError: - raise ValueError("Invalid data structure: \n{}\nFor Schema: \n{}".format(data_structure, schema)) + _validate_data_structure_against_schema(data_structure, schema) if isinstance(ast, Literal): return ast.value - if ast.fn == FUNCTION_HELPER_WC: - raise NotImplementedError("Cannot use wildcard helper here") + _validate_not_wc(ast) check_operation_permissions( ast, schema, - search_getter=lambda rl, s: _resolve_with_search(rl, data_structure, s, internal)[1], + search_getter=lambda rl, s: _resolve_with_properties(rl, data_structure, s, index_combination, internal)[1], internal=internal) - return QUERY_CHECK_SWITCH[ast.fn](ast.args, data_structure, schema, internal) + return QUERY_CHECK_SWITCH[ast.fn](ast.args, data_structure, schema, index_combination, internal) + + +def _collect_array_lengths(ast: AST, data_structure: QueryableStructure, schema: dict) -> Dict[str, Tuple[int, dict]]: + if isinstance(ast, Literal): + return {} + + _validate_not_wc(ast) + + if ast.fn == FUNCTION_RESOLVE: + r = _resolve_array_lengths(ast.args, data_structure, schema) + return {r[0]: r[1:]} if r is not None else {} + + array_lengths = {} + for e in ast.args: + al = _collect_array_lengths(e, data_structure, schema) + for k, v in al.items(): + if k not in array_lengths: + array_lengths[k] = v + else: + array_lengths[k] = (array_lengths[k][0], {**array_lengths[k][1], **v[1]}) + + return array_lengths + + +def _create_all_index_combinations(array_data: Dict[str, Tuple[int, Dict]], parent_template): + combinations = [] + + if len(array_data) == 0: + # Add in the finished list of indexes as the base case + combinations.append(parent_template) + + # Otherwise, loop through and recurse + for c_path, c_resolve in array_data.items(): + for i in range(c_resolve[0]): + item_template = {**parent_template, c_path: i} + combinations.extend(_create_all_index_combinations(c_resolve[1], item_template)) + + return tuple(combinations) # TODO: More rigorous / defined rules @@ -81,11 +113,27 @@ def check_ast_against_data_structure( :return: A boolean representing whether or not the query matches the data object. """ + _validate_data_structure_against_schema(data_structure, schema) + + # Collect all array resolves and their lengths in order to properly cross-product arrays + array_lengths = _collect_array_lengths(ast, data_structure, schema) + + # Create all combinations of indexes into arrays + index_combinations = _create_all_index_combinations(array_lengths, {}) + # TODO: What to do here? Should be standardized, esp. w/r/t False returns - return any(isinstance(e, bool) and e for e in generator_flatten(evaluate(ast, data_structure, schema, internal))) + # Loop through all combinations of array indices to freeze "[item]"s at particular indices across the whole query. + for ic in index_combinations: + # Don't need to re-validate data structure + e = evaluate(ast, data_structure, schema, ic, internal, validate=False) + if isinstance(e, bool) and e: + return True + + return False -def _binary_op(op: BBOperator) -> Callable[[List[AST], QueryableStructure, dict, bool], bool]: +def _binary_op(op: BBOperator)\ + -> Callable[[List[AST], QueryableStructure, dict, bool, Optional[IndexCombination]], bool]: """ Returns a lambda which will evaluate a boolean-returning binary operator on a pair of arguments against a data structure/object of some type and return a Boolean result. 
@@ -93,39 +141,64 @@ def _binary_op(op: BBOperator) -> Callable[[List[AST], QueryableStructure, dict, :return: Operator lambda for use in evaluating expressions. """ - def uncurried_binary_op(args: List[AST], ds: QueryableStructure, schema: dict, internal: bool) -> bool: + def uncurried_binary_op(args: List[AST], ds: QueryableStructure, schema: dict, ic: Optional[IndexCombination], + internal: bool) -> bool: # TODO: Standardize type safety / behaviour!!! - try: - # Either LHS or RHS could be a tuple of [item] - return any( - op(li, ri) - for li in generator_flatten(evaluate(args[0], ds, schema, internal, validate=False)) # LHS, Outer loop - for ri in generator_flatten(evaluate(args[1], ds, schema, internal, validate=False)) # RHS, Inner loop - ) + lhs = evaluate(args[0], ds, schema, ic, internal, validate=False) + rhs = evaluate(args[1], ds, schema, ic, internal, validate=False) + + try: + return op(lhs, rhs) except TypeError: - raise TypeError("Type-invalid use of binary operator {}".format(op)) + raise TypeError(f"Type-invalid use of binary operator {op} ({lhs}, {rhs})") - return lambda args, ds, schema, internal: uncurried_binary_op(args, ds, schema, internal) + return lambda args, ds, schema, ic, internal: uncurried_binary_op(args, ds, schema, ic, internal) -def preserved_tuple_apply( - v: QueryableStructure, - fn: Callable[[BaseQueryableStructure], QueryableStructure] -) -> QueryableStructure: - # TODO: Should generator_flatten be used here? - return fn(v) if not isinstance(v, tuple) else tuple(fn(d) for d in generator_flatten(v)) +def _resolve_checks(resolve, schema): + if schema["type"] not in ("object", "array"): + raise TypeError("Cannot get property of literal") + elif schema["type"] == "object" and resolve[0].value not in schema["properties"]: + raise ValueError("Property {} not found in object, {}".format(resolve[0].value, [x.value for x in resolve])) -def curried_get(k) -> Callable: - return lambda v: v[k] + elif schema["type"] == "array" and resolve[0].value != "[item]": + raise TypeError("Cannot get property of array") -def _resolve_with_search( +def _resolve_array_lengths( resolve: List[Literal], resolving_ds: QueryableStructure, schema: dict, - internal: bool = False + path="_root", +) -> Tuple[str, int, ResolveDict]: + if len(resolve) == 0: + # Resolve the root if it's an empty list + return (path, len(resolving_ds), {}) if schema["type"] == "array" else None + + _resolve_checks(resolve, schema) + + if resolve[0].value == "[item]": + resolves = {} + for i in range(len(resolving_ds)): + r = _resolve_array_lengths(resolve[1:], resolving_ds[i], schema["items"], f"{path}.{resolve[0].value}") + if r is not None and r[0] not in resolves: + resolves[r[0]] = r[1:] + + return path, len(resolving_ds), resolves + + return _resolve_array_lengths(resolve[1:], resolving_ds[resolve[0].value], schema["properties"][resolve[0].value], + f"{path}.{resolve[0].value}") + + +def _resolve_with_properties( + resolve: List[Literal], + resolving_ds: QueryableStructure, + schema: dict, + index_combination: Optional[IndexCombination], + internal: bool = False, + path="_root", ) -> Tuple[QueryableStructure, dict]: """ Resolves / evaluates a path (either object or array) into a value and its search properties. Assumes the @@ -133,41 +206,47 @@ def _resolve_with_search( :param resolve: The current path to resolve, not including the current data structure. :param resolving_ds: The data structure being resolved upon. :param schema: The JSON schema representing the resolving data structure. 
+ :param index_combination: The combination of array indices being evaluated upon :param internal: Whether internal-only fields are allowed to be resolved. + :param path: A string representation of the path to the currently-resolved data structure. :return: The resolved value after exploring the resolve path, and the search operations that can be performed on it. """ if len(resolve) == 0: # Resolve the root if it's an empty list - return resolving_ds, schema.get("search", {}) - - if schema["type"] not in ("object", "array"): - raise TypeError("Cannot get property of literal") + return resolving_ds, schema.get("search", {}), - elif schema["type"] == "object" and resolve[0].value not in schema["properties"]: - raise ValueError("Property {} not found in object, {}".format(resolve[0].value, [x.value for x in resolve])) + _resolve_checks(resolve, schema) - elif schema["type"] == "array" and resolve[0].value != "[item]": - raise TypeError("Cannot get property of array") + if resolve[0].value == "[item]" and (index_combination is None or path not in index_combination): + # TODO: Specific exception class + raise Exception(f"Index combination not provided for path {path}") - return _resolve_with_search( + return _resolve_with_properties( resolve[1:], - preserved_tuple_apply(resolving_ds, curried_get(resolve[0].value) if schema["type"] == "object" else tuple), + resolving_ds[resolve[0].value] if schema["type"] == "object" else resolving_ds[index_combination[path]], schema["properties"][resolve[0].value] if schema["type"] == "object" else schema["items"], - internal) + index_combination, + internal, + path=f"{path}.{resolve[0]}") -def _resolve(resolve: List[Literal], resolving_ds: QueryableStructure, schema: dict, internal: bool): +def _resolve(resolve: List[Literal], resolving_ds: QueryableStructure, schema: dict, + index_combination: Optional[Dict[str, int]], internal: bool = False): """ - Does the same thing as _resolve_with_ops, but discards the search operations. + Does the same thing as _resolve_with_properties, but discards the search properties. """ - return _resolve_with_search(resolve, resolving_ds, schema, internal)[0] + return _resolve_with_properties(resolve, resolving_ds, schema, index_combination, internal)[0] -QUERY_CHECK_SWITCH: Dict[FunctionName, Callable[[List[AST], QueryableStructure, dict, bool], QueryableStructure]] = { +QUERY_CHECK_SWITCH: Dict[ + FunctionName, + Callable[[List[AST], QueryableStructure, dict, bool, Optional[IndexCombination]], QueryableStructure] +] = { FUNCTION_AND: _binary_op(and_), FUNCTION_OR: _binary_op(or_), - FUNCTION_NOT: lambda args, ds, schema, internal: not_(evaluate(args[0], ds, schema, internal, validate=False)), + FUNCTION_NOT: lambda args, ds, schema, internal, ic: not_(evaluate(args[0], ds, schema, internal, ic, + validate=False)), FUNCTION_LT: _binary_op(lt), FUNCTION_LE: _binary_op(le), diff --git a/tests/test_search.py b/tests/test_search.py index a4ba3f6..d592582 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -353,6 +353,23 @@ TEST_QUERY_10 = ["#eq", ["#resolve", "biosamples", "[item]", "test_postgres_array", "[item]", "test"], "test_value"] TEST_QUERY_11 = ["#eq", ["#resolve", "biosamples", "[item]", "test_json_array", "[item]", "test"], "test_value"] +# Test array item access - [item] at a certain path point should mean the same object across the whole query. 
+TEST_QUERY_12 = [ + "#and", + ["#co", ["#resolve", "biosamples", "[item]", "procedure", "code", "id"], "TEST"], + ["#eq", ["#resolve", "biosamples", "[item]", "tumor_grade", "[item]", "id"], "TG4"] +] # False with TEST_DATA_1 - different [item]s! +TEST_QUERY_13 = [ + "#and", + ["#co", ["#resolve", "biosamples", "[item]", "procedure", "code", "id"], "TEST"], + ["#eq", ["#resolve", "biosamples", "[item]", "tumor_grade", "[item]", "id"], "TG2"] +] # True with TEST_DATA_1 - same [item]s! +TEST_QUERY_14 = [ + "#or", + ["#co", ["#resolve", "biosamples", "[item]", "procedure", "code", "id"], "DUMMY"], + ["#eq", ["#resolve", "biosamples", "[item]", "tumor_grade", "[item]", "id"], "TG2"] +] # True with TEST_DATA_1 - different [item]s but one of them is correct in both + TEST_EXPR_1 = TEST_QUERY_6 TEST_EXPR_2 = True # TODO: What to do in this case when it's a query? TEST_EXPR_3 = ["#resolve", "biosamples", "[item]", "procedure", "code", "id"] @@ -431,14 +448,18 @@ # Expression, Internal, Result DS_VALID_EXPRESSIONS = ( - (TEST_EXPR_1, False, TEST_EXPR_1), - (TEST_EXPR_2, False, TEST_EXPR_2), - (TEST_EXPR_3, False, ("TEST", "DUMMY")), - (TEST_EXPR_4, False, "XO"), - (TEST_EXPR_5, False, TEST_DATA_1), - (TEST_EXPR_6, False, False), - (TEST_EXPR_7, False, ("TG1", "TG2", "TG3", "TG4")), - (TEST_EXPR_8, False, TEST_DATA_1["biosamples"]), + (TEST_EXPR_1, False, TEST_EXPR_1, None), + (TEST_EXPR_2, False, TEST_EXPR_2, None), + (TEST_EXPR_3, False, "TEST", {"_root.biosamples": 0}), + (TEST_EXPR_3, False, "DUMMY", {"_root.biosamples": 1}), + (TEST_EXPR_4, False, "XO", None), + (TEST_EXPR_5, False, TEST_DATA_1, None), + (TEST_EXPR_6, False, False, None), + (TEST_EXPR_7, False, "TG1", {"_root.biosamples": 0, "_root.biosamples.[item].tumor_grade": 0}), + (TEST_EXPR_7, False, "TG2", {"_root.biosamples": 0, "_root.biosamples.[item].tumor_grade": 1}), + (TEST_EXPR_7, False, "TG3", {"_root.biosamples": 1, "_root.biosamples.[item].tumor_grade": 0}), + (TEST_EXPR_7, False, "TG4", {"_root.biosamples": 1, "_root.biosamples.[item].tumor_grade": 1}), + (TEST_EXPR_8, False, TEST_DATA_1["biosamples"], None), ) # Query, Internal, Result @@ -454,6 +475,9 @@ (TEST_QUERY_9, False, True), (TEST_QUERY_10, False, True), (TEST_QUERY_11, False, True), + (TEST_QUERY_12, False, False), + (TEST_QUERY_13, False, True), + (TEST_QUERY_14, False, True), ) # Query, Internal, Exception @@ -472,8 +496,14 @@ ) DS_INVALID_EXPRESSIONS = ( - *COMMON_INVALID_EXPRESSIONS, - (["#_wc", "v1"], False, NotImplementedError) + *((*i, None) for i in COMMON_INVALID_EXPRESSIONS), + # Missing index combinations + (TEST_EXPR_7, False, Exception, {"_root.biosamples": 0}), + (TEST_EXPR_7, False, Exception, {}), + (TEST_EXPR_7, False, Exception, None), + (["#_wc", "v1"], False, NotImplementedError, None), + (["#co", ["#resolve", "biosamples", "[item]", "procedure", "code", "id"], + ["#_wc", "v1"]], False, NotImplementedError, {"_root.biosamples": 0}) ) @@ -490,6 +520,7 @@ (TEST_QUERY_9, False, ("NCBITaxon:9606",)), (TEST_QUERY_10, False, ("test_value",)), (TEST_QUERY_11, False, ("test_value",)), + (TEST_QUERY_12, False, ("%TEST%", "TG4")), ) PG_INVALID_EXPRESSIONS = ( @@ -588,10 +619,13 @@ def test_postgres(): postgres.search_query_to_psycopg2_sql(["#resolve", "[item]"], TEST_INVALID_SCHEMA_2) for e, i, p in PG_VALID_QUERIES: - _, params = postgres.search_query_to_psycopg2_sql(e, TEST_SCHEMA, i) + q, params = postgres.search_query_to_psycopg2_sql(e, TEST_SCHEMA, i) + from psycopg2 import connect + with connect("dbname=metadata user=admin password=admin 
host=localhost") as conn: + print(q.as_string(conn)) assert params == p - for e, i, _v in DS_VALID_EXPRESSIONS: + for e, i, _v, _ic in DS_VALID_EXPRESSIONS: postgres.search_query_to_psycopg2_sql(e, TEST_SCHEMA, i) for e, i, ex in PG_INVALID_EXPRESSIONS: @@ -600,25 +634,27 @@ def test_postgres(): def test_data_structure_search(): - for e, i, v in DS_VALID_EXPRESSIONS: - assert data_structure.evaluate(queries.convert_query_to_ast(e), TEST_DATA_1, TEST_SCHEMA) == v + for e, i, v, ic in DS_VALID_EXPRESSIONS: + print(e, i, v, ic) + assert data_structure.evaluate(queries.convert_query_to_ast(e), TEST_DATA_1, TEST_SCHEMA, ic) == v for q, i, v in DS_VALID_QUERIES: + print("DS_VALID_QUERIES", q, v) assert data_structure.check_ast_against_data_structure(queries.convert_query_to_ast(q), TEST_DATA_1, TEST_SCHEMA, i) == v - for e, i, ex in DS_INVALID_EXPRESSIONS: + for e, i, ex, ic in DS_INVALID_EXPRESSIONS: with raises(ex): - data_structure.evaluate(queries.convert_query_to_ast(e), TEST_DATA_1, TEST_SCHEMA, i) + data_structure.evaluate(queries.convert_query_to_ast(e), TEST_DATA_1, TEST_SCHEMA, ic, i) # Invalid data with raises(ValueError): - data_structure.evaluate(queries.convert_query_to_ast(TEST_EXPR_1), INVALID_DATA, TEST_SCHEMA) + data_structure.evaluate(queries.convert_query_to_ast(TEST_EXPR_1), INVALID_DATA, TEST_SCHEMA, {}) # noinspection PyProtectedMember def test_check_operation_permissions(): - for e, i, _v in DS_VALID_EXPRESSIONS: + for e, i, _v, ic in DS_VALID_EXPRESSIONS: queries.check_operation_permissions( queries.convert_query_to_ast(e), - TEST_SCHEMA, search_getter=lambda rl, s: data_structure._resolve_with_search(rl, TEST_DATA_1, s)[1]) + TEST_SCHEMA, search_getter=lambda rl, s: data_structure._resolve_with_properties(rl, TEST_DATA_1, s, ic)[1]) From 08642b6f87e0b0eafd592833730bacbe48691eaf Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 13 Feb 2020 16:41:44 -0500 Subject: [PATCH 03/27] Revert postgres test code that shouldn't have been committed --- tests/test_search.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/test_search.py b/tests/test_search.py index d592582..7547ed4 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -619,10 +619,7 @@ def test_postgres(): postgres.search_query_to_psycopg2_sql(["#resolve", "[item]"], TEST_INVALID_SCHEMA_2) for e, i, p in PG_VALID_QUERIES: - q, params = postgres.search_query_to_psycopg2_sql(e, TEST_SCHEMA, i) - from psycopg2 import connect - with connect("dbname=metadata user=admin password=admin host=localhost") as conn: - print(q.as_string(conn)) + _, params = postgres.search_query_to_psycopg2_sql(e, TEST_SCHEMA, i) assert params == p for e, i, _v, _ic in DS_VALID_EXPRESSIONS: From 8a52e3b613f582f50fa43a6013107ebc6fa737a5 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 13 Feb 2020 16:42:06 -0500 Subject: [PATCH 04/27] Remove some debug prints --- tests/test_search.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_search.py b/tests/test_search.py index 7547ed4..da6dd2f 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -632,11 +632,9 @@ def test_postgres(): def test_data_structure_search(): for e, i, v, ic in DS_VALID_EXPRESSIONS: - print(e, i, v, ic) assert data_structure.evaluate(queries.convert_query_to_ast(e), TEST_DATA_1, TEST_SCHEMA, ic) == v for q, i, v in DS_VALID_QUERIES: - print("DS_VALID_QUERIES", q, v) assert data_structure.check_ast_against_data_structure(queries.convert_query_to_ast(q), TEST_DATA_1, TEST_SCHEMA, i) == v From 
aa404fb837dc5c5ac02ed29dff1ae08a2702baba Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 13 Feb 2020 16:45:57 -0500 Subject: [PATCH 05/27] Update werkzeug to 1.x --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 6dc0f03..4efbaee 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,4 @@ pytest-cov==2.8.1 pytest-django==3.8.0 python-dateutil==2.8.1 redis>=3.3,<4.0 -Werkzeug>=0.16.0,<1.0 +Werkzeug>=1.0,<2.0 diff --git a/setup.py b/setup.py index b67e19c..1b6c7bd 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ version="0.2.0", python_requires=">=3.6", - install_requires=["jsonschema>=3.2.0,<4", "psycopg2-binary>=2.7,<3.0", "redis>=3.3,<4.0", "Werkzeug>=0.16.0,<1.0"], + install_requires=["jsonschema>=3.2.0,<4", "psycopg2-binary>=2.7,<3.0", "redis>=3.3,<4.0", "Werkzeug>=1.0,<2.0"], extras_require={ "flask": ["Flask>=1.1,<2.0"], "django": ["Django>=2.2,<3.0", "djangorestframework>=3.10,<3.11"] From 5238884f67e4cbdd282da2bc4f381c22977bebd9 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 13 Feb 2020 17:06:02 -0500 Subject: [PATCH 06/27] Adjust typing input order --- chord_lib/search/data_structure.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chord_lib/search/data_structure.py b/chord_lib/search/data_structure.py index a180f69..2a1c89e 100644 --- a/chord_lib/search/data_structure.py +++ b/chord_lib/search/data_structure.py @@ -1,6 +1,6 @@ import jsonschema from operator import and_, or_, not_, lt, le, eq, gt, ge, contains -from typing import Callable, Dict, List, Tuple, Union, Optional +from typing import Callable, Dict, List, Optional, Tuple, Union from .queries import * From 71f177134f95ba62e85c2f40dd8e58c6b82e9ad9 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 13 Feb 2020 17:15:48 -0500 Subject: [PATCH 07/27] Add more documentation about the search module --- README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/README.md b/README.md index b985a63..6e51f10 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,24 @@ GA4GH-standardized schemas (possibly not exactly to spec.) `search` contains definitions, validators, and transformations for the query syntax for CHORD, as well as a transpiler to the `psycopg2` PostgreSQL IR. +The query syntax for CHORD takes advantage of JSON schemas augmented with +additional properties about the field's accessibility and, in the case of +Postgres, how the field maps to a table column (or JSON column sub-field.) + +`search.data_structure` contains code for evaluating a CHORD query against a +Python data structure. + +`search.operations` contains constants representing valid search operations one +can allow against particular fields from within an augmented JSON schema. + +`search.postgres` contains a "transpiler" from the CHORD query syntax to the +`psycopg2`-provided +[intermediate representation (IR)](https://www.psycopg.org/docs/sql.html) for +PostgreSQL, allowing safe queries against a Postgres database. + +`search.queries` provides definitions for the CHORD query AST and some helper +methods for creating and processing ASTs. + ### `utils` `utils` contains miscellaneous utilities commonly required by CHORD services. 
From 1723a3b86f693f2e39202d3c9b376f60b4109611 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Thu, 13 Feb 2020 17:18:24 -0500 Subject: [PATCH 08/27] Add description of auth module --- .idea/dictionaries/dlougheed.xml | 1 + README.md | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/.idea/dictionaries/dlougheed.xml b/.idea/dictionaries/dlougheed.xml index 651ae6f..7bc5a50 100644 --- a/.idea/dictionaries/dlougheed.xml +++ b/.idea/dictionaries/dlougheed.xml @@ -10,6 +10,7 @@ karyotypic lougheed microservices + nginx ngmr nots nres diff --git a/README.md b/README.md index 6e51f10..86e0447 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,12 @@ python3 -m pytest --cov=chord_lib --cov-branch ## Modules +### `auth` + +`auth` provides Python service decorators and Django / DRF backends for dealing +with the CHORD container authentication headers (derived from +`lua-resty-openidc`, set by the internal container NGINX instance.) + ### `events` `events` facilitates JSON-serialized message-passing between CHORD From 2820c7cc8fad6b0e13194b73ccc369b250d36f2f Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Fri, 14 Feb 2020 10:21:26 -0500 Subject: [PATCH 09/27] Add flask error-wrapping code for registering generic handlers --- chord_lib/responses/flask_errors.py | 35 ++++++++++++++++++++++++++++- tests/test_platform_flask.py | 22 ++++++++++++++++++ 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/chord_lib/responses/flask_errors.py b/chord_lib/responses/flask_errors.py index eb6992b..c6d431c 100644 --- a/chord_lib/responses/flask_errors.py +++ b/chord_lib/responses/flask_errors.py @@ -1,9 +1,17 @@ +import sys +import traceback + from flask import jsonify from functools import partial +from typing import Callable + from .errors import * __all__ = [ + "flask_error_wrap_with_traceback", + "flask_error_wrap", + "flask_error", "flask_bad_request_error", @@ -16,11 +24,36 @@ ] +def flask_error_wrap_with_traceback(fn: Callable, service_name="CHORD Service") -> Callable: + """ + Function to wrap flask_* error creators with something that supports the application.register_error_handler method, + while also printing a traceback. + :param fn: The flask error-generating function to wrap + :param service_name: The name of the service (for logging purposes) + :return: The wrapped function + """ + # TODO: pass exception? + def handle_error(_e): + print(f"[{service_name}] Encountered error:", file=sys.stderr) + traceback.print_exc() + return fn() + return handle_error + + +def flask_error_wrap(fn: Callable) -> Callable: + """ + Function to wrap flask_* error creators with something that supports the application.register_error_handler method. 
+ :param fn: The flask error-generating function to wrap + :return: The wrapped function + """ + return lambda _e: fn() + + def flask_error(code: int, *errors): return jsonify(http_error(code, *errors)), code -def _flask_error(code: int): +def _flask_error(code: int) -> Callable: return partial(flask_error, code) diff --git a/tests/test_platform_flask.py b/tests/test_platform_flask.py index df72767..bdb58d0 100644 --- a/tests/test_platform_flask.py +++ b/tests/test_platform_flask.py @@ -1,13 +1,23 @@ import chord_lib.auth.flask_decorators as fd +import chord_lib.responses.flask_errors as fe import pytest from flask import Flask +from werkzeug.exceptions import BadRequest, NotFound @pytest.fixture def flask_client(): application = Flask(__name__) + application.register_error_handler(Exception, fe.flask_error_wrap_with_traceback(fe.flask_internal_server_error)) + application.register_error_handler(BadRequest, fe.flask_error_wrap(fe.flask_bad_request_error)) + application.register_error_handler(NotFound, fe.flask_error_wrap(fe.flask_not_found_error)) + + @application.route("/500") + def r500(): + raise Exception("help") + @application.route("/test1") @fd.flask_permissions_any_user def test1(): @@ -31,6 +41,18 @@ def test_flask_forbidden_error(flask_client): # Turn CHORD permissions mode on to make sure we're getting real permissions checks fd.CHORD_PERMISSIONS = True + # non-existent endpoint + + r = flask_client.get("/non-existent") + assert r.status_code == 404 + assert r.get_json()["code"] == 404 + + # server error endpoint + + r = flask_client.get("/500") + assert r.status_code == 500 + assert r.get_json()["code"] == 500 + # /test1 r = flask_client.get("/test1") From 0c13d3fa55ccf0651a3dbf9dc5b1859b16a9929b Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Fri, 14 Feb 2020 10:40:50 -0500 Subject: [PATCH 10/27] Add more docs to data structure query eval --- .idea/dictionaries/dlougheed.xml | 1 + chord_lib/search/data_structure.py | 40 ++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/.idea/dictionaries/dlougheed.xml b/.idea/dictionaries/dlougheed.xml index 7bc5a50..6290b81 100644 --- a/.idea/dictionaries/dlougheed.xml +++ b/.idea/dictionaries/dlougheed.xml @@ -6,6 +6,7 @@ composable dateutil djangorestframework + ified jsonschema karyotypic lougheed diff --git a/chord_lib/search/data_structure.py b/chord_lib/search/data_structure.py index 2a1c89e..33f93ea 100644 --- a/chord_lib/search/data_structure.py +++ b/chord_lib/search/data_structure.py @@ -17,6 +17,13 @@ def _validate_data_structure_against_schema(data_structure: QueryableStructure, schema: dict): + """ + Validates a queryable data structure of some type against a JSON schema. This is an important validation step, + because (assuming the schema is correct) it allows methods to make more assumptions about the integrity of the + data structure while traversing it. + :param data_structure: The data structure to validate + :param schema: The JSON schema to validate the data structure against + """ try: jsonschema.validate(data_structure, schema) except jsonschema.ValidationError: @@ -24,6 +31,11 @@ def _validate_data_structure_against_schema(data_structure: QueryableStructure, def _validate_not_wc(e: Expression): + """ + The #_wc expression function is a helper for converting the queries into the Postgres IR. If we encounter this + function in a query being evaluated against a data structure, it's meaningless and should raise an error. 
+ :param e: The expression (function) to check + """ if e.fn == FUNCTION_HELPER_WC: raise NotImplementedError("Cannot use wildcard helper here") @@ -42,33 +54,61 @@ def evaluate(ast: AST, data_structure: QueryableStructure, schema: dict, :return: A value (string, int, float, bool, array, or dict.) """ + # The validate flag is used to avoid redundantly validating the integrity of child data structures if validate: _validate_data_structure_against_schema(data_structure, schema) + # A literal (e.g. ) evaluates to its own value (5) if isinstance(ast, Literal): return ast.value + # Prevents the Postgres internal-only #_wc function from being used in expressions being evaluated against Python + # data structures. See the documentation for _validate_not_wc. _validate_not_wc(ast) + # Check that the current permissions (internal or not) allow us to perform the current operation on any resolved + # fields. Internal queries are used for joins, etc. by services, or are performed by someone with unrestricted + # access to the data. + # TODO: This could be made more granular (some people could be given access to specific objects / tables) check_operation_permissions( ast, schema, search_getter=lambda rl, s: _resolve_with_properties(rl, data_structure, s, index_combination, internal)[1], internal=internal) + # Evaluate the non-literal expression recursively. return QUERY_CHECK_SWITCH[ast.fn](ast.args, data_structure, schema, index_combination, internal) def _collect_array_lengths(ast: AST, data_structure: QueryableStructure, schema: dict) -> Dict[str, Tuple[int, dict]]: + """ + To evaluate a query in a manner consistent with the Postgres evaluator (and facilitate richer queries), each array + item needs to be fixed in a particular evaluation of a query that involves array accesses. This helper function + collects the lengths of arrays for each different array used in the field; it does this by traversing the data + structure. These can be later used by _create_all_index_combinations to create all possible combinations of accesses + to fix them in an evaluation run. + :param ast: The AST-ified query + :param data_structure: The FULL data structure the query is being evaluated against + :param schema: The JSON schema of the full data structure + :return: A recursive dictionary with keys being array paths and values being a tuple of (length, children dict) + """ + + # Literals are not arrays (currently), so they will not have any specified lengths if isinstance(ast, Literal): return {} + # Standard validation to prevent Postgres internal-style queries from being passed in (see _validate_not_wc docs) _validate_not_wc(ast) + # Resolves are where the magic happens w/r/t array access. Capture any array accesses with their lengths and child + # array accesses and store them in the recursive dictionary. if ast.fn == FUNCTION_RESOLVE: r = _resolve_array_lengths(ast.args, data_structure, schema) return {r[0]: r[1:]} if r is not None else {} + # If the current expression is a non-resolve function, recurse into its arguments and collect any additional array + # accesses; then, store them in the recursive dictionary. 
+ array_lengths = {} for e in ast.args: al = _collect_array_lengths(e, data_structure, schema) From 2f0c20c0cd21a9654ee8750854ef2e84fe9b6e55 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Fri, 14 Feb 2020 10:50:14 -0500 Subject: [PATCH 11/27] Switch to generator for _create_all_index_combinations --- chord_lib/search/data_structure.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/chord_lib/search/data_structure.py b/chord_lib/search/data_structure.py index 33f93ea..ed931b8 100644 --- a/chord_lib/search/data_structure.py +++ b/chord_lib/search/data_structure.py @@ -122,19 +122,15 @@ def _collect_array_lengths(ast: AST, data_structure: QueryableStructure, schema: def _create_all_index_combinations(array_data: Dict[str, Tuple[int, Dict]], parent_template): - combinations = [] - if len(array_data) == 0: # Add in the finished list of indexes as the base case - combinations.append(parent_template) + yield parent_template # Otherwise, loop through and recurse for c_path, c_resolve in array_data.items(): for i in range(c_resolve[0]): item_template = {**parent_template, c_path: i} - combinations.extend(_create_all_index_combinations(c_resolve[1], item_template)) - - return tuple(combinations) + yield from _create_all_index_combinations(c_resolve[1], item_template) # TODO: More rigorous / defined rules From b48c7f81f7634bbcc76a9aa660592d842b665d99 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Fri, 14 Feb 2020 10:53:22 -0500 Subject: [PATCH 12/27] Simplify defined types in search.data_structure --- chord_lib/search/data_structure.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/chord_lib/search/data_structure.py b/chord_lib/search/data_structure.py index ed931b8..e11eb50 100644 --- a/chord_lib/search/data_structure.py +++ b/chord_lib/search/data_structure.py @@ -8,9 +8,8 @@ __all__ = ["check_ast_against_data_structure"] -BaseQueryableStructure = Union[dict, list, str, int, float, bool] -QueryableStructure = Union[BaseQueryableStructure, Tuple["QueryableStructure"]] -BBOperator = Callable[[BaseQueryableStructure, BaseQueryableStructure], bool] +QueryableStructure = Union[dict, list, str, int, float, bool] +BBOperator = Callable[[QueryableStructure, QueryableStructure], bool] ResolveDict = Dict[str, Tuple[int, "ResolveDict"]] IndexCombination = Dict[str, int] From 7d55245a861d032a024dba14f0a7316f0de0c11a Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Fri, 14 Feb 2020 10:58:36 -0500 Subject: [PATCH 13/27] Split apart query/ast testing into multiple test fns --- .idea/dictionaries/dlougheed.xml | 1 + tests/test_search.py | 14 +++++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/.idea/dictionaries/dlougheed.xml b/.idea/dictionaries/dlougheed.xml index 6290b81..58d48ee 100644 --- a/.idea/dictionaries/dlougheed.xml +++ b/.idea/dictionaries/dlougheed.xml @@ -20,6 +20,7 @@ phenopackets pmessage postgre + preprocessing pytest sapiens taxon diff --git a/tests/test_search.py b/tests/test_search.py index da6dd2f..43188b4 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -547,25 +547,33 @@ def test_build_search_response(): assert t >= 0 -def test_queries_and_ast(): +def test_literal_equality(): assert queries.Literal(5) == queries.Literal(5) assert queries.Literal("5") == queries.Literal("5") assert queries.Literal(True) == queries.Literal(True) assert queries.Literal(1.0) == queries.Literal(1.0) + +def test_valid_function_construction(): for f in TEST_FUNCTIONS: e = queries.Expression(fn=f[0], args=f[1:]) 
assert e.fn == f[0] assert str(e.args) == str(f[1:]) + +def test_invalid_function_construction(): for f in TEST_INVALID_FUNCTIONS: with raises(AssertionError): queries.Expression(fn=f[0], args=[queries.Literal(a) for a in f[1:]]) + +def test_invalid_expression_syntax(): for f in TEST_INVALID_EXPRESSION_SYNTAX: with raises(SyntaxError): queries.convert_query_to_ast(f) + +def test_invalid_literals(): for v in TEST_INVALID_LITERALS: with raises(AssertionError): queries.Literal(value=v) @@ -573,10 +581,14 @@ def test_queries_and_ast(): with raises(ValueError): queries.convert_query_to_ast(v) + +def test_query_not_preprocessing(): for b, a in TEST_REDUCE_NOTS: assert queries.convert_query_to_ast_and_preprocess(b) == \ queries.convert_query_to_ast_and_preprocess(a) + +def test_queries_and_ast(): for q, s in TEST_QUERY_STR: assert str(queries.convert_query_to_ast_and_preprocess(q)) == s From 56973e79246f70ec72e670eb95d731086d2123db Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Fri, 14 Feb 2020 15:00:50 -0500 Subject: [PATCH 14/27] Attempt to fix issues with incorrect index combination outputs --- chord_lib/search/data_structure.py | 97 +++++++++++++++++++----------- tests/test_search.py | 96 ++++++++++++++++++++--------- 2 files changed, 128 insertions(+), 65 deletions(-) diff --git a/chord_lib/search/data_structure.py b/chord_lib/search/data_structure.py index e11eb50..87971a4 100644 --- a/chord_lib/search/data_structure.py +++ b/chord_lib/search/data_structure.py @@ -1,6 +1,7 @@ import jsonschema +from itertools import chain, product from operator import and_, or_, not_, lt, le, eq, gt, ge, contains -from typing import Callable, Dict, List, Optional, Tuple, Union +from typing import Callable, Dict, List, Iterable, Optional, Tuple, Union from .queries import * @@ -11,8 +12,8 @@ QueryableStructure = Union[dict, list, str, int, float, bool] BBOperator = Callable[[QueryableStructure, QueryableStructure], bool] -ResolveDict = Dict[str, Tuple[int, "ResolveDict"]] IndexCombination = Dict[str, int] +ArrayLengthData = Tuple[str, int, Tuple["ArrayLengthData", ...]] def _validate_data_structure_against_schema(data_structure: QueryableStructure, schema: dict): @@ -79,7 +80,7 @@ def evaluate(ast: AST, data_structure: QueryableStructure, schema: dict, return QUERY_CHECK_SWITCH[ast.fn](ast.args, data_structure, schema, index_combination, internal) -def _collect_array_lengths(ast: AST, data_structure: QueryableStructure, schema: dict) -> Dict[str, Tuple[int, dict]]: +def _collect_array_lengths(ast: AST, data_structure: QueryableStructure, schema: dict) -> Iterable[ArrayLengthData]: """ To evaluate a query in a manner consistent with the Postgres evaluator (and facilitate richer queries), each array item needs to be fixed in a particular evaluation of a query that involves array accesses. This helper function @@ -94,42 +95,54 @@ def _collect_array_lengths(ast: AST, data_structure: QueryableStructure, schema: # Literals are not arrays (currently), so they will not have any specified lengths if isinstance(ast, Literal): - return {} + return # Standard validation to prevent Postgres internal-style queries from being passed in (see _validate_not_wc docs) _validate_not_wc(ast) # Resolves are where the magic happens w/r/t array access. Capture any array accesses with their lengths and child - # array accesses and store them in the recursive dictionary. + # array accesses. 
if ast.fn == FUNCTION_RESOLVE: r = _resolve_array_lengths(ast.args, data_structure, schema) - return {r[0]: r[1:]} if r is not None else {} + if r is not None: + yield r + return # If the current expression is a non-resolve function, recurse into its arguments and collect any additional array - # accesses; then, store them in the recursive dictionary. + # accesses; construct a list of possibly redundant array accesses with the arrays' lengths. + als = list(chain.from_iterable(_collect_array_lengths(e, data_structure, schema) for e in ast.args)) + yield from (a for a in als if not any(a[0] == a2[0] and len(a[2]) < len(a2[2]) for a2 in als)) - array_lengths = {} - for e in ast.args: - al = _collect_array_lengths(e, data_structure, schema) - for k, v in al.items(): - if k not in array_lengths: - array_lengths[k] = v - else: - array_lengths[k] = (array_lengths[k][0], {**array_lengths[k][1], **v[1]}) - return array_lengths +def _dict_combine(dicts): + c = {} + for d in dicts: + c.update(d) + return c -def _create_all_index_combinations(array_data: Dict[str, Tuple[int, Dict]], parent_template): - if len(array_data) == 0: - # Add in the finished list of indexes as the base case - yield parent_template +def _create_index_combinations(array_data: ArrayLengthData, parent_template: IndexCombination) \ + -> Iterable[IndexCombination]: + for i in range(array_data[1]): + item_template = {**parent_template, array_data[0]: i} - # Otherwise, loop through and recurse - for c_path, c_resolve in array_data.items(): - for i in range(c_resolve[0]): - item_template = {**parent_template, c_path: i} - yield from _create_all_index_combinations(c_resolve[1], item_template) + if len(array_data[2]) == 0: # TODO: Don't like this logic... + yield item_template # What if this is overridden elsewhere? + continue + + # TODO: Don't like the mono-tuple-ing stuff + yield from _create_all_index_combinations((array_data[2][i],), item_template) + + +def _create_all_index_combinations(arrays_data: Iterable[ArrayLengthData], parent_template: IndexCombination) \ + -> Iterable[IndexCombination]: + # Loop through and recurse + combination_sets = (_create_index_combinations(array_data, parent_template) for array_data in arrays_data) + + # Combine index mappings from different combination sets into a final list of array index combinations + # TODO: Do we need the combination superset replacement logic still? + # combinations = [c for c in combinations if not any(c.items() < c2.items() for c2 in combinations)] + yield from map(_dict_combine, product(*combination_sets)) # TODO: More rigorous / defined rules @@ -148,6 +161,7 @@ def check_ast_against_data_structure( :return: A boolean representing whether or not the query matches the data object. """ + # Validate data structure against JSON schema here to avoid having to repetitively do it with evaluate() _validate_data_structure_against_schema(data_structure, schema) # Collect all array resolves and their lengths in order to properly cross-product arrays @@ -191,7 +205,13 @@ def uncurried_binary_op(args: List[AST], ds: QueryableStructure, schema: dict, i return lambda args, ds, schema, ic, internal: uncurried_binary_op(args, ds, schema, ic, internal) -def _resolve_checks(resolve, schema): +def _resolve_checks(resolve: List[Literal], schema: dict): + """ + Performs standard checks while going through any type of "resolve"-based function (where a #resolve call is being + processed) to prevent access errors. 
+ :param resolve: The list of resolve terms currently being processed + :param schema: The JSON schema of the data sub-structure currently being processed + """ if schema["type"] not in ("object", "array"): raise TypeError("Cannot get property of literal") @@ -202,27 +222,34 @@ def _resolve_checks(resolve, schema): raise TypeError("Cannot get property of array") +def _get_child_resolve_array_lengths(new_resolve: List[Literal], resolving_ds: List, item_schema: dict, new_path: str) \ + -> Iterable[ArrayLengthData]: + for i in range(len(resolving_ds)): + r = _resolve_array_lengths(new_resolve, resolving_ds[i], item_schema, new_path) + # Unwrap the optional + if r is not None: + yield r + + def _resolve_array_lengths( resolve: List[Literal], resolving_ds: QueryableStructure, schema: dict, path="_root", -) -> Tuple[str, int, ResolveDict]: +) -> Optional[ArrayLengthData]: if len(resolve) == 0: # Resolve the root if it's an empty list - return (path, len(resolving_ds), {}) if schema["type"] == "array" else None + return (path, len(resolving_ds), ()) if schema["type"] == "array" else None _resolve_checks(resolve, schema) if resolve[0].value == "[item]": - resolves = {} - for i in range(len(resolving_ds)): - r = _resolve_array_lengths(resolve[1:], resolving_ds[i], schema["items"], f"{path}.{resolve[0].value}") - if r is not None and r[0] not in resolves: - resolves[r[0]] = r[1:] - - return path, len(resolving_ds), resolves + return (path, + len(resolving_ds), + tuple(_get_child_resolve_array_lengths(resolve[1:], resolving_ds, schema["items"], + f"{path}.{resolve[0].value}"))) + # Otherwise, it's an object, so keep traversing without doing anything return _resolve_array_lengths(resolve[1:], resolving_ds[resolve[0].value], schema["properties"][resolve[0].value], f"{path}.{resolve[0].value}") diff --git a/tests/test_search.py b/tests/test_search.py index 43188b4..99df0b8 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -226,7 +226,27 @@ } } } - } + }, + "test_op_1": { + "type": "array", + "items": { + "type": "number", + "search": { + "operations": [operations.SEARCH_OP_LT, operations.SEARCH_OP_GT], + "queryable": "all" + }, + }, + }, + "test_op_2": { + "type": "array", + "items": { + "type": "number", + "search": { + "operations": [operations.SEARCH_OP_LT, operations.SEARCH_OP_GT], + "queryable": "all" + }, + }, + }, # TODO: Metadata (one-to-one) example }, "search": { @@ -369,6 +389,7 @@ ["#co", ["#resolve", "biosamples", "[item]", "procedure", "code", "id"], "DUMMY"], ["#eq", ["#resolve", "biosamples", "[item]", "tumor_grade", "[item]", "id"], "TG2"] ] # True with TEST_DATA_1 - different [item]s but one of them is correct in both +TEST_QUERY_15 = ["#gt", ["#resolve", "test_op_1", "[item]"], ["#resolve", "test_op_2", "[item]"]] TEST_EXPR_1 = TEST_QUERY_6 TEST_EXPR_2 = True # TODO: What to do in this case when it's a query? 
@@ -427,6 +448,8 @@ "label": "Homo sapiens" } }, + "test_op_1": [5, 6, 7], + "test_op_2": [9, 10, 11], "biosamples": [ { "procedure": {"code": {"id": "TEST", "label": "TEST LABEL"}}, @@ -436,7 +459,11 @@ }, { "procedure": {"code": {"id": "DUMMY", "label": "DUMMY LABEL"}}, - "tumor_grade": [{"id": "TG3", "label": "TG3 LABEL"}, {"id": "TG4", "label": "TG4 LABEL"}], + "tumor_grade": [ + {"id": "TG3", "label": "TG3 LABEL"}, + {"id": "TG4", "label": "TG4 LABEL"}, + {"id": "TG5", "label": "TG5 LABEL"}, + ], "test_postgres_array": [{"test": "test_value"}], "test_json_array": [{"test": "test_value"}], } @@ -446,7 +473,7 @@ INVALID_DATA = [{True, False}] -# Expression, Internal, Result +# Expression, Internal, Result, Index Combination DS_VALID_EXPRESSIONS = ( (TEST_EXPR_1, False, TEST_EXPR_1, None), (TEST_EXPR_2, False, TEST_EXPR_2, None), @@ -459,45 +486,48 @@ (TEST_EXPR_7, False, "TG2", {"_root.biosamples": 0, "_root.biosamples.[item].tumor_grade": 1}), (TEST_EXPR_7, False, "TG3", {"_root.biosamples": 1, "_root.biosamples.[item].tumor_grade": 0}), (TEST_EXPR_7, False, "TG4", {"_root.biosamples": 1, "_root.biosamples.[item].tumor_grade": 1}), + (TEST_EXPR_7, False, "TG5", {"_root.biosamples": 1, "_root.biosamples.[item].tumor_grade": 2}), (TEST_EXPR_8, False, TEST_DATA_1["biosamples"], None), ) -# Query, Internal, Result +# Query, Internal, Result, Number of Index Combinations (against TEST_DATA_1) DS_VALID_QUERIES = ( - (TEST_QUERY_1, False, True), - (TEST_QUERY_2, False, True), - (TEST_QUERY_3, False, True), - (TEST_QUERY_4, False, True), - (TEST_QUERY_5, False, False), - (TEST_QUERY_6, False, False), - (TEST_QUERY_7, True, True), - (TEST_QUERY_8, False, True), - (TEST_QUERY_9, False, True), - (TEST_QUERY_10, False, True), - (TEST_QUERY_11, False, True), - (TEST_QUERY_12, False, False), - (TEST_QUERY_13, False, True), - (TEST_QUERY_14, False, True), + (TEST_QUERY_1, False, True, 1), # No index accesses + (TEST_QUERY_2, False, True, 2), # Accessing 2 biosamples + (TEST_QUERY_3, False, True, 2), # " + (TEST_QUERY_4, False, True, 2), # " + (TEST_QUERY_5, False, False, 2), # " + (TEST_QUERY_6, False, False, 1), # No index accesses + (TEST_QUERY_7, True, True, 1), # " + (TEST_QUERY_8, False, True, 1), # " + (TEST_QUERY_9, False, True, 1), # " + (TEST_QUERY_10, False, True, 2), # Accessing 2 biosamples, each with 1 test_postgres_array item + (TEST_QUERY_11, False, True, 2), # Accessing 2 biosamples, each with 1 test_json_array item + (TEST_QUERY_12, False, False, 5), # Accessing 2 biosamples, one with 2 tumor grades, the other with 3 + (TEST_QUERY_13, False, True, 5), # " + (TEST_QUERY_14, False, True, 5), # " + (TEST_QUERY_15, False, False, 9), # Accessing 3 elements in test_op_n array ) # Query, Internal, Exception COMMON_INVALID_EXPRESSIONS = ( - (INVALID_EXPR_1, False, SyntaxError), - (INVALID_EXPR_2, False, SyntaxError), - (INVALID_EXPR_3, False, TypeError), - (INVALID_EXPR_4, False, ValueError), - (INVALID_EXPR_5, False, TypeError), - (INVALID_EXPR_6, False, TypeError), - (INVALID_EXPR_7, False, SyntaxError), - (INVALID_EXPR_8, False, SyntaxError), - (INVALID_EXPR_9, False, ValueError), + (INVALID_EXPR_1, False, SyntaxError), + (INVALID_EXPR_2, False, SyntaxError), + (INVALID_EXPR_3, False, TypeError), + (INVALID_EXPR_4, False, ValueError), + (INVALID_EXPR_5, False, TypeError), + (INVALID_EXPR_6, False, TypeError), + (INVALID_EXPR_7, False, SyntaxError), + (INVALID_EXPR_8, False, SyntaxError), + (INVALID_EXPR_9, False, ValueError), (INVALID_EXPR_10, False, ValueError), - (INVALID_EXPR_11, True, 
ValueError), + (INVALID_EXPR_11, True, ValueError), ) +# Expression, Internal, Exception Raised, Index Combination DS_INVALID_EXPRESSIONS = ( *((*i, None) for i in COMMON_INVALID_EXPRESSIONS), - # Missing index combinations + # Missing index combinations: (TEST_EXPR_7, False, Exception, {"_root.biosamples": 0}), (TEST_EXPR_7, False, Exception, {}), (TEST_EXPR_7, False, Exception, None), @@ -642,14 +672,20 @@ def test_postgres(): postgres.search_query_to_psycopg2_sql(e, TEST_SCHEMA, i) +# noinspection PyProtectedMember def test_data_structure_search(): for e, i, v, ic in DS_VALID_EXPRESSIONS: assert data_structure.evaluate(queries.convert_query_to_ast(e), TEST_DATA_1, TEST_SCHEMA, ic) == v - for q, i, v in DS_VALID_QUERIES: + for q, i, v, _ni in DS_VALID_QUERIES: assert data_structure.check_ast_against_data_structure(queries.convert_query_to_ast(q), TEST_DATA_1, TEST_SCHEMA, i) == v + for q, i, _v, ni in DS_VALID_QUERIES: + als = data_structure._collect_array_lengths(queries.convert_query_to_ast(q), TEST_DATA_1, TEST_SCHEMA) + ics = tuple(data_structure._create_all_index_combinations(als, {})) + assert len(ics) == ni + for e, i, ex, ic in DS_INVALID_EXPRESSIONS: with raises(ex): data_structure.evaluate(queries.convert_query_to_ast(e), TEST_DATA_1, TEST_SCHEMA, ic, i) From 36d82b6ae3a8e9661873192653493a853523c78a Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Fri, 14 Feb 2020 15:41:01 -0500 Subject: [PATCH 15/27] Fix issues with duplicate index configurations --- chord_lib/search/data_structure.py | 8 ++++++- tests/test_search.py | 34 ++++++++++++++++++++++++++++-- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/chord_lib/search/data_structure.py b/chord_lib/search/data_structure.py index 87971a4..6a87eea 100644 --- a/chord_lib/search/data_structure.py +++ b/chord_lib/search/data_structure.py @@ -111,7 +111,13 @@ def _collect_array_lengths(ast: AST, data_structure: QueryableStructure, schema: # If the current expression is a non-resolve function, recurse into its arguments and collect any additional array # accesses; construct a list of possibly redundant array accesses with the arrays' lengths. 
als = list(chain.from_iterable(_collect_array_lengths(e, data_structure, schema) for e in ast.args)) - yield from (a for a in als if not any(a[0] == a2[0] and len(a[2]) < len(a2[2]) for a2 in als)) + yield from ( + a1 for i1, a1 in enumerate(als) + if not any( + a1[0] == a2[0] and len(a1[2]) <= len(a2[2]) and i1 < i2 # Deduplicate identical or subset items + for i2, a2 in enumerate(als) + ) + ) def _dict_combine(dicts): diff --git a/tests/test_search.py b/tests/test_search.py index 99df0b8..c074a20 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -232,20 +232,30 @@ "items": { "type": "number", "search": { - "operations": [operations.SEARCH_OP_LT, operations.SEARCH_OP_GT], + "operations": [operations.SEARCH_OP_LT, operations.SEARCH_OP_GT, operations.SEARCH_OP_EQ], "queryable": "all" }, }, + "search": { + "database": { + "type": "jsonb" + } + } }, "test_op_2": { "type": "array", "items": { "type": "number", "search": { - "operations": [operations.SEARCH_OP_LT, operations.SEARCH_OP_GT], + "operations": [operations.SEARCH_OP_LT, operations.SEARCH_OP_GT, operations.SEARCH_OP_EQ], "queryable": "all" }, }, + "search": { + "database": { + "type": "jsonb" + } + } }, # TODO: Metadata (one-to-one) example }, @@ -390,6 +400,17 @@ ["#eq", ["#resolve", "biosamples", "[item]", "tumor_grade", "[item]", "id"], "TG2"] ] # True with TEST_DATA_1 - different [item]s but one of them is correct in both TEST_QUERY_15 = ["#gt", ["#resolve", "test_op_1", "[item]"], ["#resolve", "test_op_2", "[item]"]] +TEST_QUERY_16 = [ + "#and", + ["#lt", ["#resolve", "test_op_1", "[item]"], ["#resolve", "test_op_2", "[item]"]], + ["#eq", ["#resolve", "test_op_2", "[item]"], 11], +] +TEST_QUERY_17 = [ + "#and", + ["#lt", ["#resolve", "test_op_1", "[item]"], ["#resolve", "test_op_2", "[item]"]], + ["#eq", ["#resolve", "test_op_1", "[item]"], 7], +] +TEST_QUERY_18 = ["#and", TEST_QUERY_13, TEST_QUERY_17] TEST_EXPR_1 = TEST_QUERY_6 TEST_EXPR_2 = True # TODO: What to do in this case when it's a query? 
@@ -507,6 +528,9 @@ (TEST_QUERY_13, False, True, 5), # " (TEST_QUERY_14, False, True, 5), # " (TEST_QUERY_15, False, False, 9), # Accessing 3 elements in test_op_n array + (TEST_QUERY_16, False, True, 9), # " + (TEST_QUERY_17, False, True, 9), # " + (TEST_QUERY_18, False, True, 45), # Accessing 2 biosamples, one with 2 tumor grades, the other with 3 +PLUS+ " ) # Query, Internal, Exception @@ -551,6 +575,12 @@ (TEST_QUERY_10, False, ("test_value",)), (TEST_QUERY_11, False, ("test_value",)), (TEST_QUERY_12, False, ("%TEST%", "TG4")), + (TEST_QUERY_13, False, ("%TEST%", "TG2")), + (TEST_QUERY_14, False, ("%DUMMY%", "TG2")), + (TEST_QUERY_15, False, ()), + (TEST_QUERY_16, False, (11,)), + (TEST_QUERY_17, False, (7,)), + (TEST_QUERY_18, False, ("%TEST%", "TG2", 7)), ) PG_INVALID_EXPRESSIONS = ( From d4a0f90edd3f65a115c30e410ed833877352824e Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Fri, 14 Feb 2020 15:58:17 -0500 Subject: [PATCH 16/27] Add more tests for search --- tests/test_search.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/tests/test_search.py b/tests/test_search.py index c074a20..989ceab 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -400,17 +400,11 @@ ["#eq", ["#resolve", "biosamples", "[item]", "tumor_grade", "[item]", "id"], "TG2"] ] # True with TEST_DATA_1 - different [item]s but one of them is correct in both TEST_QUERY_15 = ["#gt", ["#resolve", "test_op_1", "[item]"], ["#resolve", "test_op_2", "[item]"]] -TEST_QUERY_16 = [ - "#and", - ["#lt", ["#resolve", "test_op_1", "[item]"], ["#resolve", "test_op_2", "[item]"]], - ["#eq", ["#resolve", "test_op_2", "[item]"], 11], -] -TEST_QUERY_17 = [ - "#and", - ["#lt", ["#resolve", "test_op_1", "[item]"], ["#resolve", "test_op_2", "[item]"]], - ["#eq", ["#resolve", "test_op_1", "[item]"], 7], -] -TEST_QUERY_18 = ["#and", TEST_QUERY_13, TEST_QUERY_17] +TEST_QUERY_16 = ["#lt", ["#resolve", "test_op_1", "[item]"], ["#resolve", "test_op_2", "[item]"]] +TEST_QUERY_17 = ["#and", TEST_QUERY_16, ["#eq", ["#resolve", "test_op_2", "[item]"], 11]] +TEST_QUERY_18 = ["#and", TEST_QUERY_16, ["#eq", ["#resolve", "test_op_1", "[item]"], 7]] +TEST_QUERY_19 = ["#and", TEST_QUERY_13, TEST_QUERY_17] +TEST_QUERY_20 = ["#and", TEST_QUERY_13, TEST_QUERY_18] TEST_EXPR_1 = TEST_QUERY_6 TEST_EXPR_2 = True # TODO: What to do in this case when it's a query? 
@@ -530,7 +524,9 @@ (TEST_QUERY_15, False, False, 9), # Accessing 3 elements in test_op_n array (TEST_QUERY_16, False, True, 9), # " (TEST_QUERY_17, False, True, 9), # " - (TEST_QUERY_18, False, True, 45), # Accessing 2 biosamples, one with 2 tumor grades, the other with 3 +PLUS+ " + (TEST_QUERY_18, False, True, 9), # " + (TEST_QUERY_19, False, True, 45), # Accessing 2 biosamples, one with 2 tumor grades, the other with 3 +PLUS+ " + (TEST_QUERY_20, False, True, 45), # " ) # Query, Internal, Exception @@ -578,9 +574,10 @@ (TEST_QUERY_13, False, ("%TEST%", "TG2")), (TEST_QUERY_14, False, ("%DUMMY%", "TG2")), (TEST_QUERY_15, False, ()), - (TEST_QUERY_16, False, (11,)), - (TEST_QUERY_17, False, (7,)), - (TEST_QUERY_18, False, ("%TEST%", "TG2", 7)), + (TEST_QUERY_17, False, (11,)), + (TEST_QUERY_18, False, (7,)), + (TEST_QUERY_19, False, ("%TEST%", "TG2", 11)), + (TEST_QUERY_20, False, ("%TEST%", "TG2", 7)), ) PG_INVALID_EXPRESSIONS = ( From cdd1831e000d6a86316c50e75a1fe5cf16606e51 Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Fri, 14 Feb 2020 16:06:20 -0500 Subject: [PATCH 17/27] Split apart postgres query tests --- tests/test_search.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/test_search.py b/tests/test_search.py index 989ceab..3640fe4 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -660,7 +660,7 @@ def test_queries_and_ast(): queries.convert_query_to_ast_and_preprocess(a) -def test_postgres(): +def test_postgres_schemas(): null_schema = postgres.json_schema_to_postgres_schema("test", {"type": "integer"}) assert null_schema[0] is None and null_schema[1] is None @@ -672,9 +672,13 @@ def test_postgres(): } })[1] == f"test(test2 {p})" + +def test_postgres_collect_resolve_join_tables(): # TODO: This is sort of artificial; does this case actually arise? assert postgres.collect_resolve_join_tables((), {}, None, None) == () + +def test_postgres_invalid_schemas(): with raises(SyntaxError): postgres.search_query_to_psycopg2_sql(TEST_EXPR_4, TEST_INVALID_SCHEMA) @@ -687,13 +691,19 @@ def test_postgres(): with raises(SyntaxError): postgres.search_query_to_psycopg2_sql(["#resolve", "[item]"], TEST_INVALID_SCHEMA_2) + +def test_postgres_valid_queries(): for e, i, p in PG_VALID_QUERIES: _, params = postgres.search_query_to_psycopg2_sql(e, TEST_SCHEMA, i) assert params == p + +def test_postgres_valid_expressions(): for e, i, _v, _ic in DS_VALID_EXPRESSIONS: postgres.search_query_to_psycopg2_sql(e, TEST_SCHEMA, i) + +def test_postgres_invalid_expressions(): for e, i, ex in PG_INVALID_EXPRESSIONS: with raises(ex): postgres.search_query_to_psycopg2_sql(e, TEST_SCHEMA, i) From 322fad200e6a004bf427b18edc474faea78175ee Mon Sep 17 00:00:00 2001 From: David Lougheed Date: Fri, 14 Feb 2020 17:25:55 -0500 Subject: [PATCH 18/27] Add a comment, shrink a line --- chord_lib/search/data_structure.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/chord_lib/search/data_structure.py b/chord_lib/search/data_structure.py index 6a87eea..d8d4093 100644 --- a/chord_lib/search/data_structure.py +++ b/chord_lib/search/data_structure.py @@ -200,6 +200,9 @@ def uncurried_binary_op(args: List[AST], ds: QueryableStructure, schema: dict, i internal: bool) -> bool: # TODO: Standardize type safety / behaviour!!! + # Evaluate both sides of the binary expression. If there's a type error while trying to use a Python built-in, + # override it with a custom-message type error. 
+
        lhs = evaluate(args[0], ds, schema, ic, internal, validate=False)
        rhs = evaluate(args[1], ds, schema, ic, internal, validate=False)

@@ -249,15 +252,17 @@ def _resolve_array_lengths(
    _resolve_checks(resolve, schema)

+    new_path = f"{path}.{resolve[0].value}"
+
+    # The current data structure is an array, so return its length and recurse on its (potential) child arrays.
    if resolve[0].value == "[item]":
        return (path,
                len(resolving_ds),
-                tuple(_get_child_resolve_array_lengths(resolve[1:], resolving_ds, schema["items"],
-                                                       f"{path}.{resolve[0].value}")))
+                tuple(_get_child_resolve_array_lengths(resolve[1:], resolving_ds, schema["items"], new_path)))

    # Otherwise, it's an object, so keep traversing without doing anything
    return _resolve_array_lengths(resolve[1:], resolving_ds[resolve[0].value], schema["properties"][resolve[0].value],
-                                  f"{path}.{resolve[0].value}")
+                                  new_path)
From 3cec9ff086512e70f91870ebc263a44e19038c49 Mon Sep 17 00:00:00 2001
From: David Lougheed
Date: Mon, 17 Feb 2020 11:33:53 -0500
Subject: [PATCH 19/27] Add more data structure search tests

---
 tests/test_search.py | 86 +++++++++++++++++++++++++++-----------------
 1 file changed, 54 insertions(+), 32 deletions(-)

diff --git a/tests/test_search.py b/tests/test_search.py
index 3640fe4..c69ed46 100644
--- a/tests/test_search.py
+++ b/tests/test_search.py
@@ -3,6 +3,24 @@
from pytest import raises

+
+NUMBER_SEARCH = {
+    "operations": [
+        operations.SEARCH_OP_LT,
+        operations.SEARCH_OP_LE,
+        operations.SEARCH_OP_GT,
+        operations.SEARCH_OP_GE,
+        operations.SEARCH_OP_EQ,
+    ],
+    "queryable": "all"
+}
+
+JSONB_DB_SEARCH = {
+    "database": {
+        "type": "jsonb"
+    }
+}
+
+
TEST_SCHEMA = {
    "type": "object",
    "properties": {
@@ -209,11 +227,7 @@
                    "label": {"type": "string"}
                },
                "required": ["id", "label"],
-                "search": {
-                    "database": {
-                        "type": "jsonb"
-                    }
-                }
+                "search": JSONB_DB_SEARCH
            }
        },
        "search": {
@@ -231,31 +245,29 @@
            "type": "array",
            "items": {
                "type": "number",
-                "search": {
-                    "operations": [operations.SEARCH_OP_LT, operations.SEARCH_OP_GT, operations.SEARCH_OP_EQ],
-                    "queryable": "all"
-                },
+                "search": NUMBER_SEARCH,
            },
-            "search": {
-                "database": {
-                    "type": "jsonb"
-                }
-            }
+            "search": JSONB_DB_SEARCH
        },
        "test_op_2": {
            "type": "array",
            "items": {
                "type": "number",
-                "search": {
-                    "operations": [operations.SEARCH_OP_LT, operations.SEARCH_OP_GT, operations.SEARCH_OP_EQ],
-                    "queryable": "all"
+                "search": NUMBER_SEARCH,
            },
-            "search": {
-                "database": {
-                    "type": "jsonb"
-                }
-            }
+            "search": JSONB_DB_SEARCH
        },
+        "test_op_3": {
+            "type": "array",
+            "items": {
+                "type": "array",
+                "items": {
+                    "type": "number",
+                    "search": NUMBER_SEARCH
+                },
+                "search": JSONB_DB_SEARCH
+            },
+            "search": JSONB_DB_SEARCH
+        },
        # TODO: Metadata (one-to-one) example
    },

TEST_QUERY_18 = ["#and", TEST_QUERY_16, ["#eq", ["#resolve", "test_op_1", "[item]"], 7]]
TEST_QUERY_19 = ["#and", TEST_QUERY_13, TEST_QUERY_17]
TEST_QUERY_20 = ["#and", TEST_QUERY_13, TEST_QUERY_18]
+TEST_QUERY_21 = ["#eq", ["#resolve", "test_op_2", "[item]"], ["#resolve", "test_op_3", "[item]", "[item]"]]
+TEST_QUERY_22 = ["#eq", ["#resolve", "test_op_3", "[item]", "[item]"], ["#resolve", "test_op_3", "[item]", "[item]"]]

TEST_EXPR_1 = TEST_QUERY_6
TEST_EXPR_2 = True  # TODO: What to do in this case when it's a query?
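TEST_QUERY_21 and TEST_QUERY_22 are the first queries to resolve through two levels of "[item]", which is exactly what the ArrayLengthData tuples, (path, length, child length data per element), are shaped for. A rough standalone illustration of that shape, assuming plain nested lists (nested_list_lengths is invented for this note and is not the library's schema-driven _resolve_array_lengths):

    from typing import Tuple

    ArrayLengthData = Tuple[str, int, tuple]

    def nested_list_lengths(ds: list, path: str = "_root") -> ArrayLengthData:
        # Record this array's length plus length data for each element that is
        # itself an array; scalar elements contribute no children.
        children = tuple(
            nested_list_lengths(item, f"{path}.[item]")
            for item in ds
            if isinstance(item, list)
        )
        return path, len(ds), children

    print(nested_list_lengths([[1, 2, 3], [4, 5], [6, 7, 8, 9]]))
    # ('_root', 3, (('_root.[item]', 3, ()), ('_root.[item]', 2, ()), ('_root.[item]', 4, ())))

Because the inner rows have different lengths (3, 2 and 4 here), valid index combinations have to be generated per outer element rather than as one flat product; that is why each element carries its own child tuple.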
@@ -465,6 +479,7 @@
    },
    "test_op_1": [5, 6, 7],
    "test_op_2": [9, 10, 11],
+    "test_op_3": [[1, 2, 3], [4, 5], [6, 7, 8, 9]],
    "biosamples": [
        {
            "procedure": {"code": {"id": "TEST", "label": "TEST LABEL"}},
@@ -527,6 +542,8 @@
    (TEST_QUERY_18, False, True, 9),  # "
    (TEST_QUERY_19, False, True, 45),  # Accessing 2 biosamples, one with 2 tumor grades, the other with 3 + "
    (TEST_QUERY_20, False, True, 45),  # "
+    (TEST_QUERY_21, False, True, 27),  # Accessing 3 elements in test_op_2, plus 9 in test_op_3 (non-flattened)
+    (TEST_QUERY_22, False, True, 9),  # Accessing 9 in test_op_3 and checking them against itself
)

# Query, Internal, Exception
@@ -559,15 +576,15 @@

# Query, Internal, Parameters
PG_VALID_QUERIES = (
-    (TEST_QUERY_1, False, ("XO",)),
-    (TEST_QUERY_2, False, ("%TE%",)),
-    (TEST_QUERY_3, False, ("XO", "%TE%")),
-    (TEST_QUERY_4, False, ("XO", "%TE%", False)),
-    (TEST_QUERY_5, False, ("XO", "%TE%", False)),
-    (TEST_QUERY_6, False, ("some_non_bool_value",)),
-    (TEST_QUERY_7, True, ("1ac54805-4145-4829-93e2-f362de55f28f",)),
-    (TEST_QUERY_8, False, ("MALE",)),
-    (TEST_QUERY_9, False, ("NCBITaxon:9606",)),
+    (TEST_QUERY_1,  False, ("XO",)),
+    (TEST_QUERY_2,  False, ("%TE%",)),
+    (TEST_QUERY_3,  False, ("XO", "%TE%")),
+    (TEST_QUERY_4,  False, ("XO", "%TE%", False)),
+    (TEST_QUERY_5,  False, ("XO", "%TE%", False)),
+    (TEST_QUERY_6,  False, ("some_non_bool_value",)),
+    (TEST_QUERY_7,  True,  ("1ac54805-4145-4829-93e2-f362de55f28f",)),
+    (TEST_QUERY_8,  False, ("MALE",)),
+    (TEST_QUERY_9,  False, ("NCBITaxon:9606",)),
    (TEST_QUERY_10, False, ("test_value",)),
    (TEST_QUERY_11, False, ("test_value",)),
    (TEST_QUERY_12, False, ("%TEST%", "TG4")),
@@ -578,6 +595,8 @@
    (TEST_QUERY_19, False, ("%TEST%", "TG2", 11)),
    (TEST_QUERY_20, False, ("%TEST%", "TG2", 7)),
+    (TEST_QUERY_21, False, ()),
+    (TEST_QUERY_22, False, ()),
)

PG_INVALID_EXPRESSIONS = (

def test_postgres_valid_queries():
    for e, i, p in PG_VALID_QUERIES:
-        _, params = postgres.search_query_to_psycopg2_sql(e, TEST_SCHEMA, i)
+        q, params = postgres.search_query_to_psycopg2_sql(e, TEST_SCHEMA, i)
+        from psycopg2 import connect
+        with connect("dbname=metadata user=admin password=admin host=localhost") as conn:
+            print(q.as_string(conn))
        assert params == p
From b0be7de938e4ee7b51aa711a82b0a218a6a7557b Mon Sep 17 00:00:00 2001
From: David Lougheed
Date: Mon, 17 Feb 2020 11:48:13 -0500
Subject: [PATCH 20/27] Add another test, fix accidental commit of local code

---
 tests/test_search.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/tests/test_search.py b/tests/test_search.py
index c69ed46..4cd19d6 100644
--- a/tests/test_search.py
+++ b/tests/test_search.py
@@ -419,6 +419,11 @@
TEST_QUERY_20 = ["#and", TEST_QUERY_13, TEST_QUERY_18]
TEST_QUERY_21 = ["#eq", ["#resolve", "test_op_2", "[item]"], ["#resolve", "test_op_3", "[item]", "[item]"]]
TEST_QUERY_22 = ["#eq", ["#resolve", "test_op_3", "[item]", "[item]"], ["#resolve", "test_op_3", "[item]", "[item]"]]
+TEST_QUERY_23 = [
+    "#and",
+    ["#eq", ["#resolve", "test_op_1", "[item]"], 6],
+    ["#eq", ["#resolve", "test_op_3", "[item]", "[item]"], 8]
+]

TEST_EXPR_1 = TEST_QUERY_6
TEST_EXPR_2 = True  # TODO: What to do in this case when it's a query?
@@ -544,6 +549,7 @@
    (TEST_QUERY_20, False, True, 45),  # "
    (TEST_QUERY_21, False, True, 27),  # Accessing 3 elements in test_op_2, plus 9 in test_op_3 (non-flattened)
    (TEST_QUERY_22, False, True, 9),  # Accessing 9 in test_op_3 and checking them against itself
+    (TEST_QUERY_23, False, True, 27),  # test_op_3: 9, test_op_1: 3
)

# Query, Internal, Exception
@@ -597,6 +603,7 @@
    (TEST_QUERY_20, False, ("%TEST%", "TG2", 7)),
    (TEST_QUERY_21, False, ()),
    (TEST_QUERY_22, False, ()),
+    (TEST_QUERY_23, False, (6, 8)),
)

PG_INVALID_EXPRESSIONS = (
@@ -713,10 +720,7 @@ def test_postgres_invalid_schemas():

def test_postgres_valid_queries():
    for e, i, p in PG_VALID_QUERIES:
-        q, params = postgres.search_query_to_psycopg2_sql(e, TEST_SCHEMA, i)
-        from psycopg2 import connect
-        with connect("dbname=metadata user=admin password=admin host=localhost") as conn:
-            print(q.as_string(conn))
+        _, params = postgres.search_query_to_psycopg2_sql(e, TEST_SCHEMA, i)
        assert params == p
From ad32b7e9ce834fd1c6bb6c0937ebb061280c4582 Mon Sep 17 00:00:00 2001
From: David Lougheed
Date: Mon, 17 Feb 2020 12:08:54 -0500
Subject: [PATCH 21/27] Add doc text for dict combine in data_structure

---
 chord_lib/search/data_structure.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/chord_lib/search/data_structure.py b/chord_lib/search/data_structure.py
index d8d4093..6a6fa1d 100644
--- a/chord_lib/search/data_structure.py
+++ b/chord_lib/search/data_structure.py
@@ -120,7 +120,12 @@ def _collect_array_lengths(ast: AST, data_structure: QueryableStructure, schema:
    )


-def _dict_combine(dicts):
+def _dict_combine(dicts: Iterable[dict]):
+    """
+    Utility function to combine an iterable of dictionaries into a single dictionary via d.update(d2)
+    :param dicts: Iterable of dictionaries
+    :return: A single, combined dictionary
+    """
    c = {}
    for d in dicts:
        c.update(d)
From 6f53c18107af4f8bbf0b26634a85f8b90c05b0b1 Mon Sep 17 00:00:00 2001
From: David Lougheed
Date: Mon, 17 Feb 2020 12:48:34 -0500
Subject: [PATCH 22/27] Add more typing info

---
 chord_lib/search/data_structure.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/chord_lib/search/data_structure.py b/chord_lib/search/data_structure.py
index 6a6fa1d..dd3bde8 100644
--- a/chord_lib/search/data_structure.py
+++ b/chord_lib/search/data_structure.py
@@ -12,6 +12,8 @@
QueryableStructure = Union[dict, list, str, int, float, bool]
BBOperator = Callable[[QueryableStructure, QueryableStructure], bool]

+FunctionArgs = List[AST]
+
IndexCombination = Dict[str, int]
ArrayLengthData = Tuple[str, int, Tuple["ArrayLengthData", ...]]

@@ -193,7 +195,7 @@ def check_ast_against_data_structure(

def _binary_op(op: BBOperator)\
-        -> Callable[[List[AST], QueryableStructure, dict, bool, Optional[IndexCombination]], bool]:
+        -> Callable[[FunctionArgs, QueryableStructure, dict, bool, Optional[IndexCombination]], bool]:

-    def uncurried_binary_op(args: List[AST], ds: QueryableStructure, schema: dict, ic: Optional[IndexCombination],
+    def uncurried_binary_op(args: FunctionArgs, ds: QueryableStructure, schema: dict, ic: Optional[IndexCombination],
                            internal: bool) -> bool:
        # TODO: Standardize type safety / behaviour!!!
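The _dict_combine doc text added above is easiest to read with a concrete input: each sibling array contributes one set of partial index mappings, and the cross product of those sets is merged one mapping at a time. A small sketch (the helper body is restated from the patch; the set_a/set_b fixtures are invented for illustration):

    from itertools import product

    def _dict_combine(dicts):
        # As documented in the patch: fold the dictionaries into one accumulator.
        c = {}
        for d in dicts:
            c.update(d)
        return c

    set_a = [{"_root.test_op_1": i} for i in range(2)]
    set_b = [{"_root.test_op_2": j} for j in range(2)]

    print([_dict_combine(ds) for ds in product(set_a, set_b)])
    # [{'_root.test_op_1': 0, '_root.test_op_2': 0}, {'_root.test_op_1': 0, '_root.test_op_2': 1},
    #  {'_root.test_op_1': 1, '_root.test_op_2': 0}, {'_root.test_op_1': 1, '_root.test_op_2': 1}]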
@@ -310,7 +312,7 @@ def _resolve_with_properties(

def _resolve(resolve: List[Literal], resolving_ds: QueryableStructure, schema: dict,
-             index_combination: Optional[Dict[str, int]], internal: bool = False):
+             index_combination: Optional[IndexCombination], internal: bool = False):
    """
    Does the same thing as _resolve_with_properties, but discards the search properties.
    """
@@ -319,7 +321,7 @@

QUERY_CHECK_SWITCH: Dict[
    FunctionName,
-    Callable[[List[AST], QueryableStructure, dict, bool, Optional[IndexCombination]], QueryableStructure]
+    Callable[[FunctionArgs, QueryableStructure, dict, bool, Optional[IndexCombination]], QueryableStructure]
] = {
    FUNCTION_AND: _binary_op(and_),
    FUNCTION_OR: _binary_op(or_),
From f4f82e908b33bd4d0d517f28164b7f73b1d4e56b Mon Sep 17 00:00:00 2001
From: David Lougheed
Date: Mon, 17 Feb 2020 12:48:58 -0500
Subject: [PATCH 23/27] Add some more doc text to data_structure fns

---
 chord_lib/search/data_structure.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/chord_lib/search/data_structure.py b/chord_lib/search/data_structure.py
index dd3bde8..a4aa162 100644
--- a/chord_lib/search/data_structure.py
+++ b/chord_lib/search/data_structure.py
@@ -136,6 +136,13 @@ def _dict_combine(dicts: Iterable[dict]):

def _create_index_combinations(array_data: ArrayLengthData, parent_template: IndexCombination) \
        -> Iterable[IndexCombination]:
+    """
+    Creates combinations of array indices from a particular array (including children, NOT including siblings).
+    :param array_data: Information about an array's length and its children's lengths
+    :param parent_template: A dictionary with information about the array's parent's current fixed index configuration
+    :return: An iterable of different combinations of fixed indices for the array and its children (for later search)
+    """
+
    for i in range(array_data[1]):
        item_template = {**parent_template, array_data[0]: i}

@@ -149,10 +156,19 @@

def _create_all_index_combinations(arrays_data: Iterable[ArrayLengthData], parent_template: IndexCombination) \
        -> Iterable[IndexCombination]:
+    """
+    Creates combinations of array indexes for all siblings in an iterable of arrays' length data.
+    :param arrays_data: An iterable of arrays' length data
+    :param parent_template: A dictionary with information about the arrays' parent's current fixed index configuration
+    :return: An iterable of different combinations of fixed indices for the arrays and their children (for later search)
+    """
+
    # Loop through and recurse
    combination_sets = (_create_index_combinations(array_data, parent_template) for array_data in arrays_data)

    # Combine index mappings from different combination sets into a final list of array index combinations
+    # Takes the cross product of the combination sets, since they're parallel fixations and there may be inter-item
+    # comparisons between the two sets.
    # TODO: Do we need the combination superset replacement logic still?
    # combinations = [c for c in combinations if not any(c.items() < c2.items() for c2 in combinations)]
    yield from map(_dict_combine, product(*combination_sets))
@@ -253,6 +269,8 @@ def _resolve_array_lengths(
    resolve: List[Literal],
    resolving_ds: QueryableStructure,
    schema: dict,
    path="_root",
) -> Optional[ArrayLengthData]:
+    # TODO: yield multiple for children instead of having child tuple?
+
    if len(resolve) == 0:
        # Resolve the root if it's an empty list
        return (path, len(resolving_ds), ()) if schema["type"] == "array" else None
From 89e71aa332a33dd7fc5ec8c0d839ab794a83cbc Mon Sep 17 00:00:00 2001
From: David Lougheed
Date: Mon, 17 Feb 2020 12:49:31 -0500
Subject: [PATCH 24/27] Use more functional code in a few places in data_structure

---
 chord_lib/search/data_structure.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/chord_lib/search/data_structure.py b/chord_lib/search/data_structure.py
index a4aa162..aa9f7c2 100644
--- a/chord_lib/search/data_structure.py
+++ b/chord_lib/search/data_structure.py
@@ -1,6 +1,7 @@
import jsonschema
+from functools import partial
from itertools import chain, product
-from operator import and_, or_, not_, lt, le, eq, gt, ge, contains
+from operator import and_, or_, not_, lt, le, eq, gt, ge, contains, is_not
from typing import Callable, Dict, List, Iterable, Optional, Tuple, Union

from .queries import *

def check_ast_against_data_structure(

    # TODO: What to do here? Should be standardized, esp. w/r/t False returns
    # Loop through all combinations of array indices to freeze "[item]"s at particular indices across the whole query.
-    for ic in index_combinations:
-        # Don't need to re-validate data structure
-        e = evaluate(ast, data_structure, schema, ic, internal, validate=False)
-        if isinstance(e, bool) and e:
-            return True
-
-    return False
+    return any((isinstance(e, bool) and e for e in (
+        evaluate(ast, data_structure, schema, ic, internal, validate=False)
+        for ic in index_combinations
+    )))


def _binary_op(op: BBOperator)\

def _resolve_checks(resolve: List[Literal], schema: dict):
        raise TypeError("Cannot get property of array")


+_is_not_none = partial(is_not, None)
+
+
def _get_child_resolve_array_lengths(new_resolve: List[Literal], resolving_ds: List, item_schema: dict, new_path: str) \
        -> Iterable[ArrayLengthData]:
-    for i in range(len(resolving_ds)):
-        r = _resolve_array_lengths(new_resolve, resolving_ds[i], item_schema, new_path)
-        # Unwrap the optional
-        if r is not None:
-            yield r
+    return filter(_is_not_none, (
+        _resolve_array_lengths(new_resolve, array_item_ds, item_schema, new_path)
+        for array_item_ds in resolving_ds
+    ))
From 3258742922b42777ac838114e59a0e8ae63e7bee Mon Sep 17 00:00:00 2001
From: David Lougheed
Date: Mon, 17 Feb 2020 14:33:01 -0500
Subject: [PATCH 25/27] Add more documentation to data_structure functions

---
 chord_lib/search/data_structure.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/chord_lib/search/data_structure.py b/chord_lib/search/data_structure.py
index aa9f7c2..8d99ef1 100644
--- a/chord_lib/search/data_structure.py
+++ b/chord_lib/search/data_structure.py
@@ -257,6 +257,14 @@ def _resolve_checks(resolve: List[Literal], schema: dict):

def _get_child_resolve_array_lengths(new_resolve: List[Literal], resolving_ds: List, item_schema: dict, new_path: str) \
        -> Iterable[ArrayLengthData]:
+    """
+    Recursively resolve array lengths for all children of elements of an array using the _resolve_array_lengths
+    function.
+    :param new_resolve: The resolve path starting after the item access of the array being processed
+    :param resolving_ds: The array data structure whose elements we're resolving child array accesses of
+    :param item_schema: The JSON schema of the array's items
+    :param new_path: The string representation of the path followed so far, including the most recent item access
+    :return: An iterable of the current array's element-wise array length data
+    """
    return filter(_is_not_none, (
        _resolve_array_lengths(new_resolve, array_item_ds, item_schema, new_path)
        for array_item_ds in resolving_ds
@@ -269,6 +277,17 @@ def _resolve_array_lengths(
    schema: dict,
    path="_root",
) -> Optional[ArrayLengthData]:
+    """
+    Given a resolve path and a data structure, find lengths of any arrays in the current data structure and any
+    descendants it may have.
+    :param resolve: The current resolve path, where the first element is the next thing to resolve on the data structure
+    :param resolving_ds: The data structure we're resolving on
+    :param schema: A JSON schema modeling the current data structure
+    :param path: A string representation of the path followed so far, including the most recent access
+    :return: Either None (if no arrays were accessed) or a tuple of the current array's path, its length, and the
+             lengths of any child array accesses
+    """
+
    # TODO: yield multiple for children instead of having child tuple?

    if len(resolve) == 0:
From 4417af1719402d7b6577e65616e2325b8959bd14 Mon Sep 17 00:00:00 2001
From: David Lougheed
Date: Mon, 17 Feb 2020 14:33:12 -0500
Subject: [PATCH 26/27] Make a search case cover empty array case

---
 tests/test_search.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_search.py b/tests/test_search.py
index 4cd19d6..f7a7826 100644
--- a/tests/test_search.py
+++ b/tests/test_search.py
@@ -484,7 +484,7 @@
    },
    "test_op_1": [5, 6, 7],
    "test_op_2": [9, 10, 11],
-    "test_op_3": [[1, 2, 3], [4, 5], [6, 7, 8, 9]],
+    "test_op_3": [[1, 2, 3], [4, 5], [], [6, 7, 8, 9]],
    "biosamples": [
        {
            "procedure": {"code": {"id": "TEST", "label": "TEST LABEL"}},
From 9e5319782fad6760bac758235d9259f314e826e7 Mon Sep 17 00:00:00 2001
From: David Lougheed
Date: Mon, 17 Feb 2020 14:44:47 -0500
Subject: [PATCH 27/27] Bump version

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 1b6c7bd..b568db9 100644
--- a/setup.py
+++ b/setup.py
@@ -7,7 +7,7 @@
setuptools.setup(
    name="chord_lib",
-    version="0.2.0",
+    version="0.3.0",
    python_requires=">=3.6",
    install_requires=["jsonschema>=3.2.0,<4", "psycopg2-binary>=2.7,<3.0", "redis>=3.3,<4.0", "Werkzeug>=1.0,<2.0"],
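A closing note on patch 26: the empty inner array added to test_op_3 exercises the zero-length case, where an array yields no index fixations at all, so a query over test_op_3.[item].[item] can never match within that row but also cannot raise. A hedged sketch of the counting involved (illustrative only, not chord_lib code):

    from itertools import chain

    def combos_for_row(r: int, row: list) -> list:
        # One fixation per inner element; an empty row yields no fixations.
        return [{"outer": r, "inner": i} for i in range(len(row))]

    rows = [[1, 2, 3], [4, 5], [], [6, 7, 8, 9]]
    combos = list(chain.from_iterable(combos_for_row(r, row) for r, row in enumerate(rows)))
    print(len(combos))  # 9: the empty third row contributes nothing, and nothing errors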