From 16c79d231dc609966e4d3af7b24886c309ac819f Mon Sep 17 00:00:00 2001 From: Eugene M Date: Thu, 12 Dec 2024 15:25:21 -0500 Subject: [PATCH 01/13] MNT: rename UnionStructure to ConsolidatedStructure --- tiled/_tests/test_writing.py | 24 +++++++++---------- tiled/catalog/adapter.py | 18 +++++++------- ...4112_add_union_to_structure_family_enum.py | 4 ++-- tiled/catalog/orm.py | 2 +- tiled/client/{union.py => consolidated.py} | 14 +++++------ tiled/client/container.py | 12 +++++----- tiled/server/dependencies.py | 6 ++--- tiled/server/links.py | 6 ++--- ...ntic_union.py => pydantic_consolidated.py} | 8 +++---- tiled/server/router.py | 14 +++++------ tiled/server/schemas.py | 10 ++++---- tiled/structures/core.py | 10 ++++---- tiled/structures/data_source.py | 6 ++--- tiled/structures/union.py | 8 +++---- 14 files changed, 71 insertions(+), 71 deletions(-) rename tiled/client/{union.py => consolidated.py} (88%) rename tiled/server/{pydantic_union.py => pydantic_consolidated.py} (66%) diff --git a/tiled/_tests/test_writing.py b/tiled/_tests/test_writing.py index 117edd6f0..8e70ed864 100644 --- a/tiled/_tests/test_writing.py +++ b/tiled/_tests/test_writing.py @@ -676,7 +676,7 @@ def test_append_partition( assert_frame_equal(x.read(), df3, check_dtype=False) -def test_union_one_table(tree): +def test_consolidated_one_table(tree): with Context.from_app(build_app(tree)) as context: client = from_context(context) df = pandas.DataFrame({"A": [], "B": []}) @@ -686,17 +686,17 @@ def test_union_one_table(tree): structure=structure, name="table", ) - client.create_union([data_source], key="x") + client.create_consolidated([data_source], key="x") -def test_union_two_tables(tree): +def test_consolidated_two_tables(tree): with Context.from_app(build_app(tree)) as context: client = from_context(context) df1 = pandas.DataFrame({"A": [], "B": []}) df2 = pandas.DataFrame({"C": [], "D": [], "E": []}) structure1 = TableStructure.from_pandas(df1) structure2 = 
TableStructure.from_pandas(df2) - x = client.create_union( + x = client.create_consolidated( [ DataSource( structure_family=StructureFamily.table, @@ -717,7 +717,7 @@ def test_union_two_tables(tree): x.parts["table2"].read() -def test_union_two_tables_colliding_names(tree): +def test_consolidated_two_tables_colliding_names(tree): with Context.from_app(build_app(tree)) as context: client = from_context(context) df1 = pandas.DataFrame({"A": [], "B": []}) @@ -725,7 +725,7 @@ def test_union_two_tables_colliding_names(tree): structure1 = TableStructure.from_pandas(df1) structure2 = TableStructure.from_pandas(df2) with fail_with_status_code(422): - client.create_union( + client.create_consolidated( [ DataSource( structure_family=StructureFamily.table, @@ -742,7 +742,7 @@ def test_union_two_tables_colliding_names(tree): ) -def test_union_two_tables_colliding_keys(tree): +def test_consolidated_two_tables_colliding_keys(tree): with Context.from_app(build_app(tree)) as context: client = from_context(context) df1 = pandas.DataFrame({"A": [], "B": []}) @@ -750,7 +750,7 @@ def test_union_two_tables_colliding_keys(tree): structure1 = TableStructure.from_pandas(df1) structure2 = TableStructure.from_pandas(df2) with fail_with_status_code(422): - client.create_union( + client.create_consolidated( [ DataSource( structure_family=StructureFamily.table, @@ -767,7 +767,7 @@ def test_union_two_tables_colliding_keys(tree): ) -def test_union_two_tables_two_arrays(tree): +def test_consolidated_two_tables_two_arrays(tree): with Context.from_app(build_app(tree)) as context: client = from_context(context) df1 = pandas.DataFrame({"A": [], "B": []}) @@ -778,7 +778,7 @@ def test_union_two_tables_two_arrays(tree): structure2 = TableStructure.from_pandas(df2) structure3 = ArrayStructure.from_array(arr1) structure4 = ArrayStructure.from_array(arr2) - x = client.create_union( + x = client.create_consolidated( [ DataSource( structure_family=StructureFamily.table, @@ -820,7 +820,7 @@ def 
test_union_two_tables_two_arrays(tree): x[column].read() -def test_union_table_column_array_key_collision(tree): +def test_consolidated_table_column_array_key_collision(tree): with Context.from_app(build_app(tree)) as context: client = from_context(context) df = pandas.DataFrame({"A": [], "B": []}) @@ -828,7 +828,7 @@ def test_union_table_column_array_key_collision(tree): structure1 = TableStructure.from_pandas(df) structure2 = ArrayStructure.from_array(arr) with fail_with_status_code(422): - client.create_union( + client.create_consolidated( [ DataSource( structure_family=StructureFamily.table, diff --git a/tiled/catalog/adapter.py b/tiled/catalog/adapter.py index e88a23cdf..cb337c6ff 100644 --- a/tiled/catalog/adapter.py +++ b/tiled/catalog/adapter.py @@ -63,7 +63,7 @@ ) from ..query_registration import QueryTranslationRegistry from ..server.pydantic_container import ContainerStructure -from ..server.pydantic_union import UnionStructure, UnionStructurePart +from ..server.pydantic_consolidated import ConsolidatedStructure, ConsolidatedStructurePart from ..server.schemas import Asset, DataSource, Management, Revision, Spec from ..structures.core import StructureFamily from ..utils import ( @@ -381,12 +381,12 @@ def structure(self): if self.structure_family == StructureFamily.container: # Give no inlined contents. 
return ContainerStructure(contents=None, count=None) - if self.structure_family == StructureFamily.union: + if self.structure_family == StructureFamily.consolidated: parts = [] all_keys = [] for data_source in self.data_sources: parts.append( - UnionStructurePart( + ConsolidatedStructurePart( structure=data_source.structure, structure_family=data_source.structure_family, name=data_source.name, @@ -396,7 +396,7 @@ def structure(self): all_keys.extend(data_source.structure.columns) else: all_keys.append(data_source.name) - return UnionStructure(parts=parts, all_keys=all_keys) + return ConsolidatedStructure(parts=parts, all_keys=all_keys) if self.data_sources: assert len(self.data_sources) == 1 # more not yet implemented return self.data_sources[0].structure @@ -461,11 +461,11 @@ async def lookup_adapter( for i in range(len(segments)): catalog_adapter = await self.lookup_adapter(segments[:i]) - if (catalog_adapter.structure_family == StructureFamily.union) and len( + if (catalog_adapter.structure_family == StructureFamily.consolidated) and len( segments[i:] ) == 1: - # All the segments but the final segment, segments[-1], resolves - # resolve to a union structure. Dispatch to the union Adapter + # All the segments but the final segment, segments[-1], resolve + # to a consolidated structure. Dispatch to the consolidated Adapter # to get the inner Adapter for whatever type of structure it is. 
return await ensure_awaitable(catalog_adapter.get, segments[-1]) if catalog_adapter.data_sources: @@ -680,7 +680,7 @@ async def create_node( ] data_source.parameters = {} data_uri_path_parts = self.segments + [key] - if structure_family == StructureFamily.union: + if structure_family == StructureFamily.consolidated: data_uri_path_parts.append(data_source.name) data_uri = str(self.context.writable_storage) + "".join( f"/{quote_plus(segment)}" for segment in data_uri_path_parts @@ -1573,5 +1573,5 @@ def specs_array_to_json(specs): StructureFamily.container: CatalogContainerAdapter, StructureFamily.sparse: CatalogSparseAdapter, StructureFamily.table: CatalogTableAdapter, - StructureFamily.union: CatalogUnionAdapter, + StructureFamily.consolidated: CatalogUnionAdapter, } diff --git a/tiled/catalog/migrations/versions/0dc110294112_add_union_to_structure_family_enum.py b/tiled/catalog/migrations/versions/0dc110294112_add_union_to_structure_family_enum.py index bf83c8baa..5dc04c83f 100644 --- a/tiled/catalog/migrations/versions/0dc110294112_add_union_to_structure_family_enum.py +++ b/tiled/catalog/migrations/versions/0dc110294112_add_union_to_structure_family_enum.py @@ -1,4 +1,4 @@ -"""Add 'union' to structure_family enum. +"""Add 'consolidated' to structure_family enum. Revision ID: 0dc110294112 Revises: 7c8130c40b8f @@ -22,7 +22,7 @@ def upgrade(): with op.get_context().autocommit_block(): op.execute( sa.text( - "ALTER TYPE structurefamily ADD VALUE IF NOT EXISTS 'union' AFTER 'table'" + "ALTER TYPE structurefamily ADD VALUE IF NOT EXISTS 'consolidated' AFTER 'table'" ) ) diff --git a/tiled/catalog/orm.py b/tiled/catalog/orm.py index a176a9daf..190969eec 100644 --- a/tiled/catalog/orm.py +++ b/tiled/catalog/orm.py @@ -369,7 +369,7 @@ class DataSource(Timestamped, Base): # This relates to the mutability of the data. 
management = Column(Enum(Management), nullable=False) structure_family = Column(Enum(StructureFamily), nullable=False) - # This is used by `union` structures to address arrays. + # This is used by `consolidated` structures to address arrays. # It may have additional uses in the future. name = Column(Unicode(1023), nullable=True) diff --git a/tiled/client/union.py b/tiled/client/consolidated.py similarity index 88% rename from tiled/client/union.py rename to tiled/client/consolidated.py index 525837b89..58b8bb0ec 100644 --- a/tiled/client/union.py +++ b/tiled/client/consolidated.py @@ -5,7 +5,7 @@ from .utils import MSGPACK_MIME_TYPE, ClientError, client_for_item, handle_error -class UnionClient(BaseClient): +class ConsolidatedClient(BaseClient): def __repr__(self): return ( f"<{type(self).__name__} {{" @@ -15,7 +15,7 @@ def __repr__(self): @property def parts(self): - return UnionContents(self) + return ConsolidatedContents(self) def __getitem__(self, key): if key not in self.structure().all_keys: @@ -48,7 +48,7 @@ def __getitem__(self, key): ) -class UnionContents: +class ConsolidatedContents: def __init__(self, node): self.node = node @@ -60,10 +60,10 @@ def __repr__(self): ) def __getitem__(self, name): - for index, union_item in enumerate(self.node.structure().parts): - if union_item.name == name: - structure_family = union_item.structure_family - structure_dict = union_item.structure + for index, item in enumerate(self.node.structure().parts): + if item.name == name: + structure_family = item.structure_family + structure_dict = item.structure break else: raise KeyError(name) diff --git a/tiled/client/container.py b/tiled/client/container.py index 937059b32..708c15600 100644 --- a/tiled/client/container.py +++ b/tiled/client/container.py @@ -643,7 +643,7 @@ def new( if structure_family == StructureFamily.container: structure = {"contents": None, "count": None} - elif structure_family == StructureFamily.union: + elif structure_family == 
StructureFamily.consolidated: structure = None # To be filled in below, by server response. # We need the server to tell us data_source_ids. @@ -711,9 +711,9 @@ def create_container(self, key=None, *, metadata=None, dims=None, specs=None): specs=specs, ) - def create_union(self, data_sources, key=None, *, metadata=None, specs=None): + def create_consolidated(self, data_sources, key=None, *, metadata=None, specs=None): """ - EXPERIMENTAL: Create a new union backed by data sources. + EXPERIMENTAL: Create a new consolidated node backed by data sources. Parameters ---------- @@ -729,7 +729,7 @@ def create_union(self, data_sources, key=None, *, metadata=None, specs=None): """ return self.new( - StructureFamily.union, + StructureFamily.consolidated, data_sources, key=key, metadata=metadata, @@ -1083,7 +1083,7 @@ def _write_partition(x, partition_info, client): "table": _LazyLoad( ("..dataframe", Container.__module__), "DataFrameClient" ), - "union": _LazyLoad(("..union", Container.__module__), "UnionClient"), + "consolidated": _LazyLoad(("..consolidated", Container.__module__), "ConsolidatedClient"), "xarray_dataset": _LazyLoad( ("..xarray", Container.__module__), "DatasetClient" ), @@ -1102,7 +1102,7 @@ def _write_partition(x, partition_info, client): "table": _LazyLoad( ("..dataframe", Container.__module__), "DaskDataFrameClient" ), - "union": _LazyLoad(("..union", Container.__module__), "UnionClient"), + "consolidated": _LazyLoad(("..consolidated", Container.__module__), "ConsolidatedClient"), "xarray_dataset": _LazyLoad( ("..xarray", Container.__module__), "DaskDatasetClient" ), diff --git a/tiled/server/dependencies.py b/tiled/server/dependencies.py index 6cc699e6e..ef94a5172 100644 --- a/tiled/server/dependencies.py +++ b/tiled/server/dependencies.py @@ -132,14 +132,14 @@ async def inner( entry.structure_family in structure_families ): return entry - # Handle union structure_family - if entry.structure_family == StructureFamily.union: + # Handle consolidated 
structure_family + if entry.structure_family == StructureFamily.consolidated: if not part: raise HTTPException( status_code=HTTP_404_NOT_FOUND, detail=( "A part query parameter is required on this endpoint " - "when addressing a 'union' structure." + "when addressing a 'consolidated' structure." ), ) entry_for_part = entry.for_part(part) diff --git a/tiled/server/links.py b/tiled/server/links.py index ff186ebbc..2d1db43a6 100644 --- a/tiled/server/links.py +++ b/tiled/server/links.py @@ -37,7 +37,7 @@ def links_for_awkward(structure_family, structure, base_url, path_str, part=None def links_for_container(structure_family, structure, base_url, path_str): - # Cannot be used inside union, so there is no part parameter. + # Cannot be used inside consolidated, so there is no part parameter. links = {} links["full"] = f"{base_url}/container/full/{path_str}" links["search"] = f"{base_url}/search/{path_str}" @@ -54,7 +54,7 @@ def links_for_table(structure_family, structure, base_url, path_str, part=None): return links -def links_for_union(structure_family, structure, base_url, path_str): +def links_for_consolidated(structure_family, structure, base_url, path_str): links = {} # This contains the links for each structure. 
links["parts"] = [] @@ -77,5 +77,5 @@ def links_for_union(structure_family, structure, base_url, path_str): StructureFamily.container: links_for_container, StructureFamily.sparse: links_for_array, # sparse and array are the same StructureFamily.table: links_for_table, - StructureFamily.union: links_for_union, + StructureFamily.consolidated: links_for_consolidated, } diff --git a/tiled/server/pydantic_union.py b/tiled/server/pydantic_consolidated.py similarity index 66% rename from tiled/server/pydantic_union.py rename to tiled/server/pydantic_consolidated.py index 7d13645df..80a24c54f 100644 --- a/tiled/server/pydantic_union.py +++ b/tiled/server/pydantic_consolidated.py @@ -5,7 +5,7 @@ from ..structures.core import StructureFamily -class UnionStructurePart(pydantic.BaseModel): +class ConsolidatedStructurePart(pydantic.BaseModel): structure_family: StructureFamily structure: Any # Union of Structures, but we do not want to import them... name: str @@ -15,13 +15,13 @@ def from_json(cls, item): return cls(**item) -class UnionStructure(pydantic.BaseModel): - parts: List[UnionStructurePart] +class ConsolidatedStructure(pydantic.BaseModel): + parts: List[ConsolidatedStructurePart] all_keys: Optional[List[str]] @classmethod def from_json(cls, structure): return cls( - parts=[UnionStructurePart.from_json(item) for item in structure["parts"]], + parts=[ConsolidatedStructurePart.from_json(item) for item in structure["parts"]], all_keys=structure["all_keys"], ) diff --git a/tiled/server/router.py b/tiled/server/router.py index be4bd600c..05d0553a7 100644 --- a/tiled/server/router.py +++ b/tiled/server/router.py @@ -27,7 +27,7 @@ ) from .. 
import __version__ -from ..server.pydantic_union import UnionStructure, UnionStructurePart +from .pydantic_consolidated import ConsolidatedStructure, ConsolidatedStructurePart from ..structures.core import Spec, StructureFamily from ..utils import ensure_awaitable, patch_mimetypes, path_from_uri from ..validation_registration import ValidationError @@ -423,7 +423,7 @@ async def array_block( "Use slicing ('?slice=...') to request smaller chunks." ), ) - if entry.structure_family == StructureFamily.union: + if entry.structure_family == StructureFamily.consolidated: structure_family = entry.data_source.structure_family else: structure_family = entry.structure_family @@ -464,7 +464,7 @@ async def array_full( """ Fetch a slice of array-like data. """ - if entry.structure_family == StructureFamily.union: + if entry.structure_family == StructureFamily.consolidated: structure_family = entry.data_source.structure_family else: structure_family = entry.structure_family @@ -729,7 +729,7 @@ async def table_full( "request a smaller chunks." 
), ) - if entry.structure_family == StructureFamily.union: + if entry.structure_family == StructureFamily.consolidated: structure_family = entry.data_source.structure_family else: structure_family = entry.structure_family @@ -1160,16 +1160,16 @@ async def _create_node( body.specs, ) metadata_modified = False - if structure_family == StructureFamily.union: + if structure_family == StructureFamily.consolidated: all_keys = [] for data_source in body.data_sources: if data_source.structure_family == StructureFamily.table: all_keys.extend(data_source.structure.columns) else: all_keys.append(data_source.name) - structure = UnionStructure( + structure = ConsolidatedStructure( parts=[ - UnionStructurePart( + ConsolidatedStructurePart( data_source_id=data_source.id, structure=data_source.structure, structure_family=data_source.structure_family, diff --git a/tiled/server/schemas.py b/tiled/server/schemas.py index e0edd8ecf..71096f538 100644 --- a/tiled/server/schemas.py +++ b/tiled/server/schemas.py @@ -16,7 +16,7 @@ from .pydantic_awkward import AwkwardStructure from .pydantic_sparse import SparseStructure from .pydantic_table import TableStructure -from .pydantic_union import UnionStructure +from .pydantic_consolidated import ConsolidatedStructure if TYPE_CHECKING: import tiled.authn_database.orm @@ -149,7 +149,7 @@ class DataSource(pydantic.BaseModel): NodeStructure, SparseStructure, TableStructure, - UnionStructure, + ConsolidatedStructure, ] ] = None mimetype: Optional[str] = None @@ -186,7 +186,7 @@ class NodeAttributes(pydantic.BaseModel): NodeStructure, SparseStructure, TableStructure, - UnionStructure, + ConsolidatedStructure, ] ] = None @@ -248,7 +248,7 @@ class UnionLinks(pydantic.BaseModel): StructureFamily.container: ContainerLinks, StructureFamily.sparse: SparseLinks, StructureFamily.table: DataFrameLinks, - StructureFamily.union: UnionLinks, + StructureFamily.consolidated: UnionLinks, } @@ -487,7 +487,7 @@ class PostMetadataResponse(pydantic.BaseModel, 
Generic[ResourceLinksT]): NodeStructure, SparseStructure, TableStructure, - UnionStructure, + ConsolidatedStructure, ] metadata: Dict data_sources: List[DataSource] diff --git a/tiled/structures/core.py b/tiled/structures/core.py index 8f5e10795..08618f53f 100644 --- a/tiled/structures/core.py +++ b/tiled/structures/core.py @@ -18,7 +18,7 @@ class BaseStructureFamily(str, enum.Enum): container = "container" sparse = "sparse" table = "table" - # excludes union, which DataSources cannot have + # excludes consolidated, which DataSources cannot have class StructureFamily(str, enum.Enum): @@ -27,7 +27,7 @@ class StructureFamily(str, enum.Enum): container = "container" sparse = "sparse" table = "table" - union = "union" + consolidated = "consolidated" @dataclass(frozen=True) @@ -73,8 +73,8 @@ def dict(self) -> Dict[str, Optional[str]]: StructureFamily.sparse: lambda: importlib.import_module( "...structures.sparse", StructureFamily.__module__ ).SparseStructure, - StructureFamily.union: lambda: importlib.import_module( - "...structures.union", StructureFamily.__module__ - ).UnionStructure, + StructureFamily.consolidated: lambda: importlib.import_module( + "...structures.consolidated", StructureFamily.__module__ + ).ConsolidatedStructure, } ) diff --git a/tiled/structures/data_source.py b/tiled/structures/data_source.py index c3c65bfca..b64b897d6 100644 --- a/tiled/structures/data_source.py +++ b/tiled/structures/data_source.py @@ -54,14 +54,14 @@ def validate_container_data_sources(node_structure_family, data_sources): return data_sources -def validate_union_data_sources(node_structure_family, data_sources): +def validate_consolidated_data_sources(node_structure_family, data_sources): "Check that column names and keys of others (e.g. arrays) do not collide." 
keys = set() names = set() for data_source in data_sources: if data_source.name is None: raise ValueError( - "Data sources backing a union structure_family must " + "Data sources backing a consolidated structure_family must " "all have non-NULL names." ) if data_source.name in names: @@ -95,4 +95,4 @@ def validate_other_data_sources(node_structure_family, data_sources): validators = collections.defaultdict(lambda: validate_other_data_sources) validators[StructureFamily.container] = validate_container_data_sources -validators[StructureFamily.union] = validate_union_data_sources +validators[StructureFamily.consolidated] = validate_consolidated_data_sources diff --git a/tiled/structures/union.py b/tiled/structures/union.py index 3d4a6cc4b..47cc2e43c 100644 --- a/tiled/structures/union.py +++ b/tiled/structures/union.py @@ -5,7 +5,7 @@ @dataclasses.dataclass -class UnionStructurePart: +class ConsolidatedStructurePart: structure_family: StructureFamily structure: Any # Union of Structures, but we do not want to import them... 
name: Optional[str] @@ -16,13 +16,13 @@ def from_json(cls, item): @dataclasses.dataclass -class UnionStructure: - parts: List[UnionStructurePart] +class ConsolidatedStructure: + parts: List[ConsolidatedStructurePart] all_keys: List[str] @classmethod def from_json(cls, structure): return cls( - parts=[UnionStructurePart.from_json(item) for item in structure["parts"]], + parts=[ConsolidatedStructurePart.from_json(item) for item in structure["parts"]], all_keys=structure["all_keys"], ) From af06a43f3f877ed7ee40652f19e59d33aabf735e Mon Sep 17 00:00:00 2001 From: Eugene M Date: Thu, 12 Dec 2024 15:51:16 -0500 Subject: [PATCH 02/13] MNT: rename UnionStructure to ConsolidatedStructure --- tiled/structures/{union.py => consolidated.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tiled/structures/{union.py => consolidated.py} (100%) diff --git a/tiled/structures/union.py b/tiled/structures/consolidated.py similarity index 100% rename from tiled/structures/union.py rename to tiled/structures/consolidated.py From d8ef72c78c939338d921a4e3cfd7426a266ecbdd Mon Sep 17 00:00:00 2001 From: Eugene M Date: Thu, 12 Dec 2024 16:02:01 -0500 Subject: [PATCH 03/13] MNT: rename CatalogUnionAdapter and UnionLinks --- tiled/catalog/adapter.py | 4 ++-- tiled/server/schemas.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tiled/catalog/adapter.py b/tiled/catalog/adapter.py index cb337c6ff..62c7c1945 100644 --- a/tiled/catalog/adapter.py +++ b/tiled/catalog/adapter.py @@ -1155,7 +1155,7 @@ async def append_partition(self, *args, **kwargs): ) -class CatalogUnionAdapter(CatalogNodeAdapter): +class CatalogConsolidatedAdapter(CatalogNodeAdapter): async def get(self, key): if key not in self.structure().all_keys: return None @@ -1573,5 +1573,5 @@ def specs_array_to_json(specs): StructureFamily.container: CatalogContainerAdapter, StructureFamily.sparse: CatalogSparseAdapter, StructureFamily.table: CatalogTableAdapter, - StructureFamily.consolidated: 
CatalogUnionAdapter, + StructureFamily.consolidated: CatalogConsolidatedAdapter, } diff --git a/tiled/server/schemas.py b/tiled/server/schemas.py index 71096f538..f9773d5c6 100644 --- a/tiled/server/schemas.py +++ b/tiled/server/schemas.py @@ -235,7 +235,7 @@ class SparseLinks(pydantic.BaseModel): block: str -class UnionLinks(pydantic.BaseModel): +class ConsolidatedLinks(pydantic.BaseModel): self: str contents: List[ Union[ArrayLinks, AwkwardLinks, ContainerLinks, DataFrameLinks, SparseLinks] @@ -248,7 +248,7 @@ class UnionLinks(pydantic.BaseModel): StructureFamily.container: ContainerLinks, StructureFamily.sparse: SparseLinks, StructureFamily.table: DataFrameLinks, - StructureFamily.consolidated: UnionLinks, + StructureFamily.consolidated: ConsolidatedLinks, } @@ -480,7 +480,7 @@ class PutDataSourceRequest(pydantic.BaseModel): class PostMetadataResponse(pydantic.BaseModel, Generic[ResourceLinksT]): id: str - links: Union[ArrayLinks, DataFrameLinks, SparseLinks, UnionLinks] + links: Union[ArrayLinks, DataFrameLinks, SparseLinks, ConsolidatedLinks] structure: Union[ ArrayStructure, AwkwardStructure, From 2380800548b9a74f605688bcf1777f49155d8d61 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Thu, 12 Dec 2024 17:29:19 -0500 Subject: [PATCH 04/13] MNT: typing and lint --- tiled/adapters/arrow.py | 2 +- tiled/adapters/csv.py | 2 +- tiled/adapters/parquet.py | 3 ++- tiled/adapters/table.py | 4 ++-- tiled/catalog/adapter.py | 11 +++++++---- tiled/client/container.py | 8 ++++++-- tiled/server/pydantic_consolidated.py | 4 +++- tiled/server/router.py | 2 +- tiled/server/schemas.py | 4 ++-- tiled/structures/consolidated.py | 4 +++- tiled/structures/core.py | 11 +---------- tiled/structures/data_source.py | 10 ++++++++-- 12 files changed, 37 insertions(+), 28 deletions(-) diff --git a/tiled/adapters/arrow.py b/tiled/adapters/arrow.py index 9c2da456c..975c27fa0 100644 --- a/tiled/adapters/arrow.py +++ b/tiled/adapters/arrow.py @@ -129,7 +129,7 @@ def generate_data_sources( """ 
return [ DataSource( - structure_family=self.structure_family, + structure_family=StructureFamily.table, mimetype=mimetype, structure=dict_or_none(self.structure()), parameters={}, diff --git a/tiled/adapters/csv.py b/tiled/adapters/csv.py index 8e9fd6680..b1dd6711f 100644 --- a/tiled/adapters/csv.py +++ b/tiled/adapters/csv.py @@ -276,7 +276,7 @@ def generate_data_sources( """ return [ DataSource( - structure_family=self.dataframe_adapter.structure_family, + structure_family=StructureFamily.table, mimetype=mimetype, structure=dict_or_none(self.dataframe_adapter.structure()), parameters={}, diff --git a/tiled/adapters/parquet.py b/tiled/adapters/parquet.py index 3438dc995..b8421396d 100644 --- a/tiled/adapters/parquet.py +++ b/tiled/adapters/parquet.py @@ -9,6 +9,7 @@ from ..structures.table import TableStructure from ..type_aliases import JSON from ..utils import path_from_uri +from .array import ArrayAdapter from .dataframe import DataFrameAdapter from .protocols import AccessPolicy @@ -165,5 +166,5 @@ def structure(self) -> TableStructure: """ return self._structure - def get(self, key): + def get(self, key: str) -> ArrayAdapter | None: return self.dataframe_adapter.get(key) diff --git a/tiled/adapters/table.py b/tiled/adapters/table.py index 01a89a5d7..8f13461c6 100644 --- a/tiled/adapters/table.py +++ b/tiled/adapters/table.py @@ -169,12 +169,12 @@ def __getitem__(self, key: str) -> ArrayAdapter: # Must compute to determine shape. 
return ArrayAdapter.from_array(self.read([key])[key].values) - def get(self, key): + def get(self, key: str) -> ArrayAdapter | None: if key not in self.structure().columns: return None return ArrayAdapter.from_array(self.read([key])[key].values) - def items(self): + def items(self) -> Iterator[Tuple[str, ArrayAdapter]]: yield from ( (key, ArrayAdapter.from_array(self.read([key])[key].values)) for key in self._structure.columns diff --git a/tiled/catalog/adapter.py b/tiled/catalog/adapter.py index 62c7c1945..11b3b5700 100644 --- a/tiled/catalog/adapter.py +++ b/tiled/catalog/adapter.py @@ -62,8 +62,11 @@ ZARR_MIMETYPE, ) from ..query_registration import QueryTranslationRegistry +from ..server.pydantic_consolidated import ( + ConsolidatedStructure, + ConsolidatedStructurePart, +) from ..server.pydantic_container import ContainerStructure -from ..server.pydantic_consolidated import ConsolidatedStructure, ConsolidatedStructurePart from ..server.schemas import Asset, DataSource, Management, Revision, Spec from ..structures.core import StructureFamily from ..utils import ( @@ -461,9 +464,9 @@ async def lookup_adapter( for i in range(len(segments)): catalog_adapter = await self.lookup_adapter(segments[:i]) - if (catalog_adapter.structure_family == StructureFamily.consolidated) and len( - segments[i:] - ) == 1: + if ( + catalog_adapter.structure_family == StructureFamily.consolidated + ) and len(segments[i:]) == 1: # All the segments but the final segment, segments[-1], resolve # to a consolidated structure. Dispatch to the consolidated Adapter # to get the inner Adapter for whatever type of structure it is. 
diff --git a/tiled/client/container.py b/tiled/client/container.py index 708c15600..238967f2a 100644 --- a/tiled/client/container.py +++ b/tiled/client/container.py @@ -1083,7 +1083,9 @@ def _write_partition(x, partition_info, client): "table": _LazyLoad( ("..dataframe", Container.__module__), "DataFrameClient" ), - "consolidated": _LazyLoad(("..consolidated", Container.__module__), "ConsolidatedClient"), + "consolidated": _LazyLoad( + ("..consolidated", Container.__module__), "ConsolidatedClient" + ), "xarray_dataset": _LazyLoad( ("..xarray", Container.__module__), "DatasetClient" ), @@ -1102,7 +1104,9 @@ def _write_partition(x, partition_info, client): "table": _LazyLoad( ("..dataframe", Container.__module__), "DaskDataFrameClient" ), - "consolidated": _LazyLoad(("..consolidated", Container.__module__), "ConsolidatedClient"), + "consolidated": _LazyLoad( + ("..consolidated", Container.__module__), "ConsolidatedClient" + ), "xarray_dataset": _LazyLoad( ("..xarray", Container.__module__), "DaskDatasetClient" ), diff --git a/tiled/server/pydantic_consolidated.py b/tiled/server/pydantic_consolidated.py index 80a24c54f..00de74489 100644 --- a/tiled/server/pydantic_consolidated.py +++ b/tiled/server/pydantic_consolidated.py @@ -22,6 +22,8 @@ class ConsolidatedStructure(pydantic.BaseModel): @classmethod def from_json(cls, structure): return cls( - parts=[ConsolidatedStructurePart.from_json(item) for item in structure["parts"]], + parts=[ + ConsolidatedStructurePart.from_json(item) for item in structure["parts"] + ], all_keys=structure["all_keys"], ) diff --git a/tiled/server/router.py b/tiled/server/router.py index 05d0553a7..301b38699 100644 --- a/tiled/server/router.py +++ b/tiled/server/router.py @@ -27,7 +27,6 @@ ) from .. 
import __version__ -from .pydantic_consolidated import ConsolidatedStructure, ConsolidatedStructurePart from ..structures.core import Spec, StructureFamily from ..utils import ensure_awaitable, patch_mimetypes, path_from_uri from ..validation_registration import ValidationError @@ -62,6 +61,7 @@ ) from .file_response_with_range import FileResponseWithRange from .links import links_for_node +from .pydantic_consolidated import ConsolidatedStructure, ConsolidatedStructurePart from .settings import get_settings from .utils import filter_for_access, get_base_url, record_timing diff --git a/tiled/server/schemas.py b/tiled/server/schemas.py index f9773d5c6..58d996edb 100644 --- a/tiled/server/schemas.py +++ b/tiled/server/schemas.py @@ -14,9 +14,9 @@ from ..structures.data_source import Management, validate_data_sources from .pydantic_array import ArrayStructure from .pydantic_awkward import AwkwardStructure +from .pydantic_consolidated import ConsolidatedStructure from .pydantic_sparse import SparseStructure from .pydantic_table import TableStructure -from .pydantic_consolidated import ConsolidatedStructure if TYPE_CHECKING: import tiled.authn_database.orm @@ -237,7 +237,7 @@ class SparseLinks(pydantic.BaseModel): class ConsolidatedLinks(pydantic.BaseModel): self: str - contents: List[ + parts: List[ Union[ArrayLinks, AwkwardLinks, ContainerLinks, DataFrameLinks, SparseLinks] ] diff --git a/tiled/structures/consolidated.py b/tiled/structures/consolidated.py index 47cc2e43c..89b156719 100644 --- a/tiled/structures/consolidated.py +++ b/tiled/structures/consolidated.py @@ -23,6 +23,8 @@ class ConsolidatedStructure: @classmethod def from_json(cls, structure): return cls( - parts=[ConsolidatedStructurePart.from_json(item) for item in structure["parts"]], + parts=[ + ConsolidatedStructurePart.from_json(item) for item in structure["parts"] + ], all_keys=structure["all_keys"], ) diff --git a/tiled/structures/core.py b/tiled/structures/core.py index 08618f53f..5612b1623 100644 
--- a/tiled/structures/core.py
+++ b/tiled/structures/core.py
@@ -12,22 +12,13 @@
 from ..utils import OneShotCachedMap
 
 
-class BaseStructureFamily(str, enum.Enum):
-    array = "array"
-    awkward = "awkward"
-    container = "container"
-    sparse = "sparse"
-    table = "table"
-    # excludes consolidated, which DataSources cannot have
-
-
 class StructureFamily(str, enum.Enum):
     array = "array"
     awkward = "awkward"
     container = "container"
     sparse = "sparse"
     table = "table"
-    consolidated = "consolidated"
+    consolidated = "consolidated"  # can not be used in DataSources
 
 
 @dataclass(frozen=True)
diff --git a/tiled/structures/data_source.py b/tiled/structures/data_source.py
index b64b897d6..679fc0346 100644
--- a/tiled/structures/data_source.py
+++ b/tiled/structures/data_source.py
@@ -3,7 +3,7 @@
 import enum
 from typing import Any, List, Optional
 
-from ..structures.core import BaseStructureFamily, StructureFamily
+from ..structures.core import StructureFamily
 
 
 class Management(str, enum.Enum):
@@ -24,7 +24,7 @@ class Asset:
 
 @dataclasses.dataclass
 class DataSource:
-    structure_family: BaseStructureFamily
+    structure_family: StructureFamily
     structure: Any
     id: Optional[int] = None
     mimetype: Optional[str] = None
@@ -33,6 +33,12 @@ class DataSource:
     management: Management = Management.writable
     name: Optional[str] = None
 
+    def __post_init__(self):
+        if self.structure_family == StructureFamily.consolidated:
+            raise ValueError(
+                "DataSource can not be initialized with Consolidated StructureFamily"
+            )
+
     @classmethod
     def from_json(cls, d):
         d = d.copy()

From b76e8fdff19bfc88495465c60a0984e6886086d4 Mon Sep 17 00:00:00 2001
From: Eugene M
Date: Thu, 12 Dec 2024 17:31:33 -0500
Subject: [PATCH 05/13] MNT: typing and lint

---
 tiled/adapters/csv.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tiled/adapters/csv.py b/tiled/adapters/csv.py
index b1dd6711f..8e9fd6680 100644
--- a/tiled/adapters/csv.py
+++ b/tiled/adapters/csv.py
@@ -276,7 +276,7 @@ def generate_data_sources(
 
         """
         
return [ DataSource( - structure_family=StructureFamily.table, + structure_family=self.dataframe_adapter.structure_family, mimetype=mimetype, structure=dict_or_none(self.dataframe_adapter.structure()), parameters={}, From 1fcc9dc9fb4e57c39dbeca7abbbee945430c9464 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Fri, 13 Dec 2024 10:27:23 -0500 Subject: [PATCH 06/13] ENH: refactor creation of ConsolidatedStructure as a classmethod --- tiled/catalog/adapter.py | 16 +--------------- tiled/server/pydantic_consolidated.py | 20 ++++++++++++++++++++ tiled/server/router.py | 19 +------------------ tiled/structures/consolidated.py | 20 ++++++++++++++++++++ 4 files changed, 42 insertions(+), 33 deletions(-) diff --git a/tiled/catalog/adapter.py b/tiled/catalog/adapter.py index 11b3b5700..a979b4ccd 100644 --- a/tiled/catalog/adapter.py +++ b/tiled/catalog/adapter.py @@ -385,21 +385,7 @@ def structure(self): # Give no inlined contents. return ContainerStructure(contents=None, count=None) if self.structure_family == StructureFamily.consolidated: - parts = [] - all_keys = [] - for data_source in self.data_sources: - parts.append( - ConsolidatedStructurePart( - structure=data_source.structure, - structure_family=data_source.structure_family, - name=data_source.name, - ) - ) - if data_source.structure_family == StructureFamily.table: - all_keys.extend(data_source.structure.columns) - else: - all_keys.append(data_source.name) - return ConsolidatedStructure(parts=parts, all_keys=all_keys) + return ConsolidatedStructure.from_data_sources(self.data_sources) if self.data_sources: assert len(self.data_sources) == 1 # more not yet implemented return self.data_sources[0].structure diff --git a/tiled/server/pydantic_consolidated.py b/tiled/server/pydantic_consolidated.py index 00de74489..2ad79ecb5 100644 --- a/tiled/server/pydantic_consolidated.py +++ b/tiled/server/pydantic_consolidated.py @@ -27,3 +27,23 @@ def from_json(cls, structure): ], all_keys=structure["all_keys"], ) + + @classmethod + 
def from_data_sources(cls, data_sources): + all_keys = [] + for data_source in data_sources: + if data_source.structure_family == StructureFamily.table: + all_keys.extend(data_source.structure.columns) + else: + all_keys.append(data_source.name) + parts=[ + ConsolidatedStructurePart( + data_source_id=data_source.id, + structure=data_source.structure, + structure_family=data_source.structure_family, + name=data_source.name, + ) + for data_source in data_sources + ] + + return cls(parts=parts, all_keys=all_keys) diff --git a/tiled/server/router.py b/tiled/server/router.py index 301b38699..9e1d111cc 100644 --- a/tiled/server/router.py +++ b/tiled/server/router.py @@ -1161,24 +1161,7 @@ async def _create_node( ) metadata_modified = False if structure_family == StructureFamily.consolidated: - all_keys = [] - for data_source in body.data_sources: - if data_source.structure_family == StructureFamily.table: - all_keys.extend(data_source.structure.columns) - else: - all_keys.append(data_source.name) - structure = ConsolidatedStructure( - parts=[ - ConsolidatedStructurePart( - data_source_id=data_source.id, - structure=data_source.structure, - structure_family=data_source.structure_family, - name=data_source.name, - ) - for data_source in body.data_sources - ], - all_keys=all_keys, - ) + structure = ConsolidatedStructure.from_data_sources(body.data_sources) elif body.data_sources: assert len(body.data_sources) == 1 # more not yet implemented structure = body.data_sources[0].structure diff --git a/tiled/structures/consolidated.py b/tiled/structures/consolidated.py index 89b156719..8a955f256 100644 --- a/tiled/structures/consolidated.py +++ b/tiled/structures/consolidated.py @@ -28,3 +28,23 @@ def from_json(cls, structure): ], all_keys=structure["all_keys"], ) + + @classmethod + def from_data_sources(cls, data_sources): + all_keys = [] + for data_source in data_sources: + if data_source.structure_family == StructureFamily.table: + all_keys.extend(data_source.structure.columns) 
+ else: + all_keys.append(data_source.name) + parts=[ + ConsolidatedStructurePart( + data_source_id=data_source.id, + structure=data_source.structure, + structure_family=data_source.structure_family, + name=data_source.name, + ) + for data_source in data_sources + ] + + return cls(parts=parts, all_keys=all_keys) From bdbb965fc11add7eb03dde82fe15f17c6dc7784a Mon Sep 17 00:00:00 2001 From: Eugene M Date: Fri, 13 Dec 2024 12:10:26 -0500 Subject: [PATCH 07/13] ENH: allow iterating over ConsolidatedClient and its parts --- tiled/client/consolidated.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tiled/client/consolidated.py b/tiled/client/consolidated.py index 58b8bb0ec..3be5cfe29 100644 --- a/tiled/client/consolidated.py +++ b/tiled/client/consolidated.py @@ -46,6 +46,9 @@ def __getitem__(self, key): item, include_data_sources=self._include_data_sources, ) + + def __iter__(self): + yield from self.structure().all_keys class ConsolidatedContents: @@ -80,3 +83,7 @@ def __getitem__(self, name): structure=structure, include_data_sources=self.node._include_data_sources, ) + + def __iter__(self): + for item in self.node.structure().parts: + yield item.name \ No newline at end of file From 275f42c7b5909db696dc4511e86e5e1c4806b6ce Mon Sep 17 00:00:00 2001 From: Eugene M Date: Fri, 13 Dec 2024 15:27:23 -0500 Subject: [PATCH 08/13] DOC: add Consolidated Structure to the docs --- docs/source/explanations/catalog.md | 3 +- docs/source/explanations/structures.md | 76 +++++++++++++++++++++++++- docs/source/how-to/register.md | 38 ++++++++++++- docs/source/reference/service.md | 2 + 4 files changed, 116 insertions(+), 3 deletions(-) diff --git a/docs/source/explanations/catalog.md b/docs/source/explanations/catalog.md index 7b6f7be3e..9bd7df587 100644 --- a/docs/source/explanations/catalog.md +++ b/docs/source/explanations/catalog.md @@ -54,7 +54,8 @@ and `assets`, describes the format, structure, and location of the data. 
to the Adapter - `management` --- enum indicating whether the data is registered `"external"` data or `"writable"` data managed by Tiled -- `structure_family` --- enum of structure types (`"container"`, `"array"`, `"table"`, ...) +- `structure_family` --- enum of structure types (`"container"`, `"array"`, `"table"`, + etc. -- except for `consolidated`, which can not be assigned to a Data Source) - `structure_id` --- a foreign key to the `structures` table - `node_id` --- foreign key to `nodes` - `id` --- integer primary key diff --git a/docs/source/explanations/structures.md b/docs/source/explanations/structures.md index 65ce55deb..f72ead1d9 100644 --- a/docs/source/explanations/structures.md +++ b/docs/source/explanations/structures.md @@ -11,7 +11,8 @@ The structure families are: * array --- a strided array, like a [numpy](https://numpy.org) array * awkward --- nested, variable-sized data (as implemented by [AwkwardArray](https://awkward-array.org/)) -* container --- a of other structures, akin to a dictionary or a directory +* consolidated --- a container-like structure to combine tables and arrays in a common namespace +* container --- a collection of other structures, akin to a dictionary or a directory * sparse --- a sparse array (i.e. an array which is mostly zeros) * table --- tabular data, as in [Apache Arrow](https://arrow.apache.org) or [pandas](https://pandas.pydata.org/) @@ -575,3 +576,76 @@ response. "count": 5 } ``` + +### Consolidated + +This is a specialized container-like structure designed to link together multiple tables and arrays that store +related scientific data. It does not support nesting but provides a common namespace across all columns of the +contained tables along with the arrays (thus, name collisions are forbidden). This allows to further abstract out +the disparate internal storage mechanisms (e.g. Parquet for tables and zarr for arrays) and present the user with a +smooth homogeneous interface for data access. 
Consolidated structures do not support pagination and are not
+recommended for "wide" datasets with more than ~1000 items (columns and arrays) in the namespace.
+
+Below is an example of a Consolidated structure that describes two tables and two arrays of various sizes. Their
+respective structures are specified in the `parts` list, and `all_keys` defines the internal namespace of directly
+addressable columns and arrays.
+
+```json
+{
+  "parts": [
+    {
+      "structure_family": "table",
+      "structure": {
+        "arrow_schema": "data:application/vnd.apache.arrow.file;base64,/////...FFFF",
+        "npartitions": 1,
+        "columns": ["A", "B"],
+        "resizable": false
+      },
+      "name": "table1"
+    },
+    {
+      "structure_family": "table",
+      "structure": {
+        "arrow_schema": "data:application/vnd.apache.arrow.file;base64,/////...FFFF",
+        "npartitions": 1,
+        "columns": ["C", "D", "E"],
+        "resizable": false
+      },
+      "name": "table2"
+    },
+    {
+      "structure_family": "array",
+      "structure": {
+        "data_type": {
+          "endianness": "little",
+          "kind": "f",
+          "itemsize": 8,
+          "dt_units": null
+        },
+        "chunks": [[3], [5]],
+        "shape": [3, 5],
+        "dims": null,
+        "resizable": false
+      },
+      "name": "F"
+    },
+    {
+      "structure_family": "array",
+      "structure": {
+        "data_type": {
+          "endianness": "not_applicable",
+          "kind": "u",
+          "itemsize": 1,
+          "dt_units": null
+        },
+        "chunks": [[5], [7], [3]],
+        "shape": [5, 7, 3],
+        "dims": null,
+        "resizable": false
+      },
+      "name": "G"
+    }
+  ],
+  "all_keys": ["A", "B", "C", "D", "E", "F", "G"]
+}
+```
diff --git a/docs/source/how-to/register.md b/docs/source/how-to/register.md
index 2ca73f068..9b98e6d99 100644
--- a/docs/source/how-to/register.md
+++ b/docs/source/how-to/register.md
@@ -72,7 +72,10 @@ Sometimes it is necessary to take more manual control of this
 registration process, such as if you want to take advantage of particular
 knowledge about the files to specify particular `metadata` or `specs`.
 
-Use the Python client, as in this example.
+#### Registering external data + +To register data from external files in Tiled, one can use the Python client and +construct Data Source object explicitly passing the list of assets, as in the following example. ```py import numpy @@ -112,3 +115,36 @@ client.new( specs=[], ) ``` + +#### Writing a consolidated structure + +Similarly, to create a consolidated container structure, one needs to specify +its constituents as separate Data Sources. For example, to consolidate a table +and an array, consider the following example + +```python +import pandas + +rng = numpy.random.default_rng(12345) +arr = rng.random(size=(3, 5), dtype="float64") +df = pandas.DataFrame({"A": ["one", "two", "three"], "B": [1, 2, 3]}) + +node = client.create_consolidated( + [ + DataSource( + structure_family=StructureFamily.table, + structure=TableStructure.from_pandas(df), + name="table1", + ), + DataSource( + structure_family=StructureFamily.array, + structure=ArrayStructure.from_array(arr), + name="C", + ) + ] +) + +# Write the data +node.parts["table1"].write(df) +node.parts["C"].write_block(arr, (0, 0)) +``` \ No newline at end of file diff --git a/docs/source/reference/service.md b/docs/source/reference/service.md index 213a81d57..a28b00fb8 100644 --- a/docs/source/reference/service.md +++ b/docs/source/reference/service.md @@ -104,6 +104,8 @@ See {doc}`../explanations/structures` for more context. 
tiled.structures.array.BuiltinDtype tiled.structures.array.Endianness tiled.structures.array.Kind + tiled.structures.consolidated.ConsolidatedStructure + tiled.structures.consolidated.ConsolidatedStructurePart tiled.structures.core.Spec tiled.structures.core.StructureFamily tiled.structures.table.TableStructure From bb8fcafccac59ed62ba4b2689150265f79f15bbb Mon Sep 17 00:00:00 2001 From: Eugene M Date: Fri, 13 Dec 2024 15:29:18 -0500 Subject: [PATCH 09/13] MNT: remove dims from the Container client signature --- tiled/client/container.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tiled/client/container.py b/tiled/client/container.py index 238967f2a..2e011fa51 100644 --- a/tiled/client/container.py +++ b/tiled/client/container.py @@ -685,7 +685,7 @@ def new( # to attempt to avoid bumping into size limits. _SUGGESTED_MAX_UPLOAD_SIZE = 100_000_000 # 100 MB - def create_container(self, key=None, *, metadata=None, dims=None, specs=None): + def create_container(self, key=None, *, metadata=None, specs=None): """ EXPERIMENTAL: Create a new, empty container. @@ -696,8 +696,6 @@ def create_container(self, key=None, *, metadata=None, dims=None, specs=None): metadata : dict, optional User metadata. May be nested. Must contain only basic types (e.g. numbers, strings, lists, dicts) that are JSON-serializable. - dims : List[str], optional - A label for each dimension of the array. specs : List[Spec], optional List of names that are used to label that the data and/or metadata conform to some named standard specification. @@ -718,11 +716,11 @@ def create_consolidated(self, data_sources, key=None, *, metadata=None, specs=No Parameters ---------- data_sources : List[DataSources] + key : str, optional + Key (name) for this new node. If None, the server will provide a unique key. metadata : dict, optional User metadata. May be nested. Must contain only basic types (e.g. numbers, strings, lists, dicts) that are JSON-serializable. 
- dims : List[str], optional - A label for each dimension of the array. specs : List[Spec], optional List of names that are used to label that the data and/or metadata conform to some named standard specification. From f3ec5f5bacc6a30cfb2ff07a1452496549c94cb4 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Fri, 13 Dec 2024 16:10:33 -0500 Subject: [PATCH 10/13] TST: add tests for writing/reading consolidated structures --- tiled/_tests/test_consolidated.py | 89 +++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 tiled/_tests/test_consolidated.py diff --git a/tiled/_tests/test_consolidated.py b/tiled/_tests/test_consolidated.py new file mode 100644 index 000000000..f593b0886 --- /dev/null +++ b/tiled/_tests/test_consolidated.py @@ -0,0 +1,89 @@ +import math +import string +import warnings +from pathlib import Path + +import numpy +import pytest +import pandas +import pandas.testing + +from ..client import Context, from_context +from ..server.app import build_app +from ..structures.core import StructureFamily +from ..catalog import in_memory + +from ..structures.array import ArrayStructure +from ..structures.core import StructureFamily +from ..structures.data_source import DataSource +from ..structures.table import TableStructure + +rng = numpy.random.default_rng(12345) + +df1 = pandas.DataFrame({"A": ["one", "two", "three"], "B": [1, 2, 3]}) +df2 = pandas.DataFrame({"C": ["red", "green", "blue", "white"], "D": [10., 20., 30., 40.], "E": [0, 0, 0, 0]}) +arr1 = rng.random(size=(3, 5), dtype="float64") +arr2 = rng.integers(0, 255, size=(5, 7, 3), dtype="uint8") +md = {"md_key1":"md_val1", "md_key2":2} + +@pytest.fixture(scope="module") +def tree(tmp_path_factory): + return in_memory(writable_storage=tmp_path_factory.getbasetemp()) + +@pytest.fixture(scope="module") +def context(tree): + with Context.from_app(build_app(tree)) as context: + client = from_context(context) + x = client.create_consolidated( + [ + DataSource( + 
structure_family=StructureFamily.table, + structure=TableStructure.from_pandas(df1), + name="table1", + ), + DataSource( + structure_family=StructureFamily.table, + structure=TableStructure.from_pandas(df2), + name="table2", + ), + DataSource( + structure_family=StructureFamily.array, + structure=ArrayStructure.from_array(arr1), + name="F", + ), + DataSource( + structure_family=StructureFamily.array, + structure=ArrayStructure.from_array(arr2), + name="G", + ), + ], + key="x", + metadata=md + ) + # Write by data source. + x.parts["table1"].write(df1) + x.parts["table2"].write(df2) + x.parts["F"].write_block(arr1, (0, 0)) + x.parts["G"].write_block(arr2, (0, 0, 0)) + + yield context + + +def test_iterate_parts(context): + client = from_context(context) + for part in client['x'].parts: + client['x'].parts[part].read() + + +def test_iterate_columns(context): + client = from_context(context) + for col in client['x']: + if col not in ('A', 'C'): + # TODO: reading string columns directly raises TypeError: Cannot interpret 'string[pyarrow]' as a data type + client['x'][col].read() + client[f'x/{col}'].read() + + +def test_metadata(context): + client = from_context(context) + assert client['x'].metadata == md From 7e74997ecefa60316bf935e184b0402cb48f475d Mon Sep 17 00:00:00 2001 From: Eugene M Date: Fri, 13 Dec 2024 16:13:24 -0500 Subject: [PATCH 11/13] MNT: lint --- docs/source/explanations/structures.md | 2 +- docs/source/how-to/register.md | 2 +- tiled/_tests/test_consolidated.py | 41 +++++++++++++------------- tiled/catalog/adapter.py | 5 +--- tiled/client/consolidated.py | 4 +-- tiled/server/pydantic_consolidated.py | 18 +++++------ tiled/server/router.py | 2 +- tiled/structures/consolidated.py | 18 +++++------ 8 files changed, 45 insertions(+), 47 deletions(-) diff --git a/docs/source/explanations/structures.md b/docs/source/explanations/structures.md index f72ead1d9..89039231d 100644 --- a/docs/source/explanations/structures.md +++ 
b/docs/source/explanations/structures.md @@ -576,7 +576,7 @@ response. "count": 5 } ``` - + ### Consolidated This is a specialized container-like structure designed to link together multiple tables and arrays that store diff --git a/docs/source/how-to/register.md b/docs/source/how-to/register.md index 9b98e6d99..835ea1cb3 100644 --- a/docs/source/how-to/register.md +++ b/docs/source/how-to/register.md @@ -147,4 +147,4 @@ node = client.create_consolidated( # Write the data node.parts["table1"].write(df) node.parts["C"].write_block(arr, (0, 0)) -``` \ No newline at end of file +``` diff --git a/tiled/_tests/test_consolidated.py b/tiled/_tests/test_consolidated.py index f593b0886..4a332d281 100644 --- a/tiled/_tests/test_consolidated.py +++ b/tiled/_tests/test_consolidated.py @@ -1,18 +1,11 @@ -import math -import string -import warnings -from pathlib import Path - import numpy -import pytest import pandas import pandas.testing +import pytest +from ..catalog import in_memory from ..client import Context, from_context from ..server.app import build_app -from ..structures.core import StructureFamily -from ..catalog import in_memory - from ..structures.array import ArrayStructure from ..structures.core import StructureFamily from ..structures.data_source import DataSource @@ -21,15 +14,23 @@ rng = numpy.random.default_rng(12345) df1 = pandas.DataFrame({"A": ["one", "two", "three"], "B": [1, 2, 3]}) -df2 = pandas.DataFrame({"C": ["red", "green", "blue", "white"], "D": [10., 20., 30., 40.], "E": [0, 0, 0, 0]}) +df2 = pandas.DataFrame( + { + "C": ["red", "green", "blue", "white"], + "D": [10.0, 20.0, 30.0, 40.0], + "E": [0, 0, 0, 0], + } +) arr1 = rng.random(size=(3, 5), dtype="float64") arr2 = rng.integers(0, 255, size=(5, 7, 3), dtype="uint8") -md = {"md_key1":"md_val1", "md_key2":2} +md = {"md_key1": "md_val1", "md_key2": 2} + @pytest.fixture(scope="module") def tree(tmp_path_factory): return in_memory(writable_storage=tmp_path_factory.getbasetemp()) + 
@pytest.fixture(scope="module") def context(tree): with Context.from_app(build_app(tree)) as context: @@ -58,7 +59,7 @@ def context(tree): ), ], key="x", - metadata=md + metadata=md, ) # Write by data source. x.parts["table1"].write(df1) @@ -71,19 +72,19 @@ def context(tree): def test_iterate_parts(context): client = from_context(context) - for part in client['x'].parts: - client['x'].parts[part].read() + for part in client["x"].parts: + client["x"].parts[part].read() def test_iterate_columns(context): client = from_context(context) - for col in client['x']: - if col not in ('A', 'C'): - # TODO: reading string columns directly raises TypeError: Cannot interpret 'string[pyarrow]' as a data type - client['x'][col].read() - client[f'x/{col}'].read() + for col in client["x"]: + if col not in ("A", "C"): + # TODO: reading string columns raises TypeError: Cannot interpret 'string[pyarrow]' as a data type + client["x"][col].read() + client[f"x/{col}"].read() def test_metadata(context): client = from_context(context) - assert client['x'].metadata == md + assert client["x"].metadata == md diff --git a/tiled/catalog/adapter.py b/tiled/catalog/adapter.py index a979b4ccd..007dd9d11 100644 --- a/tiled/catalog/adapter.py +++ b/tiled/catalog/adapter.py @@ -62,10 +62,7 @@ ZARR_MIMETYPE, ) from ..query_registration import QueryTranslationRegistry -from ..server.pydantic_consolidated import ( - ConsolidatedStructure, - ConsolidatedStructurePart, -) +from ..server.pydantic_consolidated import ConsolidatedStructure from ..server.pydantic_container import ContainerStructure from ..server.schemas import Asset, DataSource, Management, Revision, Spec from ..structures.core import StructureFamily diff --git a/tiled/client/consolidated.py b/tiled/client/consolidated.py index 3be5cfe29..38a08e228 100644 --- a/tiled/client/consolidated.py +++ b/tiled/client/consolidated.py @@ -46,7 +46,7 @@ def __getitem__(self, key): item, include_data_sources=self._include_data_sources, ) - + def 
__iter__(self): yield from self.structure().all_keys @@ -86,4 +86,4 @@ def __getitem__(self, name): def __iter__(self): for item in self.node.structure().parts: - yield item.name \ No newline at end of file + yield item.name diff --git a/tiled/server/pydantic_consolidated.py b/tiled/server/pydantic_consolidated.py index 2ad79ecb5..5f6c49d07 100644 --- a/tiled/server/pydantic_consolidated.py +++ b/tiled/server/pydantic_consolidated.py @@ -36,14 +36,14 @@ def from_data_sources(cls, data_sources): all_keys.extend(data_source.structure.columns) else: all_keys.append(data_source.name) - parts=[ - ConsolidatedStructurePart( - data_source_id=data_source.id, - structure=data_source.structure, - structure_family=data_source.structure_family, - name=data_source.name, - ) - for data_source in data_sources - ] + parts = [ + ConsolidatedStructurePart( + data_source_id=data_source.id, + structure=data_source.structure, + structure_family=data_source.structure_family, + name=data_source.name, + ) + for data_source in data_sources + ] return cls(parts=parts, all_keys=all_keys) diff --git a/tiled/server/router.py b/tiled/server/router.py index 9e1d111cc..b72b4bc3a 100644 --- a/tiled/server/router.py +++ b/tiled/server/router.py @@ -61,7 +61,7 @@ ) from .file_response_with_range import FileResponseWithRange from .links import links_for_node -from .pydantic_consolidated import ConsolidatedStructure, ConsolidatedStructurePart +from .pydantic_consolidated import ConsolidatedStructure from .settings import get_settings from .utils import filter_for_access, get_base_url, record_timing diff --git a/tiled/structures/consolidated.py b/tiled/structures/consolidated.py index 8a955f256..4e5a3bfc4 100644 --- a/tiled/structures/consolidated.py +++ b/tiled/structures/consolidated.py @@ -37,14 +37,14 @@ def from_data_sources(cls, data_sources): all_keys.extend(data_source.structure.columns) else: all_keys.append(data_source.name) - parts=[ - ConsolidatedStructurePart( - 
data_source_id=data_source.id, - structure=data_source.structure, - structure_family=data_source.structure_family, - name=data_source.name, - ) - for data_source in data_sources - ] + parts = [ + ConsolidatedStructurePart( + data_source_id=data_source.id, + structure=data_source.structure, + structure_family=data_source.structure_family, + name=data_source.name, + ) + for data_source in data_sources + ] return cls(parts=parts, all_keys=all_keys) From 04efd91d794796e6d1be5d4f8fcbd96a8ce463b8 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Fri, 13 Dec 2024 16:25:10 -0500 Subject: [PATCH 12/13] MNT: typing --- tiled/adapters/parquet.py | 2 +- tiled/adapters/table.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tiled/adapters/parquet.py b/tiled/adapters/parquet.py index b8421396d..d7bf14304 100644 --- a/tiled/adapters/parquet.py +++ b/tiled/adapters/parquet.py @@ -166,5 +166,5 @@ def structure(self) -> TableStructure: """ return self._structure - def get(self, key: str) -> ArrayAdapter | None: + def get(self, key: str) -> Union[ArrayAdapter, None]: return self.dataframe_adapter.get(key) diff --git a/tiled/adapters/table.py b/tiled/adapters/table.py index 8f13461c6..6b7ca2a40 100644 --- a/tiled/adapters/table.py +++ b/tiled/adapters/table.py @@ -169,7 +169,7 @@ def __getitem__(self, key: str) -> ArrayAdapter: # Must compute to determine shape. 
return ArrayAdapter.from_array(self.read([key])[key].values) - def get(self, key: str) -> ArrayAdapter | None: + def get(self, key: str) -> Union[ArrayAdapter, None]: if key not in self.structure().columns: return None return ArrayAdapter.from_array(self.read([key])[key].values) From c7ee9629402a5cf43b100b233be3b10fb4b637a9 Mon Sep 17 00:00:00 2001 From: Eugene M Date: Fri, 13 Dec 2024 19:33:13 -0500 Subject: [PATCH 13/13] FIX: reading string-dtype columns from dataframes individually --- tiled/_tests/test_consolidated.py | 6 ++---- tiled/_tests/test_dataframe.py | 22 ++++++++++++++++++++++ tiled/adapters/table.py | 24 ++++++++++-------------- tiled/structures/array.py | 2 +- 4 files changed, 35 insertions(+), 19 deletions(-) diff --git a/tiled/_tests/test_consolidated.py b/tiled/_tests/test_consolidated.py index 4a332d281..103be7520 100644 --- a/tiled/_tests/test_consolidated.py +++ b/tiled/_tests/test_consolidated.py @@ -79,10 +79,8 @@ def test_iterate_parts(context): def test_iterate_columns(context): client = from_context(context) for col in client["x"]: - if col not in ("A", "C"): - # TODO: reading string columns raises TypeError: Cannot interpret 'string[pyarrow]' as a data type - client["x"][col].read() - client[f"x/{col}"].read() + client["x"][col].read() + client[f"x/{col}"].read() def test_metadata(context): diff --git a/tiled/_tests/test_dataframe.py b/tiled/_tests/test_dataframe.py index 1df2163bf..01570356b 100644 --- a/tiled/_tests/test_dataframe.py +++ b/tiled/_tests/test_dataframe.py @@ -41,6 +41,17 @@ pandas.DataFrame({f"column_{i:03d}": i * numpy.ones(5) for i in range(10)}), npartitions=1, ), + # a dataframe with mixed types + "diverse": DataFrameAdapter.from_pandas( + pandas.DataFrame( + { + "A": numpy.array([1, 2, 3], dtype="|u8"), + "B": numpy.array([1, 2, 3], dtype=" str: return f"{type(self).__name__}({self._structure.columns!r})" def __getitem__(self, key: str) -> ArrayAdapter: - """ + # Must compute to determine shape + array = 
self.read([key])[key].values - Parameters - ---------- - key : + # Convert (experimental) pandas.StringDtype to numpy's unicode string dtype + if isinstance(array.dtype, pandas.StringDtype): + import numpy - Returns - ------- + max_size = max((len(i) for i in array.ravel())) + array = array.astype(dtype=numpy.dtype(f" Union[ArrayAdapter, None]: if key not in self.structure().columns: return None - return ArrayAdapter.from_array(self.read([key])[key].values) + return self[key] def items(self) -> Iterator[Tuple[str, ArrayAdapter]]: - yield from ( - (key, ArrayAdapter.from_array(self.read([key])[key].values)) - for key in self._structure.columns - ) + yield from ((key, self[key]) for key in self._structure.columns) def metadata(self) -> JSON: """ diff --git a/tiled/structures/array.py b/tiled/structures/array.py index 53207b84e..901c58fc9 100644 --- a/tiled/structures/array.py +++ b/tiled/structures/array.py @@ -52,7 +52,7 @@ class Kind(str, enum.Enum): unicode = "U" # fixed-length sequence of Py_UNICODE other = "V" # "V" is for "void" -- generic fixed-size chunk of memory - # By default, do not tolerate numpy objectg arrays + # By default, do not tolerate numpy object arrays if os.getenv("TILED_ALLOW_OBJECT_ARRAYS", "0") != "0": object = "O" # Object (i.e. the memory contains a pointer to PyObject)