FIX: reading string-dtype columns from dataframes individually

bluesky · Dec 14, 2024 · c7ee962
1 parent 04efd91
commit c7ee962
Show file tree

Hide file tree

Showing 4 changed files with 35 additions and 19 deletions.
diff --git a/tiled/_tests/test_consolidated.py b/tiled/_tests/test_consolidated.py
@@ -79,10 +79,8 @@ def test_iterate_parts(context):
 def test_iterate_columns(context):
     client = from_context(context)
     for col in client["x"]:
-        if col not in ("A", "C"):
-            # TODO: reading string columns raises TypeError: Cannot interpret 'string[pyarrow]' as a data type
-            client["x"][col].read()
-            client[f"x/{col}"].read()
+        client["x"][col].read()
+        client[f"x/{col}"].read()
 
 
 def test_metadata(context):

diff --git a/tiled/_tests/test_dataframe.py b/tiled/_tests/test_dataframe.py
@@ -41,6 +41,17 @@
             pandas.DataFrame({f"column_{i:03d}": i * numpy.ones(5) for i in range(10)}),
             npartitions=1,
         ),
+        # a dataframe with mixed types
+        "diverse": DataFrameAdapter.from_pandas(
+            pandas.DataFrame(
+                {
+                    "A": numpy.array([1, 2, 3], dtype="|u8"),
+                    "B": numpy.array([1, 2, 3], dtype="<f8"),
+                    "C": ["one", "two", "three"],
+                }
+            ),
+            npartitions=1,
+        ),
     }
 )
 
@@ -100,6 +111,17 @@ def test_dataframe_single_partition(context):
     pandas.testing.assert_frame_equal(actual, expected)
 
 
+def test_reading_diverse_dtypes(context):
+    client = from_context(context)
+    expected = tree["diverse"].read()
+    actual = client["diverse"].read()
+    pandas.testing.assert_frame_equal(actual, expected)
+
+    for col in expected.columns:
+        actual = client["diverse"][col].read()
+        assert numpy.array_equal(expected[col], actual)
+
+
 def test_dask(context):
     client = from_context(context, "dask")["basic"]
     expected = tree["basic"].read()

diff --git a/tiled/adapters/table.py b/tiled/adapters/table.py
@@ -156,29 +156,25 @@ def __repr__(self) -> str:
         return f"{type(self).__name__}({self._structure.columns!r})"
 
     def __getitem__(self, key: str) -> ArrayAdapter:
-        """
+        # Must compute to determine shape
+        array = self.read([key])[key].values
 
-        Parameters
-        ----------
-        key :
+        # Convert (experimental) pandas.StringDtype to numpy's unicode string dtype
+        if isinstance(array.dtype, pandas.StringDtype):
+            import numpy
 
-        Returns
-        -------
+            max_size = max((len(i) for i in array.ravel()))
+            array = array.astype(dtype=numpy.dtype(f"<U{max_size}"))
 
-        """
-        # Must compute to determine shape.
-        return ArrayAdapter.from_array(self.read([key])[key].values)
+        return ArrayAdapter.from_array(array)
 
     def get(self, key: str) -> Union[ArrayAdapter, None]:
         if key not in self.structure().columns:
             return None
-        return ArrayAdapter.from_array(self.read([key])[key].values)
+        return self[key]
 
     def items(self) -> Iterator[Tuple[str, ArrayAdapter]]:
-        yield from (
-            (key, ArrayAdapter.from_array(self.read([key])[key].values))
-            for key in self._structure.columns
-        )
+        yield from ((key, self[key]) for key in self._structure.columns)
 
     def metadata(self) -> JSON:
         """

diff --git a/tiled/structures/array.py b/tiled/structures/array.py
@@ -52,7 +52,7 @@ class Kind(str, enum.Enum):
     unicode = "U"  # fixed-length sequence of Py_UNICODE
     other = "V"  # "V" is for "void" -- generic fixed-size chunk of memory
 
-    # By default, do not tolerate numpy objectg arrays
+    # By default, do not tolerate numpy object arrays
     if os.getenv("TILED_ALLOW_OBJECT_ARRAYS", "0") != "0":
         object = "O"  # Object (i.e. the memory contains a pointer to PyObject)