Skip to content

Commit

Permalink
FIX: reading string-dtype columns from dataframes individually
Browse files Browse the repository at this point in the history
  • Loading branch information
genematx committed Dec 14, 2024
1 parent 04efd91 commit c7ee962
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 19 deletions.
6 changes: 2 additions & 4 deletions tiled/_tests/test_consolidated.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,8 @@ def test_iterate_parts(context):
def test_iterate_columns(context):
client = from_context(context)
for col in client["x"]:
if col not in ("A", "C"):
# TODO: reading string columns raises TypeError: Cannot interpret 'string[pyarrow]' as a data type
client["x"][col].read()
client[f"x/{col}"].read()
client["x"][col].read()
client[f"x/{col}"].read()


def test_metadata(context):
Expand Down
22 changes: 22 additions & 0 deletions tiled/_tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,17 @@
pandas.DataFrame({f"column_{i:03d}": i * numpy.ones(5) for i in range(10)}),
npartitions=1,
),
# a dataframe with mixed types
"diverse": DataFrameAdapter.from_pandas(
pandas.DataFrame(
{
"A": numpy.array([1, 2, 3], dtype="|u8"),
"B": numpy.array([1, 2, 3], dtype="<f8"),
"C": ["one", "two", "three"],
}
),
npartitions=1,
),
}
)

Expand Down Expand Up @@ -100,6 +111,17 @@ def test_dataframe_single_partition(context):
pandas.testing.assert_frame_equal(actual, expected)


def test_reading_diverse_dtypes(context):
    """Read the mixed-dtype "diverse" table whole, then one column at a time."""
    client = from_context(context)
    reference = tree["diverse"].read()

    # The full table fetched through the client must match the adapter's own read.
    pandas.testing.assert_frame_equal(client["diverse"].read(), reference)

    # Each column must also be readable on its own and match the reference.
    for name in reference.columns:
        column = client["diverse"][name].read()
        assert numpy.array_equal(reference[name], column)


def test_dask(context):
client = from_context(context, "dask")["basic"]
expected = tree["basic"].read()
Expand Down
24 changes: 10 additions & 14 deletions tiled/adapters/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,29 +156,25 @@ def __repr__(self) -> str:
return f"{type(self).__name__}({self._structure.columns!r})"

def __getitem__(self, key: str) -> ArrayAdapter:
"""
# Must compute to determine shape
array = self.read([key])[key].values

Parameters
----------
key :
# Convert (experimental) pandas.StringDtype to numpy's unicode string dtype
if isinstance(array.dtype, pandas.StringDtype):
import numpy

Returns
-------
max_size = max((len(i) for i in array.ravel()))
array = array.astype(dtype=numpy.dtype(f"<U{max_size}"))

"""
# Must compute to determine shape.
return ArrayAdapter.from_array(self.read([key])[key].values)
return ArrayAdapter.from_array(array)

def get(self, key: str) -> Union[ArrayAdapter, None]:
if key not in self.structure().columns:
return None
return ArrayAdapter.from_array(self.read([key])[key].values)
return self[key]

def items(self) -> Iterator[Tuple[str, ArrayAdapter]]:
yield from (
(key, ArrayAdapter.from_array(self.read([key])[key].values))
for key in self._structure.columns
)
yield from ((key, self[key]) for key in self._structure.columns)

def metadata(self) -> JSON:
"""
Expand Down
2 changes: 1 addition & 1 deletion tiled/structures/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ class Kind(str, enum.Enum):
unicode = "U" # fixed-length sequence of Py_UNICODE
other = "V" # "V" is for "void" -- generic fixed-size chunk of memory

# By default, do not tolerate numpy objectg arrays
# By default, do not tolerate numpy object arrays
if os.getenv("TILED_ALLOW_OBJECT_ARRAYS", "0") != "0":
object = "O" # Object (i.e. the memory contains a pointer to PyObject)

Expand Down

0 comments on commit c7ee962

Please sign in to comment.