Skip to content

Commit

Permalink
Improve tests
Browse files Browse the repository at this point in the history
  • Loading branch information
theroggy committed Jan 23, 2025
1 parent 41c9da6 commit f53af87
Showing 1 changed file with 92 additions and 69 deletions.
161 changes: 92 additions & 69 deletions pyogrio/tests/test_geopandas_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
import geopandas as gp
import pandas as pd
from geopandas.array import from_wkt
from pandas.api.types import is_datetime64_dtype
from pandas.api.types import is_datetime64_dtype, is_object_dtype

import shapely # if geopandas is present, shapely is expected to be present
from shapely.geometry import Point
Expand Down Expand Up @@ -296,6 +296,35 @@ def test_read_datetime(datetime_file, use_arrow):
assert df.col.dtype.name == "datetime64[ns]"


@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"])
@pytest.mark.requires_arrow_write_api
def test_write_read_datetime_no_tz(tmp_path, ext, use_arrow):
"""Test writing/reading a column with naive datetimes (no timezone information)."""
dates_raw = ["2020-01-01 09:00:00.123", "2020-01-01 10:00:00", None]
if PANDAS_GE_20:
dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms")
else:
dates = pd.to_datetime(dates_raw)
df = gp.GeoDataFrame(
{"dates": dates, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326"
)

fpath = tmp_path / f"test{ext}"
write_dataframe(df, fpath, use_arrow=use_arrow)
result = read_dataframe(fpath, use_arrow=use_arrow)

if use_arrow and ext == ".gpkg" and __gdal_version__ < (3, 11, 0):
# With GDAL < 3.11 with arrow, columns with naive datetimes are written
# correctly, but when read they are wrongly interpreted as being in UTC.
# The reason is complicated, but more info can be found e.g. here:
# https://github.com/geopandas/pyogrio/issues/487#issuecomment-2423762807
assert_series_equal(result.dates, df.dates.dt.tz_localize("UTC"))
pytest.xfail("naive datetimes read wrong in GPKG with GDAL < 3.11 via arrow")

assert is_datetime64_dtype(result.dates.dtype)
assert_geodataframe_equal(result, df)


@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"])
@pytest.mark.filterwarnings("ignore: Non-conformant content for record 1 in column ")
@pytest.mark.requires_arrow_write_api
Expand All @@ -310,7 +339,7 @@ def test_write_read_datetime_tz(tmp_path, ext, use_arrow):
# This was fixed in https://github.com/OSGeo/gdal/pull/11049
pytest.xfail("Wrong datetimes read in GeoJSON with GDAL < 3.10 via arrow")

dates_raw = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00"]
dates_raw = ["2020-01-01T09:00:00.123-05:00", "2020-01-01T10:00:00-05:00", None]
if PANDAS_GE_20:
dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms")
else:
Expand All @@ -319,8 +348,8 @@ def test_write_read_datetime_tz(tmp_path, ext, use_arrow):
# Make the index non-consecutive to test this case as well. Added for issue
# https://github.com/geopandas/pyogrio/issues/324
df = gp.GeoDataFrame(
{"dates": dates, "geometry": [Point(1, 1), Point(1, 1)]},
index=[0, 2],
{"dates": dates, "geometry": [Point(1, 1)] * 3},
index=[0, 2, 3],
crs="EPSG:4326",
)
assert isinstance(df.dates.dtype, pd.DatetimeTZDtype)
Expand All @@ -330,62 +359,77 @@ def test_write_read_datetime_tz(tmp_path, ext, use_arrow):
result = read_dataframe(fpath, use_arrow=use_arrow)

# With some older versions, the offset is represented slightly differently
if str(result.dates.dtype).endswith(", pytz.FixedOffset(-300)]"):
if result.dates.dtype.name.endswith(", pytz.FixedOffset(-300)]"):
result.dates = result.dates.astype(df.dates.dtype)

if use_arrow and __gdal_version__ < (3, 11, 0):
if ext in (".fgb", ".gpkg"):
# With GDAL < 3.11 with arrow, datetime columns are written as string type
# columns
df.dates = df.dates.map(lambda x: x.isoformat())
if use_arrow and ext in (".fgb", ".gpkg") and __gdal_version__ < (3, 11, 0):
# With GDAL < 3.11 with arrow, datetime columns are written as string type
df_exp = df.copy()
df_exp.dates = df_exp[df_exp.dates.notna()].dates.astype(str)
assert_series_equal(result.dates, df_exp.dates, check_index=False)
pytest.xfail("datetime columns written as string with GDAL < 3.11 via arrow")

assert isinstance(df.dates.dtype, pd.DatetimeTZDtype)
assert_series_equal(result.dates, df.dates, check_index=False)


@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"])
@pytest.mark.filterwarnings(
"ignore: Non-conformant content for record 1 in column dates"
)
@pytest.mark.requires_arrow_write_api
def test_write_read_datetime_localized_mixed_offset(tmp_path, use_arrow):
def test_write_read_datetime_tz_localized_mixed_offset(tmp_path, ext, use_arrow):
"""Test with localized dates across a different summer/winter timezone offset."""
# Australian Summer Time AEDT (GMT+11), Standard Time AEST (GMT+10)
dates_raw = ["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111"]
dates_raw = ["2023-01-01 11:00:01.111", "2023-06-01 10:00:01.111", None]
dates_naive = pd.Series(pd.to_datetime(dates_raw), name="dates")
dates_local = dates_naive.dt.tz_localize("Australia/Sydney")
dates_local_offsets_str = dates_local.map(pd.Timestamp.isoformat)
dates_local_offsets_str = dates_local.astype("string").astype("O")
dates_exp = dates_local_offsets_str.map(pd.Timestamp)

df = gp.GeoDataFrame(
{"dates": dates_local, "geometry": [Point(1, 1), Point(1, 1)]},
crs="EPSG:4326",
{"dates": dates_local, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326"
)
fpath = tmp_path / "test.gpkg"
fpath = tmp_path / f"test{ext}"
write_dataframe(df, fpath, use_arrow=use_arrow)
result = read_dataframe(fpath, use_arrow=use_arrow)

if use_arrow and __gdal_version__ < (3, 11, 0):
# With GDAL < 3.11 with arrow, datetime columns written as string type columns
dates_exp = dates_local_offsets_str
if ext in (".geojson", ".geojsonl"):
# With GDAL < 3.11 with arrow, GDAL converts mixed timezone datetimes to UTC
# when read, as the arrow datetime column type does not support mixed tz.
dates_utc = dates_local.dt.tz_convert("UTC")
if PANDAS_GE_20:
dates_utc = dates_utc.dt.as_unit("ms")
assert_series_equal(result.dates, dates_utc)
pytest.xfail("mixed tz datetimes converted to UTC with GDAL < 3.11 + arrow")
elif ext in (".gpkg", ".fgb"):
# With GDAL < 3.11 with arrow, datetime columns written as string type
assert_series_equal(result.dates, dates_local_offsets_str)
pytest.xfail("datetime columns written as string with GDAL < 3.11 + arrow")

# GDAL tz only encodes offsets, not timezones
assert_series_equal(result["dates"], dates_exp)
assert is_object_dtype(result.dates.dtype)
assert_series_equal(result.dates, dates_exp)


@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"])
@pytest.mark.filterwarnings(
"ignore: Non-conformant content for record 1 in column dates"
)
@pytest.mark.requires_arrow_write_api
def test_write_read_datetime_mixed_offsets(tmp_path, ext, use_arrow):
def test_write_read_datetime_tz_mixed_offsets(tmp_path, ext, use_arrow):
"""Test with dates with mixed timezone offsets."""
# Pandas datetime64 column types don't support mixed timezone offsets, so
# it needs to be a list of pandas.Timestamp objects instead.
dates_raw = ["2023-01-01 11:00:01.111+01:00", "2023-06-01 10:00:01.111+05:00"]
dates_ts = list(map(pd.Timestamp, dates_raw))
dates = [
pd.Timestamp("2023-01-01 11:00:01.111+01:00"),
pd.Timestamp("2023-06-01 10:00:01.111+05:00"),
None,
]

df = gp.GeoDataFrame(
{"dates": dates_ts, "geometry": [Point(1, 1), Point(1, 1)]},
crs="EPSG:4326",
{"dates": dates, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326"
)
fpath = tmp_path / f"test{ext}"
write_dataframe(df, fpath, use_arrow=use_arrow)
Expand All @@ -395,44 +439,21 @@ def test_write_read_datetime_mixed_offsets(tmp_path, ext, use_arrow):
if ext in (".geojson", ".geojsonl"):
# With GDAL < 3.11 with arrow, GDAL converts mixed timezone datetimes to UTC
# when read, as the arrow datetime column type does not support mixed tz.
df_exp = df.copy()
df_exp.dates = pd.to_datetime(dates, utc=True)
if PANDAS_GE_20:
df.dates = pd.to_datetime(dates_ts, utc=True).as_unit("ms")
else:
df.dates = pd.to_datetime(dates_ts, utc=True)
df_exp.dates = df_exp.dates.dt.as_unit("ms")
assert_geodataframe_equal(result, df_exp)
pytest.xfail("mixed tz datetimes converted to UTC with GDAL < 3.11 + arrow")
elif ext in (".gpkg", ".fgb"):
# With arrow and GDAL < 3.11, mixed timezone datetimes are written as string
# type columns, so no proper roundtrip possible.
df.dates = df.dates.map(pd.Timestamp.isoformat)
df_exp = df.copy()
df_exp.dates = df_exp.dates.astype("string").astype("O")
assert_geodataframe_equal(result, df_exp)
pytest.xfail("mixed tz datetimes converted to UTC with GDAL < 3.11 + arrow")

assert_geodataframe_equal(result, df)


@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"])
@pytest.mark.requires_arrow_write_api
def test_write_read_datetime_no_tz(tmp_path, ext, use_arrow):
"""Test writing/reading a column with naive datetimes (no timezone information)."""
dates_raw = ["2020-01-01 09:00:00.123", "2020-01-01 10:00:00"]
if PANDAS_GE_20:
dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms")
else:
dates = pd.to_datetime(dates_raw)
df = gp.GeoDataFrame(
{"dates": dates, "geometry": [Point(1, 1), Point(1, 1)]}, crs="EPSG:4326"
)

fpath = tmp_path / f"test{ext}"
write_dataframe(df, fpath, use_arrow=use_arrow)
result = read_dataframe(fpath, use_arrow=use_arrow)

if use_arrow and ext == ".gpkg" and __gdal_version__ < (3, 11, 0):
# With GDAL < 3.11 with arrow, columns with naive datetimes are written
# correctly, but when read they are wrongly interpreted as being in UTC.
# The reason is complicated, but more info can be found e.g. here:
# https://github.com/geopandas/pyogrio/issues/487#issuecomment-2423762807
assert_series_equal(result.dates, df.dates.dt.tz_localize("UTC"))
pytest.xfail("naive datetimes read wrong in GPKG with GDAL < 3.11 via arrow")

assert is_datetime64_dtype(result.dates.dtype)
assert is_object_dtype(result.dates.dtype)
assert_geodataframe_equal(result, df)


Expand All @@ -456,8 +477,8 @@ def test_write_read_datetime_no_tz(tmp_path, ext, use_arrow):
"ignore: Non-conformant content for record 1 in column dates"
)
@pytest.mark.requires_arrow_write_api
def test_write_read_datetime_objects_with_nulls(tmp_path, dates_raw, ext, use_arrow):
"""Datetime objects with null values and the equal offset are read as datetime64."""
def test_write_read_datetime_tz_objects(tmp_path, dates_raw, ext, use_arrow):
"""Datetime objects with equal offsets are read as datetime64."""
if use_arrow and __gdal_version__ < (3, 10, 0) and ext in (".geojson", ".geojsonl"):
# With GDAL < 3.10 with arrow, the timezone offset was applied to the datetime
# as well as retaining the timezone.
Expand All @@ -466,9 +487,9 @@ def test_write_read_datetime_objects_with_nulls(tmp_path, dates_raw, ext, use_ar

dates = pd.Series(dates_raw, dtype="O")
df = gp.GeoDataFrame(
{"dates": dates, "geometry": [Point(1, 1), Point(1, 1), Point(1, 1)]},
crs="EPSG:4326",
{"dates": dates, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326"
)

if PANDAS_GE_20:
dates_exp = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms")
else:
Expand All @@ -481,32 +502,34 @@ def test_write_read_datetime_objects_with_nulls(tmp_path, dates_raw, ext, use_ar
result = read_dataframe(fpath, use_arrow=use_arrow)

# With some older versions, the offset is represented slightly differently
if str(result.dates.dtype).endswith(", pytz.FixedOffset(-300)]"):
if result.dates.dtype.name.endswith(", pytz.FixedOffset(-300)]"):
result.dates = result.dates.astype(exp_df.dates.dtype)

if use_arrow and __gdal_version__ < (3, 11, 0):
if ext in (".fgb", ".gpkg"):
# With GDAL < 3.11 with arrow, datetime columns are written as string type
# columns
exp_df.dates = exp_df.dates.map(
lambda x: x.isoformat() if x is not pd.NaT else None
)
exp2_df = exp_df.copy()
exp2_df.dates = exp2_df.dates.astype("string").astype("O")
assert_geodataframe_equal(result, exp2_df)
pytest.xfail("datetime columns written as string with GDAL < 3.11 + arrow")

assert isinstance(result.dates.dtype, pd.DatetimeTZDtype)
assert_geodataframe_equal(result, exp_df)


@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext != ".shp"])
@pytest.mark.requires_arrow_write_api
def test_write_read_datetime_utc(tmp_path, ext, use_arrow):
"""Test writing/reading a column with UTC datetimes."""
dates_raw = ["2020-01-01 09:00:00.123Z", "2020-01-01 10:00:00Z"]
dates_raw = ["2020-01-01 09:00:00.123Z", "2020-01-01 10:00:00Z", None]
if PANDAS_GE_20:
dates = pd.to_datetime(dates_raw, format="ISO8601").as_unit("ms")
else:
dates = pd.to_datetime(dates_raw)
df = gp.GeoDataFrame(
{"dates": dates, "geometry": [Point(1, 1), Point(1, 1)]}, crs="EPSG:4326"
{"dates": dates, "geometry": [Point(1, 1)] * 3}, crs="EPSG:4326"
)
assert df.dates.dtype.name == "datetime64[ms, UTC]"

fpath = tmp_path / f"test{ext}"
write_dataframe(df, fpath, use_arrow=use_arrow)
Expand All @@ -517,7 +540,7 @@ def test_write_read_datetime_utc(tmp_path, ext, use_arrow):
assert_series_equal(result.dates, df.dates.dt.tz_localize(None))
pytest.xfail("UTC datetimes read wrong in .fgb with GDAL < 3.11 via arrow")

assert str(result.dates.dtype) == "datetime64[ms, UTC]"
assert result.dates.dtype.name == "datetime64[ms, UTC]"
assert_geodataframe_equal(result, df)


Expand Down

0 comments on commit f53af87

Please sign in to comment.