Skip to content

Commit

Permalink
fix: auto-detect encoding for metadata
Browse files (browse the repository at this point in the history)
  • Loading branch information
davinov committed Apr 11, 2022
1 parent b18445d commit a1b783f
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 0 deletions.
9 changes: 9 additions & 0 deletions peakina/datasource.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,15 @@ def get_metadata(self) -> Dict[str, Any]:
return {} # no metadata for matched datasources
with self.fetcher.open(self.uri) as f:
assert self.type is not None

allowed_params = get_reader_allowed_params(self.type)
# Auto-detect the encoding when none is configured or the configured one fails validation
encoding = self.reader_kwargs.get("encoding")
if "encoding" in allowed_params:
if not validate_encoding(f.name, encoding):
encoding = detect_encoding(f.name)
self.reader_kwargs["encoding"] = encoding

return get_metadata(f.name, self.type, self.reader_kwargs)

@staticmethod
Expand Down
7 changes: 7 additions & 0 deletions tests/test_datasource.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,13 @@ def test_csv_western_encoding(path):
df_meta = ds.get_metadata()
assert df_meta == {"df_rows": 2, "total_rows": 2}

# Encoding auto-detection
ds = DataSource(path("encoded_western_short.csv"))
df = ds.get_df()
assert df.shape == (2, 19)
df_meta = ds.get_metadata()
assert df_meta == {"df_rows": 2, "total_rows": 2}


def test_csv_header_row(path):
"""
Expand Down

0 comments on commit a1b783f

Please sign in to comment.