Skip to content

Commit

Permalink
Fixes #440 - Normalize stream inputs as IO streams
Browse files Browse the repository at this point in the history
  • Loading branch information
claudep committed Jan 11, 2020
1 parent 6152d99 commit 660990b
Show file tree
Hide file tree
Showing 13 changed files with 86 additions and 41 deletions.
3 changes: 3 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@

- Formats can now be dynamically registered through the
`tablib.formats.registry.register` API (#256).
- Tablib methods expecting data input (`detect_format`, `import_set`,
`Dataset.load`, `Databook.load`) now accept file-like objects in addition
to raw strings and bytestrings (#440).

### Bugfixes

Expand Down
9 changes: 6 additions & 3 deletions docs/tutorial.rst
Original file line number Diff line number Diff line change
Expand Up @@ -106,15 +106,17 @@ Importing Data
--------------
Creating a :class:`tablib.Dataset` object by importing a pre-existing file is simple. ::

imported_data = Dataset().load(open('data.csv').read())
with open('data.csv', 'r') as fh:
imported_data = Dataset().load(fh)

This detects what sort of data is being passed in, and uses an appropriate formatter to do the import. So you can import from a variety of different file types.

.. admonition:: Source without headers

When the format is :class:`csv <Dataset.csv>`, :class:`tsv <Dataset.tsv>`, :class:`dbf <Dataset.dbf>`, :class:`xls <Dataset.xls>` or :class:`xlsx <Dataset.xlsx>`, and the data source does not have headers, the import should be done as follows ::

imported_data = Dataset().load(open('data.csv').read(), headers=False)
with open('data.csv', 'r') as fh:
imported_data = Dataset().load(fh, headers=False)

--------------
Exporting Data
Expand Down Expand Up @@ -320,7 +322,8 @@ Open an Excel Workbook and read first sheet
Open an Excel 2007 and later workbook with a single sheet (or a workbook with multiple sheets but you just want the first sheet). ::

data = tablib.Dataset()
data.xlsx = open('my_excel_file.xlsx', 'rb').read()
with open('my_excel_file.xlsx', 'rb') as fh:
data.load(fh, 'xlsx')
print(data)

Excel Workbook With Multiple Sheets
Expand Down
37 changes: 25 additions & 12 deletions src/tablib/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
UnsupportedFormat,
)
from tablib.formats import registry
from tablib.utils import normalize_input

__title__ = 'tablib'
__author__ = 'Kenneth Reitz'
Expand Down Expand Up @@ -239,8 +240,9 @@ def __str__(self):
def _get_in_format(self, fmt_key, **kwargs):
return registry.get_format(fmt_key).export_set(self, **kwargs)

def _set_in_format(self, fmt_key, *args, **kwargs):
return registry.get_format(fmt_key).import_set(self, *args, **kwargs)
def _set_in_format(self, fmt_key, in_stream, **kwargs):
in_stream = normalize_input(in_stream)
return registry.get_format(fmt_key).import_set(self, in_stream, **kwargs)

def _validate(self, row=None, col=None, safety=False):
"""Assures size of every row in dataset is of proper proportions."""
Expand Down Expand Up @@ -402,12 +404,14 @@ def width(self):
def load(self, in_stream, format=None, **kwargs):
"""
Import `in_stream` to the :class:`Dataset` object using the `format`.
`in_stream` can be a file-like object, a string, or a bytestring.
:param \\*\\*kwargs: (optional) custom configuration to the format `import_set`.
"""

stream = normalize_input(in_stream)
if not format:
format = detect_format(in_stream)
format = detect_format(stream)

fmt = registry.get_format(format)
if not hasattr(fmt, 'import_set'):
Expand All @@ -416,7 +420,7 @@ def load(self, in_stream, format=None, **kwargs):
if not import_set:
raise UnsupportedFormat('Format {} cannot be imported.'.format(format))

fmt.import_set(self, in_stream, **kwargs)
fmt.import_set(self, stream, **kwargs)
return self

def export(self, format, **kwargs):
Expand Down Expand Up @@ -861,18 +865,20 @@ def size(self):
def load(self, in_stream, format, **kwargs):
"""
Import `in_stream` to the :class:`Databook` object using the `format`.
`in_stream` can be a file-like object, a string, or a bytestring.
:param \\*\\*kwargs: (optional) custom configuration to the format `import_book`.
"""

stream = normalize_input(in_stream)
if not format:
format = detect_format(in_stream)
format = detect_format(stream)

fmt = registry.get_format(format)
if not hasattr(fmt, 'import_book'):
raise UnsupportedFormat('Format {} cannot be loaded.'.format(format))

fmt.import_book(self, in_stream, **kwargs)
fmt.import_book(self, stream, **kwargs)
return self

def export(self, format, **kwargs):
Expand All @@ -889,25 +895,32 @@ def export(self, format, **kwargs):


def detect_format(stream):
"""Return format name of given stream."""
"""Return format name of given stream (file-like object, string, or bytestring)."""
stream = normalize_input(stream)
fmt_title = None
for fmt in registry.formats():
try:
if fmt.detect(stream):
return fmt.title
fmt_title = fmt.title
break
except AttributeError:
pass
finally:
if hasattr(stream, 'seek'):
stream.seek(0)
return fmt_title


def import_set(stream, format=None, **kwargs):
"""Return dataset of given stream."""
"""Return dataset of given stream (file-like object, string, or bytestring)."""

return Dataset().load(stream, format, **kwargs)
return Dataset().load(normalize_input(stream), format, **kwargs)


def import_book(stream, format=None, **kwargs):
"""Return dataset of given stream."""
"""Return dataset of given stream (file-like object, string, or bytestring)."""

return Databook().load(stream, format, **kwargs)
return Databook().load(normalize_input(stream), format, **kwargs)


registry.register_builtins()
5 changes: 3 additions & 2 deletions src/tablib/formats/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from importlib.util import find_spec

from tablib.exceptions import UnsupportedFormat
from tablib.utils import normalize_input

from ._csv import CSVFormat
from ._json import JSONFormat
Expand Down Expand Up @@ -52,7 +53,7 @@ def __get__(self, obj, cls, **kwargs):

def __set__(self, obj, val):
self.ensure_format_loaded()
return self._format.import_book(obj, val)
return self._format.import_book(obj, normalize_input(val))


class ImportExportSetDescriptor(FormatDescriptorBase):
Expand All @@ -62,7 +63,7 @@ def __get__(self, obj, cls, **kwargs):

def __set__(self, obj, val):
self.ensure_format_loaded()
return self._format.import_set(obj, val)
return self._format.import_set(obj, normalize_input(val))


class Registry:
Expand Down
4 changes: 2 additions & 2 deletions src/tablib/formats/_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def import_set(cls, dset, in_stream, headers=True, **kwargs):

kwargs.setdefault('delimiter', cls.DEFAULT_DELIMITER)

rows = csv.reader(StringIO(in_stream), **kwargs)
rows = csv.reader(in_stream, **kwargs)
for i, row in enumerate(rows):

if (i == 0) and (headers):
Expand All @@ -52,7 +52,7 @@ def import_set(cls, dset, in_stream, headers=True, **kwargs):
def detect(cls, stream, delimiter=None):
"""Returns True if given stream is valid CSV."""
try:
csv.Sniffer().sniff(stream[:1024], delimiters=delimiter or cls.DEFAULT_DELIMITER)
csv.Sniffer().sniff(stream.read(1024), delimiters=delimiter or cls.DEFAULT_DELIMITER)
return True
except Exception:
return False
7 changes: 2 additions & 5 deletions src/tablib/formats/_dbf.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def import_set(cls, dset, in_stream, headers=True):
"""Returns a dataset from a DBF stream."""

dset.wipe()
_dbf = dbf.Dbf(io.BytesIO(in_stream))
_dbf = dbf.Dbf(in_stream)
dset.headers = _dbf.fieldNames
for record in range(_dbf.recordCount):
row = [_dbf[record][f] for f in _dbf.fieldNames]
Expand All @@ -59,11 +59,8 @@ def import_set(cls, dset, in_stream, headers=True):
@classmethod
def detect(cls, stream):
"""Returns True if the given stream is valid DBF"""
#_dbf = dbf.Table(StringIO(stream))
try:
if type(stream) is not bytes:
stream = bytes(stream, 'utf-8')
_dbf = dbf.Dbf(io.BytesIO(stream), readOnly=True)
_dbf = dbf.Dbf(stream, readOnly=True)
return True
except Exception:
return False
4 changes: 3 additions & 1 deletion src/tablib/formats/_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@ def detect(cls, stream):
"""Returns True if given stream is a DataFrame."""
if DataFrame is None:
return False
elif isinstance(stream, DataFrame):
return True
try:
DataFrame(stream)
DataFrame(stream.read())
return True
except ValueError:
return False
Expand Down
6 changes: 3 additions & 3 deletions src/tablib/formats/_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,14 @@ def import_set(cls, dset, in_stream):
"""Returns dataset from JSON stream."""

dset.wipe()
dset.dict = json.loads(in_stream)
dset.dict = json.load(in_stream)

@classmethod
def import_book(cls, dbook, in_stream):
"""Returns databook from JSON stream."""

dbook.wipe()
for sheet in json.loads(in_stream):
for sheet in json.load(in_stream):
data = tablib.Dataset()
data.title = sheet['title']
data.dict = sheet['data']
Expand All @@ -52,7 +52,7 @@ def import_book(cls, dbook, in_stream):
def detect(cls, stream):
"""Returns True if given stream is valid JSON."""
try:
json.loads(stream)
json.load(stream)
return True
except (TypeError, ValueError):
return False
2 changes: 1 addition & 1 deletion src/tablib/formats/_xls.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def import_set(cls, dset, in_stream, headers=True):

dset.wipe()

xls_book = xlrd.open_workbook(file_contents=in_stream)
xls_book = xlrd.open_workbook(file_contents=in_stream.read())
sheet = xls_book.sheet_by_index(0)

dset.title = sheet.name
Expand Down
7 changes: 2 additions & 5 deletions src/tablib/formats/_xlsx.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,6 @@ class XLSXFormat:
@classmethod
def detect(cls, stream):
"""Returns True if given stream is a readable excel file."""
if isinstance(stream, bytes):
# load_workbook expects a file-like object.
stream = BytesIO(stream)
try:
openpyxl.reader.excel.load_workbook(stream, read_only=True)
return True
Expand Down Expand Up @@ -63,7 +60,7 @@ def import_set(cls, dset, in_stream, headers=True):

dset.wipe()

xls_book = openpyxl.reader.excel.load_workbook(BytesIO(in_stream), read_only=True)
xls_book = openpyxl.reader.excel.load_workbook(in_stream, read_only=True)
sheet = xls_book.active

dset.title = sheet.title
Expand All @@ -81,7 +78,7 @@ def import_book(cls, dbook, in_stream, headers=True):

dbook.wipe()

xls_book = openpyxl.reader.excel.load_workbook(BytesIO(in_stream), read_only=True)
xls_book = openpyxl.reader.excel.load_workbook(in_stream, read_only=True)

for sheet in xls_book.worksheets:
data = tablib.Dataset()
Expand Down
13 changes: 13 additions & 0 deletions src/tablib/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from io import BytesIO, StringIO


def normalize_input(stream):
    """
    Accept either a str/bytes stream or a file-like object and always return a
    file-like object.

    :param stream: a text string, a bytes-like object (``bytes``,
        ``bytearray`` or ``memoryview``), or any other object.
    :return: a ``StringIO`` wrapping text input, a ``BytesIO`` wrapping
        bytes-like input; any other object is returned unchanged.
    """
    if isinstance(stream, str):
        return StringIO(stream)
    # bytearray/memoryview have no ``read()`` either, so they need the same
    # wrapping as plain bytes; BytesIO accepts any bytes-like initial value.
    if isinstance(stream, (bytes, bytearray, memoryview)):
        return BytesIO(stream)
    # Everything else is handed back as-is for the caller to consume.
    return stream
Binary file added tests/files/founders.xlsx
Binary file not shown.
30 changes: 23 additions & 7 deletions tests/test_tablib.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import pickle
import unittest
from collections import OrderedDict
from io import BytesIO, StringIO
from pathlib import Path
from uuid import uuid4

Expand Down Expand Up @@ -302,6 +303,18 @@ def test_book_unsupported_export(self):
with self.assertRaises(UnsupportedFormat):
book.export('csv')

def test_book_import_from_file(self):
xlsx_source = Path(__file__).parent / 'files' / 'founders.xlsx'
with open(str(xlsx_source), mode='rb') as fh:
book = tablib.Databook().load(fh, 'xlsx')
self.assertEqual(eval(book.json)[0]['title'], 'Feuille1')

def test_dataset_import_from_file(self):
xlsx_source = Path(__file__).parent / 'files' / 'founders.xlsx'
with open(str(xlsx_source), mode='rb') as fh:
dset = tablib.Dataset().load(fh, 'xlsx')
self.assertEqual(eval(dset.json)[0]['last_name'], 'Adams')

def test_auto_format_detect(self):
"""Test auto format detection."""
# html, jira, latex, rst are export only.
Expand Down Expand Up @@ -330,7 +343,9 @@ def test_auto_format_detect(self):
_tsv = '1\t2\t3\n4\t5\t6\n7\t8\t9\n'
self.assertEqual(tablib.detect_format(_tsv), 'tsv')

_bunk = '¡¡¡¡¡¡---///\n\n\n¡¡£™∞¢£§∞§¶•¶ª∞¶•ªº••ª–º§•†•§º¶•†¥ª–º•§ƒø¥¨©πƒø†ˆ¥ç©¨√øˆ¥≈†ƒ¥ç©ø¨çˆ¥ƒçø¶'
_bunk = StringIO(
'¡¡¡¡¡¡---///\n\n\n¡¡£™∞¢£§∞§¶•¶ª∞¶•ªº••ª–º§•†•§º¶•†¥ª–º•§ƒø¥¨©πƒø†ˆ¥ç©¨√øˆ¥≈†ƒ¥ç©ø¨çˆ¥ƒçø¶'
)
self.assertEqual(tablib.detect_format(_bunk), None)

def test_transpose(self):
Expand Down Expand Up @@ -692,12 +707,12 @@ class CSVTests(BaseTestCase):
def test_csv_format_detect(self):
"""Test CSV format detection."""

_csv = (
_csv = StringIO(
'1,2,3\n'
'4,5,6\n'
'7,8,9\n'
)
_bunk = (
_bunk = StringIO(
'¡¡¡¡¡¡¡¡£™∞¢£§∞§¶•¶ª∞¶•ªº••ª–º§•†•§º¶•†¥ª–º•§ƒø¥¨©πƒø†ˆ¥ç©¨√øˆ¥≈†ƒ¥ç©ø¨çˆ¥ƒçø¶'
)

Expand Down Expand Up @@ -915,12 +930,12 @@ def test_tsv_import_set(self):
def test_tsv_format_detect(self):
"""Test TSV format detection."""

_tsv = (
_tsv = StringIO(
'1\t2\t3\n'
'4\t5\t6\n'
'7\t8\t9\n'
)
_bunk = (
_bunk = StringIO(
'¡¡¡¡¡¡¡¡£™∞¢£§∞§¶•¶ª∞¶•ªº••ª–º§•†•§º¶•†¥ª–º•§ƒø¥¨©πƒø†ˆ¥ç©¨√øˆ¥≈†ƒ¥ç©ø¨çˆ¥ƒçø¶'
)

Expand Down Expand Up @@ -999,8 +1014,8 @@ class JSONTests(BaseTestCase):
def test_json_format_detect(self):
"""Test JSON format detection."""

_json = '[{"last_name": "Adams","age": 90,"first_name": "John"}]'
_bunk = (
_json = StringIO('[{"last_name": "Adams","age": 90,"first_name": "John"}]')
_bunk = StringIO(
'¡¡¡¡¡¡¡¡£™∞¢£§∞§¶•¶ª∞¶•ªº••ª–º§•†•§º¶•†¥ª–º•§ƒø¥¨©πƒø†ˆ¥ç©¨√øˆ¥≈†ƒ¥ç©ø¨çˆ¥ƒçø¶'
)

Expand Down Expand Up @@ -1251,6 +1266,7 @@ def test_dbf_format_detect(self):
_dbf += b' Jefferson' + (b' ' * 70)
_dbf += b' 50.0000000'
_dbf += b'\x1a'
_dbf = BytesIO(_dbf)

_yaml = '- {age: 90, first_name: John, last_name: Adams}'
_tsv = 'foo\tbar'
Expand Down

0 comments on commit 660990b

Please sign in to comment.