Skip to content

Commit

Permalink
ENH: handle zip file. pass test suite
Browse files Browse the repository at this point in the history
  • Loading branch information
wesm committed Oct 20, 2011
1 parent 4a218da commit 9472428
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 20 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ pandas/src/tseries.c
pandas/src/sparse.c
pandas/version.py
doc/source/generated
doc/source/_static
*flymake*
scikits
.coverage
5 changes: 5 additions & 0 deletions RELEASE.rst
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ feedback on the library.
- Add more helpful error message when importing pandas post-installation from
the source directory (GH #250)


**Bug fixes**

- Worked around matplotlib "bug" in which series[:, np.newaxis] fails. Should
Expand Down Expand Up @@ -165,6 +166,8 @@ feedback on the library.
- Passing column names should force `header=None` (GH #257)
- Don't modify passed column names when `index_col` is not
None (GH #258)
- Can sniff the CSV separator when reading from a zip file (this previously
failed because seek is not supported on such files)

Thanks
------
Expand Down Expand Up @@ -291,6 +294,8 @@ infrastructure are the main new additions
retrieve groups
- Added informative Exception when passing dict to DataFrame groupby
aggregation with axis != 0
- Significantly speed up DataFrame `__repr__` and `count` on large mixed-type
DataFrame objects

**API Changes**

Expand Down
35 changes: 17 additions & 18 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
Module contains tools for processing files into DataFrames or other objects
"""

from StringIO import StringIO

import numpy as np

from pandas.core.index import Index, MultiIndex
Expand Down Expand Up @@ -31,10 +33,12 @@ def read_csv(filepath_or_buffer, sep=None, header=0, index_col=None, names=None,
dia.delimiter = sep
# attempt to sniff the delimiter
if sniff_sep:
sample = f.readline()
sniffed = csv.Sniffer().sniff(sample)
line = f.readline()
sniffed = csv.Sniffer().sniff(line)
dia.delimiter = sniffed.delimiter
f.seek(0)
buf = list(csv.reader(StringIO(line), dialect=dia))
else:
buf = []

reader = csv.reader(f, dialect=dia)

Expand All @@ -46,7 +50,7 @@ def read_csv(filepath_or_buffer, sep=None, header=0, index_col=None, names=None,
parse_dates=parse_dates,
date_parser=date_parser,
skiprows=skiprows,
chunksize=chunksize)
chunksize=chunksize, buf=buf)

if nrows is not None:
return parser.get_chunk(nrows)
Expand Down Expand Up @@ -144,17 +148,18 @@ class TextParser(object):

def __init__(self, data, names=None, header=0, index_col=None,
na_values=None, parse_dates=False, date_parser=None,
chunksize=None, skiprows=None):
chunksize=None, skiprows=None, buf=None):
"""
Workhorse function for processing nested list into DataFrame
Should be replaced by np.genfromtxt eventually?
"""
self.data = data

self.buf = []
# can pass rows read so far
self.buf = [] if buf is None else buf
self.pos = len(self.buf)

self.pos = 0
self.names = list(names) if names is not None else names
self.header = header
self.index_col = index_col
Expand All @@ -179,7 +184,10 @@ def _infer_columns(self):
self.header = None

if self.header is not None:
line = self._next_line()
if len(self.buf) > 0:
line = self.buf[0]
else:
line = self._next_line()
while self.header > self.pos:
line = self._next_line()

Expand All @@ -196,17 +204,16 @@ def _infer_columns(self):
if cur_count > 0:
columns[i] = '%s.%d' % (col, cur_count)
counts[col] = cur_count + 1
self._clear_buffer()
else:
line = self._next_line()
self.buf.append(line)

ncols = len(line)
if not names:
columns = ['X.%d' % (i + 1) for i in range(ncols)]
else:
columns = names

self._clear_buffer()

return columns

Expand Down Expand Up @@ -435,16 +442,8 @@ def parse(self, sheetname, header=0, skiprows=None, index_col=None,
datemode = self.book.datemode
sheet = self.book.sheet_by_name(sheetname)

if skiprows is None:
skiprows = set()
else:
skiprows = set(skiprows)

data = []
for i in range(sheet.nrows):
if i in skiprows:
continue

row = []
for value, typ in zip(sheet.row_values(i), sheet.row_types(i)):
if typ == XL_CELL_DATE:
Expand Down
4 changes: 2 additions & 2 deletions pandas/io/tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,8 @@ def test_header_with_index_col(self):

self.assertEqual(names, ['A', 'B', 'C'])

data = [[1,2,3],[4,5,6],[7,8,9]]
expected = DataFrame(data, index=['foo','bar','baz'],
values = [[1,2,3],[4,5,6],[7,8,9]]
expected = DataFrame(values, index=['foo','bar','baz'],
columns=['A','B','C'])
assert_frame_equal(df, expected)

Expand Down

0 comments on commit 9472428

Please sign in to comment.