Skip to content

Commit

Permalink
win32 paths cannot be turned into URLs by prefixing them with "file://"
Browse files Browse the repository at this point in the history
  • Loading branch information
mindw committed Oct 28, 2013
1 parent 1695320 commit d20961a
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 10 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,4 @@ pandas/io/*.json
.pydevproject
.settings
.idea
*.pdb
2 changes: 2 additions & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -755,6 +755,8 @@ Bug Fixes
- Bug when renaming then set_index on a DataFrame (:issue:`5344`)
- Test suite no longer leaves around temporary files when testing graphics. (:issue:`5347`)
(thanks for catching this @yarikoptic!)
- Fixed html tests on win32. (:issue:`4580`)


pandas 0.12.0
-------------
Expand Down
23 changes: 19 additions & 4 deletions pandas/io/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,18 @@


if compat.PY3:
from urllib.request import urlopen
from urllib.request import urlopen, pathname2url
_urlopen = urlopen
from urllib.parse import urlparse as parse_url
import urllib.parse as compat_parse
from urllib.parse import uses_relative, uses_netloc, uses_params, urlencode
from urllib.parse import uses_relative, uses_netloc, uses_params, urlencode, urljoin
from urllib.error import URLError
from http.client import HTTPException
else:
from urllib2 import urlopen as _urlopen
from urllib import urlencode
from urllib import urlencode, pathname2url
from urlparse import urlparse as parse_url
from urlparse import uses_relative, uses_netloc, uses_params
from urlparse import uses_relative, uses_netloc, uses_params, urljoin
from urllib2 import URLError
from httplib import HTTPException
from contextlib import contextmanager, closing
Expand Down Expand Up @@ -134,6 +134,21 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
return filepath_or_buffer, None


def file_path_to_url(path):
    """
    Convert an absolute path in native format into a FILE URL.

    Parameters
    ----------
    path : a path in native format

    Returns
    -------
    a valid FILE URL
    """
    # pathname2url rewrites the native path using URL path syntax;
    # joining the result onto the bare 'file:' scheme yields the full URL.
    url_path = pathname2url(path)
    return urljoin('file:', url_path)


# ZipFile is not a context manager for <= 2.6
# must be tuple index here since 2.6 doesn't use namedtuple for version_info
if sys.version_info[1] <= 6:
Expand Down
12 changes: 6 additions & 6 deletions pandas/io/tests/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from pandas import (DataFrame, MultiIndex, read_csv, Timestamp, Index,
date_range, Series)
from pandas.compat import map, zip, StringIO, string_types
from pandas.io.common import URLError, urlopen
from pandas.io.common import URLError, urlopen, file_path_to_url
from pandas.io.html import read_html

import pandas.util.testing as tm
Expand Down Expand Up @@ -311,7 +311,7 @@ def test_invalid_url(self):
@slow
def test_file_url(self):
url = self.banklist_data
dfs = self.read_html('file://' + url, 'First', attrs={'id': 'table'})
dfs = self.read_html(file_path_to_url(url), 'First', attrs={'id': 'table'})
tm.assert_isinstance(dfs, list)
for df in dfs:
tm.assert_isinstance(df, DataFrame)
Expand Down Expand Up @@ -362,7 +362,7 @@ def test_multiindex_header_index_skiprows(self):
@slow
def test_regex_idempotency(self):
url = self.banklist_data
dfs = self.read_html('file://' + url,
dfs = self.read_html(file_path_to_url(url),
match=re.compile(re.compile('Florida')),
attrs={'id': 'table'})
tm.assert_isinstance(dfs, list)
Expand Down Expand Up @@ -637,9 +637,9 @@ def test_invalid_flavor():
flavor='not a* valid**++ flaver')


def get_elements_from_url(url, element='table', base_url="file://"):
def get_elements_from_file(url, element='table'):
_skip_if_none_of(('bs4', 'html5lib'))
url = "".join([base_url, url])
url = file_path_to_url(url)
from bs4 import BeautifulSoup
with urlopen(url) as f:
soup = BeautifulSoup(f, features='html5lib')
Expand All @@ -651,7 +651,7 @@ def test_bs4_finds_tables():
filepath = os.path.join(DATA_PATH, "spam.html")
with warnings.catch_warnings():
warnings.filterwarnings('ignore')
assert get_elements_from_url(filepath, 'table')
assert get_elements_from_file(filepath, 'table')


def get_lxml_elements(url, element):
Expand Down

0 comments on commit d20961a

Please sign in to comment.