From beb80d11ec0ddaf00a97f8a38ec9eae68e07c28e Mon Sep 17 00:00:00 2001 From: Barney Gale Date: Tue, 9 Jan 2024 23:52:15 +0000 Subject: [PATCH] GH-113528: Deoptimise `pathlib._abc.PurePathBase` (#113559) Apply pathlib's normalization and performance tuning in `pathlib.PurePath`, but not `pathlib._abc.PurePathBase`. With this change, the pathlib ABCs do not normalize away alternate path separators, empty segments, or dot segments. A single string given to the initialiser will round-trip by default, i.e. `str(PurePathBase(my_string)) == my_string`. Implementors can set their own path domain-specific normalization scheme by overriding `__str__()` Eliminating path normalization makes maintaining and caching the path's parts and string representation both optional and not very useful, so this commit moves the `_drv`, `_root`, `_tail_cached` and `_str` slots from `PurePathBase` to `PurePath`. Only `_raw_paths` and `_resolving` slots remain in `PurePathBase`. This frees the ABCs from the burden of some of pathlib's hardest-to-understand code. --- Lib/pathlib/__init__.py | 104 +++++++++++++++- Lib/pathlib/_abc.py | 142 +++++----------------- Lib/test/test_pathlib/test_pathlib.py | 41 +++++++ Lib/test/test_pathlib/test_pathlib_abc.py | 48 ++++---- 4 files changed, 195 insertions(+), 140 deletions(-) diff --git a/Lib/pathlib/__init__.py b/Lib/pathlib/__init__.py index ccdd9c3d547c8e..9d3fcd894164e5 100644 --- a/Lib/pathlib/__init__.py +++ b/Lib/pathlib/__init__.py @@ -76,6 +76,20 @@ class PurePath(_abc.PurePathBase): """ __slots__ = ( + # The `_drv`, `_root` and `_tail_cached` slots store parsed and + # normalized parts of the path. They are set when any of the `drive`, + # `root` or `_tail` properties are accessed for the first time. The + # three-part division corresponds to the result of + # `os.path.splitroot()`, except that the tail is further split on path + # separators (i.e. it is a list of strings), and that the root and + # tail are normalized. + '_drv', '_root', '_tail_cached', + + # The `_str` slot stores the string representation of the path, + # computed from the drive, root and tail when `__str__()` is called + # for the first time. It's used to implement `_str_normcase` + '_str', + # The `_str_normcase_cached` slot stores the string path with # normalized case. It is set when the `_str_normcase` property is # accessed for the first time. It's used to implement `__eq__()` @@ -196,6 +210,94 @@ def __ge__(self, other): return NotImplemented return self._parts_normcase >= other._parts_normcase + def __str__(self): + """Return the string representation of the path, suitable for + passing to system calls.""" + try: + return self._str + except AttributeError: + self._str = self._format_parsed_parts(self.drive, self.root, + self._tail) or '.' + return self._str + + @classmethod + def _format_parsed_parts(cls, drv, root, tail): + if drv or root: + return drv + root + cls.pathmod.sep.join(tail) + elif tail and cls.pathmod.splitdrive(tail[0])[0]: + tail = ['.'] + tail + return cls.pathmod.sep.join(tail) + + def _from_parsed_parts(self, drv, root, tail): + path_str = self._format_parsed_parts(drv, root, tail) + path = self.with_segments(path_str) + path._str = path_str or '.' + path._drv = drv + path._root = root + path._tail_cached = tail + return path + + @classmethod + def _parse_path(cls, path): + if not path: + return '', '', [] + sep = cls.pathmod.sep + altsep = cls.pathmod.altsep + if altsep: + path = path.replace(altsep, sep) + drv, root, rel = cls.pathmod.splitroot(path) + if not root and drv.startswith(sep) and not drv.endswith(sep): + drv_parts = drv.split(sep) + if len(drv_parts) == 4 and drv_parts[2] not in '?.': + # e.g. //server/share + root = sep + elif len(drv_parts) == 6: + # e.g. //?/unc/server/share + root = sep + parsed = [sys.intern(str(x)) for x in rel.split(sep) if x and x != '.'] + return drv, root, parsed + + def _load_parts(self): + paths = self._raw_paths + if len(paths) == 0: + path = '' + elif len(paths) == 1: + path = paths[0] + else: + path = self.pathmod.join(*paths) + self._drv, self._root, self._tail_cached = self._parse_path(path) + + @property + def drive(self): + """The drive prefix (letter or UNC path), if any.""" + try: + return self._drv + except AttributeError: + self._load_parts() + return self._drv + + @property + def root(self): + """The root of the path, if any.""" + try: + return self._root + except AttributeError: + self._load_parts() + return self._root + + @property + def _tail(self): + try: + return self._tail_cached + except AttributeError: + self._load_parts() + return self._tail_cached + + @property + def anchor(self): + """The concatenation of the drive and root, or ''.""" + return self.drive + self.root + @property def parts(self): """An object providing sequence-like access to the @@ -416,7 +518,7 @@ def iterdir(self): def _scandir(self): return os.scandir(self) - def _make_child_entry(self, entry): + def _make_child_entry(self, entry, is_dir=False): # Transform an entry yielded from _scandir() into a path object. path_str = entry.name if str(self) == '.' else entry.path path = self.with_segments(path_str) diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py index 5caad3c0502399..6a1928495c9775 100644 --- a/Lib/pathlib/_abc.py +++ b/Lib/pathlib/_abc.py @@ -1,7 +1,6 @@ import functools import ntpath import posixpath -import sys from errno import ENOENT, ENOTDIR, EBADF, ELOOP, EINVAL from stat import S_ISDIR, S_ISLNK, S_ISREG, S_ISSOCK, S_ISBLK, S_ISCHR, S_ISFIFO @@ -82,7 +81,7 @@ def _select_children(parent_paths, dir_only, follow_symlinks, match): except OSError: continue if match(entry.name): - yield parent_path._make_child_entry(entry) + yield parent_path._make_child_entry(entry, dir_only) def _select_recursive(parent_paths, dir_only, follow_symlinks): @@ -105,7 +104,7 @@ def _select_recursive(parent_paths, dir_only, follow_symlinks): for entry in entries: try: if entry.is_dir(follow_symlinks=follow_symlinks): - paths.append(path._make_child_entry(entry)) + paths.append(path._make_child_entry(entry, dir_only)) continue except OSError: pass @@ -147,20 +146,6 @@ class PurePathBase: # in the `__init__()` method. '_raw_paths', - # The `_drv`, `_root` and `_tail_cached` slots store parsed and - # normalized parts of the path. They are set when any of the `drive`, - # `root` or `_tail` properties are accessed for the first time. The - # three-part division corresponds to the result of - # `os.path.splitroot()`, except that the tail is further split on path - # separators (i.e. it is a list of strings), and that the root and - # tail are normalized. - '_drv', '_root', '_tail_cached', - - # The `_str` slot stores the string representation of the path, - # computed from the drive, root and tail when `__str__()` is called - # for the first time. It's used to implement `_str_normcase` - '_str', - # The '_resolving' slot stores a boolean indicating whether the path # is being processed by `PathBase.resolve()`. This prevents duplicate # work from occurring when `resolve()` calls `stat()` or `readlink()`. @@ -179,65 +164,16 @@ def with_segments(self, *pathsegments): """ return type(self)(*pathsegments) - @classmethod - def _parse_path(cls, path): - if not path: - return '', '', [] - sep = cls.pathmod.sep - altsep = cls.pathmod.altsep - if altsep: - path = path.replace(altsep, sep) - drv, root, rel = cls.pathmod.splitroot(path) - if not root and drv.startswith(sep) and not drv.endswith(sep): - drv_parts = drv.split(sep) - if len(drv_parts) == 4 and drv_parts[2] not in '?.': - # e.g. //server/share - root = sep - elif len(drv_parts) == 6: - # e.g. //?/unc/server/share - root = sep - parsed = [sys.intern(str(x)) for x in rel.split(sep) if x and x != '.'] - return drv, root, parsed - - def _load_parts(self): - paths = self._raw_paths - if len(paths) == 0: - path = '' - elif len(paths) == 1: - path = paths[0] - else: - path = self.pathmod.join(*paths) - drv, root, tail = self._parse_path(path) - self._drv = drv - self._root = root - self._tail_cached = tail - - def _from_parsed_parts(self, drv, root, tail): - path_str = self._format_parsed_parts(drv, root, tail) - path = self.with_segments(path_str) - path._str = path_str or '.' - path._drv = drv - path._root = root - path._tail_cached = tail - return path - - @classmethod - def _format_parsed_parts(cls, drv, root, tail): - if drv or root: - return drv + root + cls.pathmod.sep.join(tail) - elif tail and cls.pathmod.splitdrive(tail[0])[0]: - tail = ['.'] + tail - return cls.pathmod.sep.join(tail) - def __str__(self): """Return the string representation of the path, suitable for passing to system calls.""" - try: - return self._str - except AttributeError: - self._str = self._format_parsed_parts(self.drive, self.root, - self._tail) or '.' - return self._str + paths = self._raw_paths + if len(paths) == 1: + return paths[0] + elif paths: + return self.pathmod.join(*paths) + else: + return '' def as_posix(self): """Return the string representation of the path with forward (/) @@ -247,42 +183,23 @@ def as_posix(self): @property def drive(self): """The drive prefix (letter or UNC path), if any.""" - try: - return self._drv - except AttributeError: - self._load_parts() - return self._drv + return self.pathmod.splitdrive(str(self))[0] @property def root(self): """The root of the path, if any.""" - try: - return self._root - except AttributeError: - self._load_parts() - return self._root - - @property - def _tail(self): - try: - return self._tail_cached - except AttributeError: - self._load_parts() - return self._tail_cached + return self.pathmod.splitroot(str(self))[1] @property def anchor(self): """The concatenation of the drive and root, or ''.""" - anchor = self.drive + self.root - return anchor + drive, root, _ = self.pathmod.splitroot(str(self)) + return drive + root @property def name(self): """The final path component, if any.""" - path_str = str(self) - if not path_str or path_str == '.': - return '' - return self.pathmod.basename(path_str) + return self.pathmod.basename(str(self)) @property def suffix(self): @@ -323,13 +240,10 @@ def stem(self): def with_name(self, name): """Return a new path with the file name changed.""" - m = self.pathmod - if not name or m.sep in name or (m.altsep and m.altsep in name) or name == '.': + dirname = self.pathmod.dirname + if dirname(name): raise ValueError(f"Invalid name {name!r}") - parent, old_name = m.split(str(self)) - if not old_name or old_name == '.': - raise ValueError(f"{self!r} has an empty name") - return self.with_segments(parent, name) + return self.with_segments(dirname(str(self)), name) def with_stem(self, stem): """Return a new path with the stem changed.""" @@ -480,7 +394,7 @@ def is_absolute(self): def is_reserved(self): """Return True if the path contains one of the special names reserved by the system, if any.""" - if self.pathmod is posixpath or not self._tail: + if self.pathmod is posixpath or not self.name: return False # NOTE: the rules for reserved names seem somewhat complicated @@ -490,7 +404,7 @@ def is_reserved(self): if self.drive.startswith('\\\\'): # UNC paths are never reserved. return False - name = self._tail[-1].partition('.')[0].partition(':')[0].rstrip(' ') + name = self.name.partition('.')[0].partition(':')[0].rstrip(' ') return name.upper() in _WIN_RESERVED_NAMES def match(self, path_pattern, *, case_sensitive=None): @@ -503,9 +417,9 @@ def match(self, path_pattern, *, case_sensitive=None): case_sensitive = _is_case_sensitive(self.pathmod) sep = path_pattern.pathmod.sep pattern_str = str(path_pattern) - if path_pattern.drive or path_pattern.root: + if path_pattern.anchor: pass - elif path_pattern._tail: + elif path_pattern.parts: pattern_str = f'**{sep}{pattern_str}' else: raise ValueError("empty pattern") @@ -780,8 +694,10 @@ def _scandir(self): from contextlib import nullcontext return nullcontext(self.iterdir()) - def _make_child_entry(self, entry): + def _make_child_entry(self, entry, is_dir=False): # Transform an entry yielded from _scandir() into a path object. + if is_dir: + return entry.joinpath('') return entry def _make_child_relpath(self, name): @@ -792,13 +708,13 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None): kind, including directories) matching the given relative pattern. """ path_pattern = self.with_segments(pattern) - if path_pattern.drive or path_pattern.root: + if path_pattern.anchor: raise NotImplementedError("Non-relative patterns are unsupported") - elif not path_pattern._tail: + elif not path_pattern.parts: raise ValueError("Unacceptable pattern: {!r}".format(pattern)) - pattern_parts = path_pattern._tail.copy() - if pattern[-1] in (self.pathmod.sep, self.pathmod.altsep): + pattern_parts = list(path_pattern.parts) + if not self.pathmod.basename(pattern): # GH-65238: pathlib doesn't preserve trailing slash. Add it back. pattern_parts.append('') @@ -816,7 +732,7 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None): filter_paths = follow_symlinks is not None and '..' not in pattern_parts deduplicate_paths = False sep = self.pathmod.sep - paths = iter([self] if self.is_dir() else []) + paths = iter([self.joinpath('')] if self.is_dir() else []) part_idx = 0 while part_idx < len(pattern_parts): part = pattern_parts[part_idx] diff --git a/Lib/test/test_pathlib/test_pathlib.py b/Lib/test/test_pathlib/test_pathlib.py index 6e42122212632d..04e6280509ecfc 100644 --- a/Lib/test/test_pathlib/test_pathlib.py +++ b/Lib/test/test_pathlib/test_pathlib.py @@ -140,6 +140,8 @@ def test_empty_path(self): # The empty path points to '.' p = self.cls('') self.assertEqual(str(p), '.') + # Special case for the empty path. + self._check_str('.', ('',)) def test_parts_interning(self): P = self.cls @@ -300,6 +302,40 @@ def test_repr_roundtrips(self): self.assertEqual(q, p) self.assertEqual(repr(q), r) + def test_name_empty(self): + P = self.cls + self.assertEqual(P('').name, '') + self.assertEqual(P('.').name, '') + self.assertEqual(P('/a/b/.').name, 'b') + + def test_stem_empty(self): + P = self.cls + self.assertEqual(P('').stem, '') + self.assertEqual(P('.').stem, '') + + def test_with_name_empty(self): + P = self.cls + self.assertRaises(ValueError, P('').with_name, 'd.xml') + self.assertRaises(ValueError, P('.').with_name, 'd.xml') + self.assertRaises(ValueError, P('/').with_name, 'd.xml') + self.assertRaises(ValueError, P('a/b').with_name, '') + self.assertRaises(ValueError, P('a/b').with_name, '.') + + def test_with_stem_empty(self): + P = self.cls + self.assertRaises(ValueError, P('').with_stem, 'd') + self.assertRaises(ValueError, P('.').with_stem, 'd') + self.assertRaises(ValueError, P('/').with_stem, 'd') + self.assertRaises(ValueError, P('a/b').with_stem, '') + self.assertRaises(ValueError, P('a/b').with_stem, '.') + + def test_with_suffix_empty(self): + # Path doesn't have a "filename" component. + P = self.cls + self.assertRaises(ValueError, P('').with_suffix, '.gz') + self.assertRaises(ValueError, P('.').with_suffix, '.gz') + self.assertRaises(ValueError, P('/').with_suffix, '.gz') + def test_relative_to_several_args(self): P = self.cls p = P('a/b') @@ -313,6 +349,11 @@ def test_is_relative_to_several_args(self): with self.assertWarns(DeprecationWarning): p.is_relative_to('a', 'b') + def test_match_empty(self): + P = self.cls + self.assertRaises(ValueError, P('a').match, '') + self.assertRaises(ValueError, P('a').match, '.') + class PurePosixPathTest(PurePathTest): cls = pathlib.PurePosixPath diff --git a/Lib/test/test_pathlib/test_pathlib_abc.py b/Lib/test/test_pathlib/test_pathlib_abc.py index b088be87141729..7c8a0f4687f00c 100644 --- a/Lib/test/test_pathlib/test_pathlib_abc.py +++ b/Lib/test/test_pathlib/test_pathlib_abc.py @@ -153,8 +153,6 @@ def test_str_common(self): # Canonicalized paths roundtrip. for pathstr in ('a', 'a/b', 'a/b/c', '/', '/a/b', '/a/b/c'): self._check_str(pathstr, (pathstr,)) - # Special case for the empty path. - self._check_str('.', ('',)) # Other tests for str() are in test_equivalences(). def test_as_posix_common(self): @@ -166,7 +164,6 @@ def test_as_posix_common(self): def test_match_empty(self): P = self.cls self.assertRaises(ValueError, P('a').match, '') - self.assertRaises(ValueError, P('a').match, '.') def test_match_common(self): P = self.cls @@ -206,7 +203,6 @@ def test_match_common(self): self.assertTrue(P('a/b/c.py').match('**')) self.assertTrue(P('/a/b/c.py').match('**')) self.assertTrue(P('/a/b/c.py').match('/**')) - self.assertTrue(P('/a/b/c.py').match('**/')) self.assertTrue(P('/a/b/c.py').match('/a/**')) self.assertTrue(P('/a/b/c.py').match('**/*.py')) self.assertTrue(P('/a/b/c.py').match('/**/*.py')) @@ -270,17 +266,17 @@ def test_parents_common(self): self.assertEqual(len(par), 3) self.assertEqual(par[0], P('a/b')) self.assertEqual(par[1], P('a')) - self.assertEqual(par[2], P('.')) - self.assertEqual(par[-1], P('.')) + self.assertEqual(par[2], P('')) + self.assertEqual(par[-1], P('')) self.assertEqual(par[-2], P('a')) self.assertEqual(par[-3], P('a/b')) self.assertEqual(par[0:1], (P('a/b'),)) self.assertEqual(par[:2], (P('a/b'), P('a'))) self.assertEqual(par[:-1], (P('a/b'), P('a'))) - self.assertEqual(par[1:], (P('a'), P('.'))) - self.assertEqual(par[::2], (P('a/b'), P('.'))) - self.assertEqual(par[::-1], (P('.'), P('a'), P('a/b'))) - self.assertEqual(list(par), [P('a/b'), P('a'), P('.')]) + self.assertEqual(par[1:], (P('a'), P(''))) + self.assertEqual(par[::2], (P('a/b'), P(''))) + self.assertEqual(par[::-1], (P(''), P('a'), P('a/b'))) + self.assertEqual(list(par), [P('a/b'), P('a'), P('')]) with self.assertRaises(IndexError): par[-4] with self.assertRaises(IndexError): @@ -334,8 +330,8 @@ def test_anchor_common(self): def test_name_empty(self): P = self.cls self.assertEqual(P('').name, '') - self.assertEqual(P('.').name, '') - self.assertEqual(P('/a/b/.').name, 'b') + self.assertEqual(P('.').name, '.') + self.assertEqual(P('/a/b/.').name, '.') def test_name_common(self): P = self.cls @@ -387,7 +383,7 @@ def test_suffixes_common(self): def test_stem_empty(self): P = self.cls self.assertEqual(P('').stem, '') - self.assertEqual(P('.').stem, '') + self.assertEqual(P('.').stem, '.') def test_stem_common(self): P = self.cls @@ -412,11 +408,11 @@ def test_with_name_common(self): def test_with_name_empty(self): P = self.cls - self.assertRaises(ValueError, P('').with_name, 'd.xml') - self.assertRaises(ValueError, P('.').with_name, 'd.xml') - self.assertRaises(ValueError, P('/').with_name, 'd.xml') - self.assertRaises(ValueError, P('a/b').with_name, '') - self.assertRaises(ValueError, P('a/b').with_name, '.') + self.assertEqual(P('').with_name('d.xml'), P('d.xml')) + self.assertEqual(P('.').with_name('d.xml'), P('d.xml')) + self.assertEqual(P('/').with_name('d.xml'), P('/d.xml')) + self.assertEqual(P('a/b').with_name(''), P('a/')) + self.assertEqual(P('a/b').with_name('.'), P('a/.')) def test_with_name_seps(self): P = self.cls @@ -436,11 +432,11 @@ def test_with_stem_common(self): def test_with_stem_empty(self): P = self.cls - self.assertRaises(ValueError, P('').with_stem, 'd') - self.assertRaises(ValueError, P('.').with_stem, 'd') - self.assertRaises(ValueError, P('/').with_stem, 'd') - self.assertRaises(ValueError, P('a/b').with_stem, '') - self.assertRaises(ValueError, P('a/b').with_stem, '.') + self.assertEqual(P('').with_stem('d'), P('d')) + self.assertEqual(P('.').with_stem('d'), P('d')) + self.assertEqual(P('/').with_stem('d'), P('/d')) + self.assertEqual(P('a/b').with_stem(''), P('a/')) + self.assertEqual(P('a/b').with_stem('.'), P('a/.')) def test_with_stem_seps(self): P = self.cls @@ -461,9 +457,9 @@ def test_with_suffix_common(self): def test_with_suffix_empty(self): P = self.cls # Path doesn't have a "filename" component. - self.assertRaises(ValueError, P('').with_suffix, '.gz') - self.assertRaises(ValueError, P('.').with_suffix, '.gz') - self.assertRaises(ValueError, P('/').with_suffix, '.gz') + self.assertEqual(P('').with_suffix('.gz'), P('.gz')) + self.assertEqual(P('.').with_suffix('.gz'), P('..gz')) + self.assertEqual(P('/').with_suffix('.gz'), P('/.gz')) def test_with_suffix_seps(self): P = self.cls