From d5bdfdd17342aa17f9bb5ade9a57110c6a83f918 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 26 Nov 2018 16:40:47 +0100 Subject: [PATCH 1/3] resolving url to workspaces, distinguish src and dst directory --- ocrd/resolver.py | 63 ++++++++++++++++----------- ocrd/validator/workspace_validator.py | 21 +++++---- 2 files changed, 51 insertions(+), 33 deletions(-) diff --git a/ocrd/resolver.py b/ocrd/resolver.py index 8104e9bce..2545c204a 100644 --- a/ocrd/resolver.py +++ b/ocrd/resolver.py @@ -73,34 +73,47 @@ def download_to_directory(self, directory, url, basename=None, overwrite=False, return outfilename - def workspace_from_url(self, mets_url, directory=None, clobber_mets=False, mets_basename=None, download=False, download_local=False): + def workspace_from_url(self, mets_url, src_dir=None, dst_dir=None, clobber_mets=False, mets_basename=None, download=False, download_local=False): """ Create a workspace from a METS by URL. Sets the mets.xml file + + Arguments: + mets_url (string): Source mets URL + src_dir (string, None): Source directory containing the mets.xml + dst_dir (string, None): Target directory for the workspace + clobber_mets (boolean, False): Whether to overwrite existing mets.xml. By default existing mets.xml will raise an exception. + download (boolean, False): Whether to download all the files + download_local (boolean, False): Whether to download the file://-URL to the new location + + Returns: + Workspace """ - if directory is not None and not directory.startswith('/'): - directory = abspath(directory) + if src_dir is not None and not src_dir.startswith('/'): + src_dir = abspath(src_dir) + if dst_dir is not None and not dst_dir.startswith('/'): + dst_dir = abspath(dst_dir) if mets_url is None: - if directory is None: - raise Exception("Must pass mets_url and/or directory to workspace_from_url") + if src_dir is None: + raise Exception("Must pass mets_url and/or src_dir to workspace_from_url") else: - mets_url = 'file://%s/%s' % (directory, mets_basename) - if mets_url.find('://') == -1: - # resolve to absolute - mets_url = abspath(mets_url) - mets_url = 'file://' + mets_url - if directory is None: - # if mets_url is a file-url assume working directory to be where - # the mets.xml resides + mets_url = 'file://%s/%s' % (src_dir, mets_basename) + + # resolve to absolute + if '://' not in mets_url: + mets_url = 'file://%s' % abspath(mets_url) + + if dst_dir is None: + # if mets_url is a file-url assume working directory is source directory if mets_url.startswith('file://'): - # if directory was not given and mets_url is a file assume that - # directory should be the directory where the mets.xml resides - directory = dirname(mets_url[len('file://'):]) + # if dst_dir was not given and mets_url is a file assume that + # dst_dir should be the directory where the mets.xml resides + dst_dir = dirname(mets_url[len('file://'):]) else: - directory = tempfile.mkdtemp(prefix=TMP_PREFIX) - log.debug("Creating workspace '%s' for METS @ <%s>", directory, mets_url) + dst_dir = tempfile.mkdtemp(prefix=TMP_PREFIX) + log.debug("Creating workspace '%s' for METS @ <%s>", dst_dir, mets_url) # if mets_basename is not given, use the last URL segment of the mets_url if mets_basename is None: @@ -109,17 +122,17 @@ def workspace_from_url(self, mets_url, directory=None, clobber_mets=False, mets_ .split('?')[0] \ .split('#')[0] - mets_fpath = join(directory, mets_basename) - log.debug("Copying mets url '%s' to '%s'", mets_url, mets_fpath) - if 'file://' + mets_fpath == mets_url: + dst_mets = join(dst_dir, mets_basename) + log.debug("Copying mets url '%s' to '%s'", mets_url, dst_mets) + if 'file://' + dst_mets == mets_url: log.debug("Target and source mets are identical") else: - if exists(mets_fpath) and not clobber_mets: - raise Exception("File '%s' already exists but clobber_mets is false" % mets_fpath) + if exists(dst_mets) and not clobber_mets: + raise Exception("File '%s' already exists but clobber_mets is false" % dst_mets) else: - self.download_to_directory(directory, mets_url, basename=mets_basename) + self.download_to_directory(dst_dir, mets_url, basename=mets_basename) - workspace = Workspace(self, directory, mets_basename=mets_basename) + workspace = Workspace(self, dst_dir, mets_basename=mets_basename) if download_local or download: for file_grp in workspace.mets.file_groups: diff --git a/ocrd/validator/workspace_validator.py b/ocrd/validator/workspace_validator.py index b92890dbd..57aec2667 100644 --- a/ocrd/validator/workspace_validator.py +++ b/ocrd/validator/workspace_validator.py @@ -1,8 +1,8 @@ import re from ocrd.constants import FILE_GROUP_CATEGORIES, FILE_GROUP_PREFIX -from .report import ValidationReport from ocrd.utils import getLogger +from .report import ValidationReport log = getLogger('ocrd.workspace_validator') @@ -19,25 +19,30 @@ class WorkspaceValidator(object): mets_url (string) : URL of the METS file """ - def __init__(self, resolver, mets_url, directory=None): + def __init__(self, resolver, mets_url, src_dir=None): self.resolver = resolver self.mets_url = mets_url self.report = ValidationReport() - log.debug('resolver=%s mets_url=%s directory=%s', resolver, mets_url, directory) - if mets_url is None: - mets_url = '%s/mets.xml' % directory - self.workspace = self.resolver.workspace_from_url(mets_url, directory=directory) + log.debug('resolver=%s mets_url=%s src_dir=%s', resolver, mets_url, src_dir) + if mets_url is None and src_dir is not None: + mets_url = '%s/mets.xml' % src_dir + self.workspace = self.resolver.workspace_from_url(mets_url, src_dir=src_dir) self.mets = self.workspace.mets @staticmethod - def validate_url(resolver, mets_url, directory=None): + def validate_url(resolver, mets_url, src_dir=None): """ Validates the workspace of a METS URL against the specs + Arguments: + resolver (:class:`ocrd.Resolver`): Resolver + mets_url (string): URL of the METS file + src_dir (string, None): Directory containing mets file + Returns: report (:class:`ValidationReport`) Report on the validity """ - validator = WorkspaceValidator(resolver, mets_url, directory=directory) + validator = WorkspaceValidator(resolver, mets_url, src_dir=src_dir) return validator.validate() def validate(self): From 81eb7c13a924fe008da45ccdb87a77957b229ed6 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 26 Nov 2018 17:31:47 +0100 Subject: [PATCH 2/3] when creating workspace from mets, save the src env to pass on for resolving relative files --- CHANGELOG.md | 4 ++++ ocrd/resolver.py | 24 ++++++++++++++++++------ ocrd/workspace.py | 7 +++++-- 3 files changed, 27 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8ebd4e073..8f1a535aa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,10 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +Fixed: + + * Relative files in workspace were resolved to target, not source dir of METS, #215 + ## [0.13.0] - 2018-11-23 Changed: diff --git a/ocrd/resolver.py b/ocrd/resolver.py index 2545c204a..4e6ffa382 100644 --- a/ocrd/resolver.py +++ b/ocrd/resolver.py @@ -17,7 +17,7 @@ class Resolver(object): Handle Uploads, Downloads, Repository access and manage temporary directories """ - def download_to_directory(self, directory, url, basename=None, overwrite=False, subdir=None): + def download_to_directory(self, directory, url, basename=None, overwrite=False, subdir=None, src_dir=''): """ Download a file to the workspace. @@ -32,6 +32,7 @@ def download_to_directory(self, directory, url, basename=None, overwrite=False, url (string): URL to download from overwrite (boolean): Whether to overwrite existing files with that name subdir (string, None): Subdirectory to create within the directory. Think fileGrp. + src_dir (string, ''): Directory for resolving relative file names Returns: Local filename @@ -59,11 +60,19 @@ def download_to_directory(self, directory, url, basename=None, overwrite=False, if not isdir(outfiledir): makedirs(outfiledir) - log.debug("Downloading <%s> to '%s'", url, outfilename) - if isfile(url): + log.debug("Downloading <%s> to '%s' (src_dir=%s)", url, outfilename, src_dir) + + # de-scheme file:// URL + if url.startswith('file://'): + url = url[len('file://'):] + + # Relativize against src_dir + if isfile(join(src_dir, url)): + url = join(src_dir, url) + + # Copy files or download remote assets + if '://' not in url: copyfile(url, outfilename) - elif url.startswith('file://'): - copyfile(url[len('file://'):], outfilename) else: response = requests.get(url) if response.status_code != 200: @@ -115,6 +124,9 @@ def workspace_from_url(self, mets_url, src_dir=None, dst_dir=None, clobber_mets= dst_dir = tempfile.mkdtemp(prefix=TMP_PREFIX) log.debug("Creating workspace '%s' for METS @ <%s>", dst_dir, mets_url) + if src_dir is None: + src_dir = dirname(mets_url[len('file://'):]) + # if mets_basename is not given, use the last URL segment of the mets_url if mets_basename is None: mets_basename = mets_url \ @@ -132,7 +144,7 @@ def workspace_from_url(self, mets_url, src_dir=None, dst_dir=None, clobber_mets= else: self.download_to_directory(dst_dir, mets_url, basename=mets_basename) - workspace = Workspace(self, dst_dir, mets_basename=mets_basename) + workspace = Workspace(self, dst_dir, mets_basename=mets_basename, src_dir=src_dir) if download_local or download: for file_grp in workspace.mets.file_groups: diff --git a/ocrd/workspace.py b/ocrd/workspace.py index faf8617a2..f3104153f 100644 --- a/ocrd/workspace.py +++ b/ocrd/workspace.py @@ -1,4 +1,5 @@ import os +from os.path import dirname import sys import shutil @@ -23,9 +24,10 @@ class Workspace(object): directory (string) : Folder to work in mets (:class:`OcrdMets`) : OcrdMets representing this workspace. Loaded from 'mets.xml' if ``None``. mets_basename (string) : Basename of the METS XML file. Default: Last URL segment of the mets_url. + src_dir (string) : Directory containing the source mets.xml, to resolve relative file URL. """ - def __init__(self, resolver, directory, mets=None, mets_basename='mets.xml', automatic_backup=False): + def __init__(self, resolver, directory, mets=None, mets_basename='mets.xml', automatic_backup=False, src_dir=''): self.resolver = resolver self.directory = directory self.mets_target = os.path.join(directory, mets_basename) @@ -33,6 +35,7 @@ def __init__(self, resolver, directory, mets=None, mets_basename='mets.xml', aut mets = OcrdMets(filename=self.mets_target) self.mets = mets self.automatic_backup = automatic_backup + self.src_dir = src_dir # print(mets.to_xml(xmllint=True).decode('utf-8')) self.image_cache = { 'pil': {}, @@ -65,7 +68,7 @@ def download_url(self, url, **kwargs): The local filename of the downloaded file """ os.chdir(self.directory) - return self.resolver.download_to_directory(self.directory, url, **kwargs) + return self.resolver.download_to_directory(self.directory, url, src_dir=self.src_dir, **kwargs) def download_file(self, f, **kwargs): """ From 1c84a4beed8439cc884f7eda2761045284443b41 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 26 Nov 2018 18:16:48 +0100 Subject: [PATCH 3/3] fix validate cli --- ocrd/cli/workspace.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd/cli/workspace.py b/ocrd/cli/workspace.py index ac94384eb..53dd76157 100644 --- a/ocrd/cli/workspace.py +++ b/ocrd/cli/workspace.py @@ -45,7 +45,7 @@ def workspace_cli(ctx, directory, mets_basename, backup): @pass_workspace @click.argument('mets_url') def validate_workspace(ctx, mets_url=None): - report = WorkspaceValidator.validate_url(ctx.resolver, mets_url, directory=ctx.directory) + report = WorkspaceValidator.validate_url(ctx.resolver, mets_url, src_dir=ctx.directory) print(report.to_xml()) if not report.is_valid: sys.exit(128)