Skip to content

Commit

Permalink
Merge pull request #215 from kba/directory-src-dst
Browse files: browse the repository at this point in the history
Save source dir of METS for resolving relative files in workspace
  • Loading branch information
kba authored Nov 26, 2018
2 parents 8c9b734 + 1c84a4b commit 4c9f260
Show file tree
Hide file tree
Showing 5 changed files with 78 additions and 41 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ Versioned according to [Semantic Versioning](http://semver.org/).

## Unreleased

Fixed:

* Relative files in a workspace were resolved against the target directory instead of the source directory of the METS, #215

## [0.13.0] - 2018-11-23

Changed:
Expand Down
2 changes: 1 addition & 1 deletion ocrd/cli/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def workspace_cli(ctx, directory, mets_basename, backup):
@pass_workspace
@click.argument('mets_url')
def validate_workspace(ctx, mets_url=None):
report = WorkspaceValidator.validate_url(ctx.resolver, mets_url, directory=ctx.directory)
report = WorkspaceValidator.validate_url(ctx.resolver, mets_url, src_dir=ctx.directory)
print(report.to_xml())
if not report.is_valid:
sys.exit(128)
Expand Down
85 changes: 55 additions & 30 deletions ocrd/resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class Resolver(object):
Handle Uploads, Downloads, Repository access and manage temporary directories
"""

def download_to_directory(self, directory, url, basename=None, overwrite=False, subdir=None):
def download_to_directory(self, directory, url, basename=None, overwrite=False, subdir=None, src_dir=''):
"""
Download a file to the workspace.
Expand All @@ -32,6 +32,7 @@ def download_to_directory(self, directory, url, basename=None, overwrite=False,
url (string): URL to download from
overwrite (boolean): Whether to overwrite existing files with that name
subdir (string, None): Subdirectory to create within the directory. Think fileGrp.
src_dir (string, ''): Directory for resolving relative file names
Returns:
Local filename
Expand Down Expand Up @@ -59,11 +60,19 @@ def download_to_directory(self, directory, url, basename=None, overwrite=False,
if not isdir(outfiledir):
makedirs(outfiledir)

log.debug("Downloading <%s> to '%s'", url, outfilename)
if isfile(url):
log.debug("Downloading <%s> to '%s' (src_dir=%s)", url, outfilename, src_dir)

# de-scheme file:// URL
if url.startswith('file://'):
url = url[len('file://'):]

# Relativize against src_dir
if isfile(join(src_dir, url)):
url = join(src_dir, url)

# Copy files or download remote assets
if '://' not in url:
copyfile(url, outfilename)
elif url.startswith('file://'):
copyfile(url[len('file://'):], outfilename)
else:
response = requests.get(url)
if response.status_code != 200:
Expand All @@ -73,34 +82,50 @@ def download_to_directory(self, directory, url, basename=None, overwrite=False,

return outfilename

def workspace_from_url(self, mets_url, directory=None, clobber_mets=False, mets_basename=None, download=False, download_local=False):
def workspace_from_url(self, mets_url, src_dir=None, dst_dir=None, clobber_mets=False, mets_basename=None, download=False, download_local=False):
"""
Create a workspace from a METS by URL.
Sets the mets.xml file
Arguments:
mets_url (string): Source mets URL
src_dir (string, None): Source directory containing the mets.xml
dst_dir (string, None): Target directory for the workspace
clobber_mets (boolean, False): Whether to overwrite existing mets.xml. By default existing mets.xml will raise an exception.
download (boolean, False): Whether to download all the files
download_local (boolean, False): Whether to download the file://-URL to the new location
Returns:
Workspace
"""
if directory is not None and not directory.startswith('/'):
directory = abspath(directory)
if src_dir is not None and not src_dir.startswith('/'):
src_dir = abspath(src_dir)
if dst_dir is not None and not dst_dir.startswith('/'):
dst_dir = abspath(dst_dir)

if mets_url is None:
if directory is None:
raise Exception("Must pass mets_url and/or directory to workspace_from_url")
if src_dir is None:
raise Exception("Must pass mets_url and/or src_dir to workspace_from_url")
else:
mets_url = 'file://%s/%s' % (directory, mets_basename)
if mets_url.find('://') == -1:
# resolve to absolute
mets_url = abspath(mets_url)
mets_url = 'file://' + mets_url
if directory is None:
# if mets_url is a file-url assume working directory to be where
# the mets.xml resides
mets_url = 'file://%s/%s' % (src_dir, mets_basename)

# resolve to absolute
if '://' not in mets_url:
mets_url = 'file://%s' % abspath(mets_url)

if dst_dir is None:
# if mets_url is a file-url assume working directory is source directory
if mets_url.startswith('file://'):
# if directory was not given and mets_url is a file assume that
# directory should be the directory where the mets.xml resides
directory = dirname(mets_url[len('file://'):])
# if dst_dir was not given and mets_url is a file assume that
# dst_dir should be the directory where the mets.xml resides
dst_dir = dirname(mets_url[len('file://'):])
else:
directory = tempfile.mkdtemp(prefix=TMP_PREFIX)
log.debug("Creating workspace '%s' for METS @ <%s>", directory, mets_url)
dst_dir = tempfile.mkdtemp(prefix=TMP_PREFIX)
log.debug("Creating workspace '%s' for METS @ <%s>", dst_dir, mets_url)

if src_dir is None:
src_dir = dirname(mets_url[len('file://'):])

# if mets_basename is not given, use the last URL segment of the mets_url
if mets_basename is None:
Expand All @@ -109,17 +134,17 @@ def workspace_from_url(self, mets_url, directory=None, clobber_mets=False, mets_
.split('?')[0] \
.split('#')[0]

mets_fpath = join(directory, mets_basename)
log.debug("Copying mets url '%s' to '%s'", mets_url, mets_fpath)
if 'file://' + mets_fpath == mets_url:
dst_mets = join(dst_dir, mets_basename)
log.debug("Copying mets url '%s' to '%s'", mets_url, dst_mets)
if 'file://' + dst_mets == mets_url:
log.debug("Target and source mets are identical")
else:
if exists(mets_fpath) and not clobber_mets:
raise Exception("File '%s' already exists but clobber_mets is false" % mets_fpath)
if exists(dst_mets) and not clobber_mets:
raise Exception("File '%s' already exists but clobber_mets is false" % dst_mets)
else:
self.download_to_directory(directory, mets_url, basename=mets_basename)
self.download_to_directory(dst_dir, mets_url, basename=mets_basename)

workspace = Workspace(self, directory, mets_basename=mets_basename)
workspace = Workspace(self, dst_dir, mets_basename=mets_basename, src_dir=src_dir)

if download_local or download:
for file_grp in workspace.mets.file_groups:
Expand Down
21 changes: 13 additions & 8 deletions ocrd/validator/workspace_validator.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import re

from ocrd.constants import FILE_GROUP_CATEGORIES, FILE_GROUP_PREFIX
from .report import ValidationReport
from ocrd.utils import getLogger
from .report import ValidationReport

log = getLogger('ocrd.workspace_validator')

Expand All @@ -19,25 +19,30 @@ class WorkspaceValidator(object):
mets_url (string) : URL of the METS file
"""

def __init__(self, resolver, mets_url, directory=None):
def __init__(self, resolver, mets_url, src_dir=None):
self.resolver = resolver
self.mets_url = mets_url
self.report = ValidationReport()
log.debug('resolver=%s mets_url=%s directory=%s', resolver, mets_url, directory)
if mets_url is None:
mets_url = '%s/mets.xml' % directory
self.workspace = self.resolver.workspace_from_url(mets_url, directory=directory)
log.debug('resolver=%s mets_url=%s src_dir=%s', resolver, mets_url, src_dir)
if mets_url is None and src_dir is not None:
mets_url = '%s/mets.xml' % src_dir
self.workspace = self.resolver.workspace_from_url(mets_url, src_dir=src_dir)
self.mets = self.workspace.mets

@staticmethod
def validate_url(resolver, mets_url, directory=None):
def validate_url(resolver, mets_url, src_dir=None):
"""
Validates the workspace of a METS URL against the specs
Arguments:
resolver (:class:`ocrd.Resolver`): Resolver
mets_url (string): URL of the METS file
src_dir (string, None): Directory containing mets file
Returns:
report (:class:`ValidationReport`) Report on the validity
"""
validator = WorkspaceValidator(resolver, mets_url, directory=directory)
validator = WorkspaceValidator(resolver, mets_url, src_dir=src_dir)
return validator.validate()

def validate(self):
Expand Down
7 changes: 5 additions & 2 deletions ocrd/workspace.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
from os.path import dirname
import sys
import shutil

Expand All @@ -23,16 +24,18 @@ class Workspace(object):
directory (string) : Folder to work in
mets (:class:`OcrdMets`) : OcrdMets representing this workspace. Loaded from 'mets.xml' if ``None``.
mets_basename (string) : Basename of the METS XML file. Default: Last URL segment of the mets_url.
src_dir (string) : Directory containing the source mets.xml, to resolve relative file URL.
"""

def __init__(self, resolver, directory, mets=None, mets_basename='mets.xml', automatic_backup=False):
def __init__(self, resolver, directory, mets=None, mets_basename='mets.xml', automatic_backup=False, src_dir=''):
self.resolver = resolver
self.directory = directory
self.mets_target = os.path.join(directory, mets_basename)
if mets is None:
mets = OcrdMets(filename=self.mets_target)
self.mets = mets
self.automatic_backup = automatic_backup
self.src_dir = src_dir
# print(mets.to_xml(xmllint=True).decode('utf-8'))
self.image_cache = {
'pil': {},
Expand Down Expand Up @@ -65,7 +68,7 @@ def download_url(self, url, **kwargs):
The local filename of the downloaded file
"""
os.chdir(self.directory)
return self.resolver.download_to_directory(self.directory, url, **kwargs)
return self.resolver.download_to_directory(self.directory, url, src_dir=self.src_dir, **kwargs)

def download_file(self, f, **kwargs):
"""
Expand Down

0 comments on commit 4c9f260

Please sign in to comment.