diff --git a/.pylintrc b/.pylintrc
index b2125d824..2e3af4288 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -1,19 +1,22 @@
[MASTER]
-extension-pkg-whitelist=lxml
-ignored-modules=cv2,tesserocr,ocrd.model
+extension-pkg-whitelist=lxml,pydantic
+ignored-modules=cv2,tesserocr,ocrd_models.ocrd_page_generateds
+ignore-paths=ocrd_page_generateds.py
+ignore-patterns=.*generateds.*
[MESSAGES CONTROL]
-ignore-patterns='.*generateds.*'
disable =
fixme,
- E501,
+ line-too-long,
+ consider-using-f-string,
+ logging-fstring-interpolation,
trailing-whitespace,
logging-not-lazy,
inconsistent-return-statements,
+ disallowed-name,
invalid-name,
line-too-long,
missing-docstring,
- no-self-use,
wrong-import-order,
too-many-nested-blocks,
superfluous-parens,
@@ -25,13 +28,9 @@ disable =
ungrouped-imports,
useless-object-inheritance,
useless-import-alias,
- bad-continuation,
no-else-return,
logging-not-lazy
-[FORMAT]
-no-space-check=empty-line
-
[DESIGN]
# Maximum number of arguments for function / method
max-args=12
@@ -40,7 +39,7 @@ max-locals=30
# Maximum number of return / yield for function / method body
max-returns=12
# Maximum number of branch for function / method body
-max-branchs=30
+max-branches=30
# Maximum number of statements in function / method body
max-statements=60
# Maximum number of parents for a class (see R0901).
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 351f5a56a..d058ebce9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,36 @@ Versioned according to [Semantic Versioning](http://semver.org/).
## Unreleased
+## [2.69.0] - 2024-09-30
+
+Fixed:
+ - tests: ensure `ocrd_utils.config` gets reset whenever changing it globally
+ - `ocrd.cli.workspace`: consistently pass on `--mets-server-url` and `--backup`
+ - `ocrd.cli.workspace`: make `list-page` work w/ METS Server
+ - `ocrd.cli.validate "tasks"`: pass on `--mets-server-url`
+ - `lib.bash`: fix `errexit` handling
+ - actually apply CLI `--log-filename`, and show in `--help`
+ - adapt to Pillow changes
+ - `ocrd workspace clone`: do pass on `--file-grp` (for download filtering)
+ - `OcrdMetsServer.add_file`: pass on `force` kwarg
+ - `Workspace.reload_mets`: handle ClientSideOcrdMets as well
+ - `OcrdMets.get_physical_pages`: cover `return_divs` w/o `for_fileIds` and `for_pageIds`
+ - `disableLogging`: also re-instate root logger to Python defaults
+
+Changed:
+ - `run_processor`: be robust if `ocrd_tool` is missing `steps`
+ - `PcGtsType.PageType.id` via `make_xml_id`: replace `/` with `_`
+ - `ClientSideOcrdMets`: use same logger name prefix as METS Server
+ - `Processor.zip_input_files`: when `--page-id` yields empty list, just log instead of raise
+
+Added:
+ - `OcrdPage`: new `PageType.get_ReadingOrderGroups()` to retrieve recursive RO as dict
+ - METS Server: export and delegate `physical_pages`
+ - `ocrd.cli.workspace "server"`: add subcommands `reload` and `save`
+ - processor CLI: delegate `--resolve-resource`, too
+ - `OcrdConfig.reset_defaults` to reset config variables to their defaults
+ - `ocrd_utils.scale_coordinates` for resizing images
+
## [2.68.0] - 2024-08-23
Changed:
@@ -2164,6 +2194,7 @@ Fixed
Initial Release
+[2.69.0]: ../../compare/v2.69.0..v2.68.0
[2.68.0]: ../../compare/v2.68.0..v2.67.2
[2.67.2]: ../../compare/v2.67.2..v2.67.1
[2.67.1]: ../../compare/v2.67.1..v2.67.0
diff --git a/Makefile b/Makefile
index 4997066d1..b5cd2f276 100644
--- a/Makefile
+++ b/Makefile
@@ -273,7 +273,7 @@ test-logging: assets
cp src/ocrd_utils/ocrd_logging.conf $$tempdir; \
cd $$tempdir; \
$(PYTHON) -m pytest --continue-on-collection-errors -k TestLogging -k TestDecorators $(TESTDIR); \
- rm -r $$tempdir/ocrd_logging.conf $$tempdir/.benchmarks; \
+ rm -r $$tempdir/ocrd_logging.conf $$tempdir/ocrd.log $$tempdir/.benchmarks; \
rm -rf $$tempdir/.coverage; \
rmdir $$tempdir
diff --git a/src/ocrd/cli/ocrd_tool.py b/src/ocrd/cli/ocrd_tool.py
index 2a7fa99ec..3c024ec66 100644
--- a/src/ocrd/cli/ocrd_tool.py
+++ b/src/ocrd/cli/ocrd_tool.py
@@ -29,6 +29,8 @@ def __init__(self, filename):
self.filename = filename
with codecs.open(filename, encoding='utf-8') as f:
self.content = f.read()
+ # TODO: perhaps the validator should _always_ run here (for default expansion),
+ # leaving the validate command only for producing the report?
self.json = loads(self.content)
pass_ocrd_tool = click.make_pass_decorator(OcrdToolCtx)
diff --git a/src/ocrd/cli/validate.py b/src/ocrd/cli/validate.py
index b26803d05..9d0cafd06 100644
--- a/src/ocrd/cli/validate.py
+++ b/src/ocrd/cli/validate.py
@@ -102,16 +102,19 @@ def validate_page(page, **kwargs):
@validate_cli.command('tasks')
@click.option('--workspace', nargs=1, required=False, help='Workspace directory these tasks are to be run. If omitted, only validate syntax')
@click.option('-M', '--mets-basename', nargs=1, default=DEFAULT_METS_BASENAME, help='Basename of the METS file, used in conjunction with --workspace')
+@click.option('-U', '--mets-server-url', help='TCP host URI or UDS path of METS server')
@click.option('--overwrite', is_flag=True, default=False, help='When checking against a concrete workspace, simulate overwriting output or page range.')
@click.option('-g', '--page-id', help="ID(s) of the pages to process")
@click.argument('tasks', nargs=-1, required=True)
-def validate_process(tasks, workspace, mets_basename, overwrite, page_id):
+def validate_process(tasks, workspace, mets_basename, mets_server_url, overwrite, page_id):
'''
Validate a sequence of tasks passable to 'ocrd process'
'''
if workspace:
- _inform_of_result(validate_tasks([ProcessorTask.parse(t) for t in tasks],
- Workspace(Resolver(), directory=workspace, mets_basename=mets_basename), page_id=page_id, overwrite=overwrite))
+ _inform_of_result(validate_tasks(
+ [ProcessorTask.parse(t) for t in tasks],
+ Workspace(Resolver(), directory=workspace, mets_basename=mets_basename, mets_server_url=mets_server_url),
+ page_id=page_id, overwrite=overwrite))
else:
for t in [ProcessorTask.parse(t) for t in tasks]:
_inform_of_result(t.validate())
diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py
index 0c70fd3a3..f66a1e336 100644
--- a/src/ocrd/cli/workspace.py
+++ b/src/ocrd/cli/workspace.py
@@ -37,6 +37,17 @@ def __init__(self, directory, mets_url, mets_basename=DEFAULT_METS_BASENAME, met
= self.resolver.resolve_mets_arguments(directory, mets_url, mets_basename, mets_server_url)
self.automatic_backup = automatic_backup
+ def workspace(self):
+ """
+ Instantiate a new Workspace from this context.
+
+ Passes on the context's resolver, directory, METS basename,
+ automatic backup setting and METS Server URL. A new Workspace
+ object is constructed on every call.
+ """
+ return Workspace(
+ self.resolver,
+ directory=self.directory,
+ mets_basename=self.mets_basename,
+ automatic_backup=self.automatic_backup,
+ mets_server_url=self.mets_server_url,
+ )
+ def backup_manager(self):
+ """Return a WorkspaceBackupManager wrapping a newly built ``self.workspace()``."""
+ return WorkspaceBackupManager(self.workspace())
+
pass_workspace = click.make_pass_decorator(WorkspaceCtx)
@@ -118,7 +129,7 @@ def workspace_validate(ctx, mets_url, download, skip, page_textequiv_consistency
@workspace_cli.command('clone', cls=command_with_replaced_help(
(r' \[WORKSPACE_DIR\]', ''))) # XXX deprecated argument
@click.option('-f', '--clobber-mets', help="Overwrite existing METS file", default=False, is_flag=True)
-@click.option('-a', '--download', is_flag=True, help="Download all files and change location in METS file after cloning")
+@click.option('-a', '--download', is_flag=True, help="Download all selected files and add local path references in METS file afterwards")
@click.argument('mets_url')
@mets_find_options
# XXX deprecated
@@ -129,20 +140,25 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim
Create a workspace from METS_URL and return the directory
METS_URL can be a URL, an absolute path or a path relative to $PWD.
- If METS_URL is not provided, use --mets accordingly.
METS_URL can also be an OAI-PMH GetRecord URL wrapping a METS file.
+
+ Additional options pertain to the selection of files / fileGrps / pages
+ to be downloaded, if --download is used.
"""
LOG = getLogger('ocrd.cli.workspace.clone')
if workspace_dir:
LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR clone' instead of argument 'WORKSPACE_DIR' ('%s')" % workspace_dir))
ctx.directory = workspace_dir
+ assert not ctx.mets_server_url, \
+ f"clone cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
workspace = ctx.resolver.workspace_from_url(
mets_url,
dst_dir=ctx.directory,
mets_basename=ctx.mets_basename,
clobber_mets=clobber_mets,
download=download,
+ fileGrp=file_grp,
ID=file_id,
pageId=page_id,
mimetype=mimetype,
@@ -171,10 +187,12 @@ def workspace_init(ctx, clobber_mets, directory):
if directory:
LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR init' instead of argument 'DIRECTORY' ('%s')" % directory))
ctx.directory = directory
+ assert not ctx.mets_server_url, \
+ f"init cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
workspace = ctx.resolver.workspace_from_nothing(
directory=ctx.directory,
mets_basename=ctx.mets_basename,
- clobber_mets=clobber_mets
+ clobber_mets=clobber_mets,
)
workspace.save_mets()
print(workspace.directory)
@@ -198,13 +216,7 @@ def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_
Add a file or http(s) URL FNAME to METS in a workspace.
If FNAME is not an http(s) URL and is not a workspace-local existing file, try to copy to workspace.
"""
- workspace = Workspace(
- ctx.resolver,
- directory=ctx.directory,
- mets_basename=ctx.mets_basename,
- automatic_backup=ctx.automatic_backup,
- mets_server_url=ctx.mets_server_url,
- )
+ workspace = ctx.workspace()
log = getLogger('ocrd.cli.workspace.add')
if not mimetype:
@@ -310,13 +322,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_fi
-G '{{ filegrp }}' -g '{{ pageid }}' -i '{{ fileid }}' -S '{{ local_filename }}' -
"""
log = getLogger('ocrd.cli.workspace.bulk-add') # pylint: disable=redefined-outer-name
- workspace = Workspace(
- ctx.resolver,
- directory=ctx.directory,
- mets_basename=ctx.mets_basename,
- automatic_backup=ctx.automatic_backup,
- mets_server_url=ctx.mets_server_url,
- )
+ workspace = ctx.workspace()
try:
pat = re.compile(regex)
@@ -407,7 +413,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_fi
if dry_run:
log.info('workspace.add_file(%s)' % file_dict)
else:
- workspace.add_file(fileGrp, ignore=ignore, force=force, **file_dict)
+ workspace.add_file(fileGrp, ignore=ignore, force=force, **file_dict) # pylint: disable=redundant-keyword-arg
# save changes to disk
workspace.save_mets()
@@ -451,13 +457,8 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, incl
snake_to_camel = {"file_id": "ID", "page_id": "pageId", "file_grp": "fileGrp"}
output_field = [snake_to_camel.get(x, x) for x in output_field]
modified_mets = False
- ret = list()
- workspace = Workspace(
- ctx.resolver,
- directory=ctx.directory,
- mets_basename=ctx.mets_basename,
- mets_server_url=ctx.mets_server_url,
- )
+ ret = []
+ workspace = ctx.workspace()
with pushd_popd(workspace.directory):
for f in workspace.find_files(
file_id=file_id,
@@ -507,7 +508,9 @@ def workspace_remove_file(ctx, id, force, keep_file): # pylint: disable=redefin
(If any ``ID`` starts with ``//``, then its remainder
will be interpreted as a regular expression.)
"""
- workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
+ assert not ctx.mets_server_url, \
+ f"remove cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
+ workspace = ctx.workspace()
for i in id:
workspace.remove_file(i, force=force, keep_file=keep_file)
workspace.save_mets()
@@ -525,7 +528,9 @@ def rename_group(ctx, old, new):
"""
- Rename fileGrp (USE attribute ``NEW`` to ``OLD``).
+ Rename fileGrp (USE attribute ``OLD`` to ``NEW``).
"""
- workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
+ assert not ctx.mets_server_url, \
+ f"rename-group cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
+ workspace = ctx.workspace()
workspace.rename_file_group(old, new)
workspace.save_mets()
@@ -546,7 +551,9 @@ def remove_group(ctx, group, recursive, force, keep_files):
(If any ``GROUP`` starts with ``//``, then its remainder
will be interpreted as a regular expression.)
"""
- workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
+ assert not ctx.mets_server_url, \
+ f"remove-group cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
+ workspace = ctx.workspace()
for g in group:
workspace.remove_file_group(g, recursive=recursive, force=force, keep_files=keep_files)
workspace.save_mets()
@@ -568,7 +575,9 @@ def prune_files(ctx, file_grp, mimetype, page_id, file_id):
(If any ``FILTER`` starts with ``//``, then its remainder
will be interpreted as a regular expression.)
"""
- workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
+ assert not ctx.mets_server_url, \
+ f"prune-files cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
+ workspace = ctx.workspace()
with pushd_popd(workspace.directory):
for f in workspace.find_files(
file_id=file_id,
@@ -605,8 +614,7 @@ def clean(ctx, dry_run, directories, path_glob):
If no PATH_GLOB are specified, then all files and directories
may match.
"""
- log = getLogger('ocrd.cli.workspace.clean')
- workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
+ workspace = ctx.workspace()
allowed_files = [normpath(f.local_filename) for f in workspace.find_files(local_only=True)]
allowed_files.append(relpath(workspace.mets_target, start=workspace.directory))
allowed_dirs = set(dirname(path) for path in allowed_files)
@@ -624,7 +632,7 @@ def clean(ctx, dry_run, directories, path_glob):
if normpath(path) in allowed_files:
continue
if dry_run:
- log.info('unlink(%s)' % path)
+ ctx.log.info('unlink(%s)' % path)
else:
unlink(path)
if not directories:
@@ -634,7 +642,7 @@ def clean(ctx, dry_run, directories, path_glob):
if normpath(path) in allowed_dirs:
continue
if dry_run:
- log.info('rmdir(%s)' % path)
+ ctx.log.info('rmdir(%s)' % path)
else:
rmdir(path)
@@ -648,7 +656,7 @@ def list_groups(ctx):
"""
List fileGrp USE attributes
"""
- workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
+ workspace = ctx.workspace()
print("\n".join(workspace.mets.file_groups))
# ----------------------------------------------------------------------
@@ -674,20 +682,16 @@ def list_pages(ctx, output_field, output_format, chunk_number, chunk_index, page
(If any ``FILTER`` starts with ``//``, then its remainder
will be interpreted as a regular expression.)
"""
- workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
- find_kwargs = {}
- if page_id_range and 'ID' in output_field:
- find_kwargs['pageId'] = page_id_range
- page_ids = sorted({x.pageId for x in workspace.mets.find_files(**find_kwargs) if x.pageId})
+ workspace = ctx.workspace()
ret = []
-
- if output_field == ['ID']:
- ret = [[x] for x in page_ids]
- else:
- for i, page_div in enumerate(workspace.mets.get_physical_pages(for_pageIds=','.join(page_ids), return_divs=True)):
+ if page_id_range or list(output_field) != ['ID']:
+ for i, page_div in enumerate(workspace.mets.get_physical_pages(for_pageIds=page_id_range, return_divs=True)):
ret.append([])
for k in output_field:
ret[i].append(page_div.get(k, 'None'))
+ else:
+ for page_id in workspace.mets.physical_pages:
+ ret.append([page_id])
if numeric_range:
start, end = map(int, numeric_range.split('..'))
@@ -721,7 +725,7 @@ def get_id(ctx):
"""
Get METS id if any
"""
- workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
+ workspace = ctx.workspace()
ID = workspace.mets.unique_identifier
if ID:
print(ID)
@@ -741,7 +745,7 @@ def set_id(ctx, id): # pylint: disable=redefined-builtin
Otherwise will create a new {{ ID }}.
"""
- workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
+ workspace = ctx.workspace()
workspace.mets.unique_identifier = id
workspace.save_mets()
@@ -764,7 +768,9 @@ def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id):
if contentids:
update_kwargs['CONTENTIDS'] = contentids
try:
- workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
+ assert not ctx.mets_server_url, \
+ f"update-page cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
+ workspace = ctx.workspace()
workspace.mets.update_physical_page_attributes(page_id, **update_kwargs)
workspace.save_mets()
except Exception as err:
@@ -802,7 +808,9 @@ def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pa
mets_path = Path(mets_path)
if filegrp_mapping:
filegrp_mapping = loads(filegrp_mapping)
- workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)
+ assert not ctx.mets_server_url, \
+ f"merge cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}"
+ workspace = ctx.workspace()
other_workspace = Workspace(ctx.resolver, directory=str(mets_path.parent), mets_basename=str(mets_path.name))
workspace.merge(
other_workspace,
@@ -826,11 +834,12 @@ def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pa
# ----------------------------------------------------------------------
@workspace_cli.group('backup')
-@click.pass_context
+@pass_workspace
def workspace_backup_cli(ctx): # pylint: disable=unused-argument
"""
Backing and restoring workspaces - dev edition
"""
+ assert not ctx.mets_server_url, "Workspace backups currently not interoperable with METS Server"
@workspace_backup_cli.command('add')
@pass_workspace
@@ -838,7 +847,7 @@ def workspace_backup_add(ctx):
"""
Create a new backup
"""
- backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup))
+ backup_manager = ctx.backup_manager()
backup_manager.add()
@workspace_backup_cli.command('list')
@@ -847,7 +856,7 @@ def workspace_backup_list(ctx):
"""
List backups
"""
- backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup))
+ backup_manager = ctx.backup_manager()
for b in backup_manager.list():
print(b)
@@ -859,7 +868,7 @@ def workspace_backup_restore(ctx, choose_first, bak):
"""
Restore backup BAK
"""
- backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup))
+ backup_manager = ctx.backup_manager()
backup_manager.restore(bak, choose_first)
@workspace_backup_cli.command('undo')
@@ -868,7 +877,7 @@ def workspace_backup_undo(ctx):
"""
Restore the last backup
"""
- backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup))
+ backup_manager = ctx.backup_manager()
backup_manager.undo()
@@ -885,15 +894,24 @@ def workspace_serve_cli(ctx): # pylint: disable=unused-argument
@workspace_serve_cli.command('stop')
@pass_workspace
def workspace_serve_stop(ctx): # pylint: disable=unused-argument
- """Stop the METS server"""
- workspace = Workspace(
- ctx.resolver,
- directory=ctx.directory,
- mets_basename=ctx.mets_basename,
- mets_server_url=ctx.mets_server_url,
- )
+ """Stop the METS server (saving changes to disk)"""
+ workspace = ctx.workspace()
workspace.mets.stop()
+@workspace_serve_cli.command('reload')
+@pass_workspace
+def workspace_serve_reload(ctx): # pylint: disable=unused-argument
+ """Reload the METS server from disk"""
+ workspace = ctx.workspace()
+ workspace.mets.reload()
+
+@workspace_serve_cli.command('save')
+@pass_workspace
+def workspace_serve_save(ctx): # pylint: disable=unused-argument
+ """Save the METS changes to disk"""
+ workspace = ctx.workspace()
+ workspace.mets.save()
+
@workspace_serve_cli.command('start')
@pass_workspace
def workspace_serve_start(ctx): # pylint: disable=unused-argument
diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py
index 580a75b0c..464bb67ed 100644
--- a/src/ocrd/decorators/__init__.py
+++ b/src/ocrd/decorators/__init__.py
@@ -1,4 +1,5 @@
import sys
+from contextlib import nullcontext
from ocrd_utils import (
config,
@@ -9,6 +10,7 @@
parse_json_string_with_comments,
set_json_key_value_overrides,
parse_json_string_or_file,
+ redirect_stderr_and_stdout_to_file,
)
from ocrd_validators import WorkspaceValidator
from ocrd_network import ProcessingWorker, ProcessorServer, AgentType
@@ -36,6 +38,7 @@ def ocrd_cli_wrap_processor(
profile_file=None,
version=False,
overwrite=False,
+ resolve_resource=None,
show_resource=None,
list_resources=False,
# ocrd_network params start #
@@ -50,7 +53,7 @@ def ocrd_cli_wrap_processor(
if not sys.argv[1:]:
processorClass(None, show_help=True)
sys.exit(1)
- if dump_json or dump_module_dir or help or version or show_resource or list_resources:
+ if dump_json or dump_module_dir or help or version or resolve_resource or show_resource or list_resources:
processorClass(
None,
dump_json=dump_json,
@@ -58,6 +61,7 @@ def ocrd_cli_wrap_processor(
show_help=help,
subcommand=subcommand,
show_version=version,
+ resolve_resource=resolve_resource,
show_resource=show_resource,
list_resources=list_resources
)
@@ -139,7 +143,7 @@ def resolve(name):
print("Profiling...")
pr = cProfile.Profile()
pr.enable()
- def exit():
+ def goexit():
pr.disable()
print("Profiling completed")
if profile_file:
@@ -148,8 +152,13 @@ def exit():
s = io.StringIO()
pstats.Stats(pr, stream=s).sort_stats("cumulative").print_stats()
print(s.getvalue())
- atexit.register(exit)
- run_processor(processorClass, mets_url=mets, workspace=workspace, **kwargs)
+ atexit.register(goexit)
+ if log_filename:
+ log_ctx = redirect_stderr_and_stdout_to_file(log_filename)
+ else:
+ log_ctx = nullcontext()
+ with log_ctx:
+ run_processor(processorClass, mets_url=mets, workspace=workspace, **kwargs)
def check_and_run_network_agent(ProcessorClass, subcommand: str, address: str, database: str, queue: str):
diff --git a/src/ocrd/decorators/ocrd_cli_options.py b/src/ocrd/decorators/ocrd_cli_options.py
index e640a2003..9c87034ab 100644
--- a/src/ocrd/decorators/ocrd_cli_options.py
+++ b/src/ocrd/decorators/ocrd_cli_options.py
@@ -41,6 +41,7 @@ def cli(mets_url):
option('--address', type=ServerAddressParamType()),
option('--queue', type=QueueServerParamType()),
option('--database', type=DatabaseParamType()),
+ option('-R', '--resolve-resource'),
option('-C', '--show-resource'),
option('-L', '--list-resources', is_flag=True, default=False),
option('-J', '--dump-json', is_flag=True, default=False),
diff --git a/src/ocrd/decorators/parameter_option.py b/src/ocrd/decorators/parameter_option.py
index 0fbe3e057..55abbc2a5 100644
--- a/src/ocrd/decorators/parameter_option.py
+++ b/src/ocrd/decorators/parameter_option.py
@@ -1,10 +1,10 @@
from click import option
-#from ocrd_utils import parse_json_string_or_file
__all__ = ['parameter_option', 'parameter_override_option']
def _handle_param_option(ctx, param, value):
+ from ocrd_utils import parse_json_string_or_file
return parse_json_string_or_file(*list(value))
parameter_option = option('-p', '--parameter',
diff --git a/src/ocrd/lib.bash b/src/ocrd/lib.bash
index 1e3ecfc6e..745bc52fe 100644
--- a/src/ocrd/lib.bash
+++ b/src/ocrd/lib.bash
@@ -27,6 +27,7 @@ ocrd__log () {
## Ensure minimum version
# ht https://stackoverflow.com/posts/4025065
ocrd__minversion () {
+ set -e
local minversion="$1"
local version=$(ocrd --version|sed 's/ocrd, version //')
#echo "$minversion < $version?"
@@ -108,6 +109,7 @@ ocrd__usage () {
## declare -A ocrd__argv=()
## ```
ocrd__parse_argv () {
+ set -e
# if [[ -n "$ZSH_VERSION" ]];then
# print -r -- ${+ocrd__argv} ${(t)ocrd__argv}
@@ -141,6 +143,7 @@ ocrd__parse_argv () {
while [[ "${1:-}" = -* ]];do
case "$1" in
-l|--log-level) ocrd__argv[log_level]=$2 ; shift ;;
+ --log-filename) exec 2> "$2" ; shift ;;
-h|--help|--usage) ocrd__usage; exit ;;
-J|--dump-json) ocrd__dumpjson; exit ;;
-D|--dump-module-dir) echo $(dirname "$OCRD_TOOL_JSON"); exit ;;
@@ -249,6 +252,7 @@ $params_parsed"
}
ocrd__wrap () {
+ set -e
declare -gx OCRD_TOOL_JSON="$1"
declare -gx OCRD_TOOL_NAME="$2"
diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py
index 0d4c0a078..c85368e30 100644
--- a/src/ocrd/mets_server.py
+++ b/src/ocrd/mets_server.py
@@ -88,6 +88,14 @@ def create(file_groups: List[str]):
return OcrdFileGroupListModel(file_groups=file_groups)
+# Model listing physical pages (field `physical_pages`, a list of strings);
+# used as response_model of the METS Server's GET /physical_pages endpoint
+class OcrdPageListModel(BaseModel):
+ physical_pages: List[str] = Field()
+
+ @staticmethod
+ def create(physical_pages: List[str]):
+ # convenience constructor: passes the list on as the `physical_pages` field
+ return OcrdPageListModel(physical_pages=physical_pages)
+
+
class OcrdAgentListModel(BaseModel):
agents: List[OcrdAgentModel] = Field()
@@ -120,7 +128,7 @@ class ClientSideOcrdMets:
def __init__(self, url, workspace_path: Optional[str] = None):
self.protocol = "tcp" if url.startswith("http://") else "uds"
- self.log = getLogger(f"ocrd.mets_client[{url}]")
+ self.log = getLogger(f"ocrd.models.ocrd_mets.client.{url}")
self.url = url if self.protocol == "tcp" else f'http+unix://{url.replace("/", "%2F")}'
self.ws_dir_path = workspace_path if workspace_path else None
@@ -210,6 +218,17 @@ def workspace_path(self):
).json()["text"]
return self.ws_dir_path
+ @property
+ def physical_pages(self) -> List[str]:
+ """
+ Physical pages as reported by the METS Server.
+
+ Sends GET ``/physical_pages`` directly, or - in multiplexing mode - POSTs an
+ ``MpxReq.physical_pages`` request for ``self.ws_dir_path`` to ``self.url``;
+ either way returns the ``physical_pages`` entry of the JSON response.
+ """
+ if not self.multiplexing_mode:
+ return self.session.request("GET", f"{self.url}/physical_pages").json()["physical_pages"]
+ else:
+ return self.session.request(
+ "POST",
+ self.url,
+ json=MpxReq.physical_pages(self.ws_dir_path)
+ ).json()["physical_pages"]
+
@property
def file_groups(self):
if not self.multiplexing_mode:
@@ -236,7 +255,7 @@ def agents(self):
agent_dict["_type"] = agent_dict.pop("type")
return [ClientSideOcrdAgent(None, **agent_dict) for agent_dict in agent_dicts]
- def add_agent(self, *args, **kwargs):
+ def add_agent(self, **kwargs):
if not self.multiplexing_mode:
return self.session.request("POST", f"{self.url}/agent", json=OcrdAgentModel.create(**kwargs).dict())
else:
@@ -247,11 +266,9 @@ def add_agent(self, *args, **kwargs):
).json()
return OcrdAgentModel.create(**kwargs)
- @deprecated_alias(ID="file_id")
- @deprecated_alias(pageId="page_id")
- @deprecated_alias(fileGrp="file_grp")
def find_files(self, **kwargs):
self.log.debug("find_files(%s)", kwargs)
+ # translate from native OcrdMets kwargs to OcrdMetsServer REST params
if "pageId" in kwargs:
kwargs["page_id"] = kwargs.pop("pageId")
if "ID" in kwargs:
@@ -277,28 +294,31 @@ def find_files(self, **kwargs):
def find_all_files(self, *args, **kwargs):
return list(self.find_files(*args, **kwargs))
- @deprecated_alias(pageId="page_id")
- @deprecated_alias(ID="file_id")
def add_file(
- self, file_grp, content=None, file_id=None, url=None, local_filename=None, mimetype=None, page_id=None, **kwargs
+ self, file_grp, content=None, ID=None, url=None, local_filename=None, mimetype=None, pageId=None, **kwargs
):
data = OcrdFileModel.create(
- file_id=file_id, file_grp=file_grp, page_id=page_id, mimetype=mimetype, url=url,
- local_filename=local_filename
+ file_grp=file_grp,
+ # translate from native OcrdMets kwargs to OcrdMetsServer REST params
+ file_id=ID, page_id=pageId,
+ mimetype=mimetype, url=url, local_filename=local_filename
)
+ # add force+ignore
+ kwargs = {**kwargs, **data.dict()}
if not self.multiplexing_mode:
- r = self.session.request("POST", f"{self.url}/file", data=data.dict())
- if not r:
- raise RuntimeError("Add file failed. Please check provided parameters")
+ r = self.session.request("POST", f"{self.url}/file", data=kwargs)
+ if not r.ok:
+ raise RuntimeError(f"Failed to add file ({str(data)}): {r.json()}")
else:
- r = self.session.request("POST", self.url, json=MpxReq.add_file(self.ws_dir_path, data.dict()))
- if "error" in r:
- raise RuntimeError(f"Add file failed: Msg: {r['error']}")
+ r = self.session.request("POST", self.url, json=MpxReq.add_file(self.ws_dir_path, kwargs))
+ if not r.ok:
+ raise RuntimeError(f"Failed to add file ({str(data)}): {r.json()[errors]}")
return ClientSideOcrdFile(
- None, ID=file_id, fileGrp=file_grp, url=url, pageId=page_id, mimetype=mimetype,
- local_filename=local_filename
+ None, fileGrp=file_grp,
+ ID=ID, pageId=pageId,
+ url=url, mimetype=mimetype, local_filename=local_filename
)
@@ -348,6 +368,11 @@ def workspace_path(ws_dir_path: str) -> Dict:
return MpxReq.__args_wrapper(
ws_dir_path, method_type="GET", response_type="text", request_url="workspace_path", request_data={})
+ @staticmethod
+ def physical_pages(ws_dir_path: str) -> Dict:
+ """Wrap a GET request for ``physical_pages`` (response type ``dict``) for the workspace at ``ws_dir_path``."""
+ return MpxReq.__args_wrapper(
+ ws_dir_path, method_type="GET", response_type="dict", request_url="physical_pages", request_data={})
+
@staticmethod
def file_groups(ws_dir_path: str) -> Dict:
return MpxReq.__args_wrapper(
@@ -468,6 +493,10 @@ async def unique_identifier():
async def workspace_path():
return Response(content=workspace.directory, media_type="text/plain")
+ @app.get(path='/physical_pages', response_model=OcrdPageListModel)
+ async def physical_pages():
+ return {'physical_pages': workspace.mets.physical_pages}
+
@app.get(path='/file_groups', response_model=OcrdFileGroupListModel)
async def file_groups():
return {'file_groups': workspace.mets.file_groups}
@@ -507,7 +536,8 @@ async def add_file(
page_id: Optional[str] = Form(),
mimetype: str = Form(),
url: Optional[str] = Form(None),
- local_filename: Optional[str] = Form(None)
+ local_filename: Optional[str] = Form(None),
+ force: bool = Form(False),
):
"""
Add a file
@@ -519,7 +549,7 @@ async def add_file(
)
# Add to workspace
kwargs = file_resource.dict()
- workspace.add_file(**kwargs)
+ workspace.add_file(**kwargs, force=force)
return file_resource
# ------------- #
diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py
index 830341393..9e5f5aead 100644
--- a/src/ocrd/processor/base.py
+++ b/src/ocrd/processor/base.py
@@ -377,16 +377,9 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
pageId=self.page_id, fileGrp=ifg, mimetype=mimetype),
# sort by MIME type so PAGE comes before images
key=lambda file_: file_.mimetype)
- # Warn if no files found but pageId was specified because that
- # might be because of invalid page_id (range)
- if self.page_id and not files_:
- msg = (f"Could not find any files for --page-id {self.page_id} - "
- f"compare '{self.page_id}' with the output of 'orcd workspace list-page'.")
- if on_error == 'abort':
- raise ValueError(msg)
- LOG.warning(msg)
for file_ in files_:
if not file_.pageId:
+ # ignore document-global files
continue
ift = pages.setdefault(file_.pageId, [None]*len(ifgs))
if ift[i]:
@@ -431,13 +424,15 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'):
else:
LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg)
ift[i] = file_
+ # Warn if no files found but pageId was specified, because that might be due to invalid page_id (range)
+ if self.page_id and not any(pages):
+ LOG.critical(f"Could not find any files for selected pageId {self.page_id}.\ncompare '{self.page_id}' with the output of 'orcd workspace list-page'.")
ifts = list()
for page, ifiles in pages.items():
for i, ifg in enumerate(ifgs):
if not ifiles[i]:
# other fallback options?
- LOG.error('found no page %s in file group %s',
- page, ifg)
+ LOG.error(f'Found no page {page} in file group {ifg}')
if ifiles[0] or not require_first:
ifts.append(tuple(ifiles))
return ifts
diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py
index f5b601063..fb5ca1bb0 100644
--- a/src/ocrd/processor/helpers.py
+++ b/src/ocrd/processor/helpers.py
@@ -98,7 +98,7 @@ def run_processor(
ocrd_tool = processor.ocrd_tool
name = '%s v%s' % (ocrd_tool['executable'], processor.version)
- otherrole = ocrd_tool['steps'][0]
+ otherrole = ocrd_tool.get('steps', [''])[0]
logProfile = getLogger('ocrd.process.profile')
log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole)
t0_wall = perf_counter()
@@ -290,6 +290,7 @@ def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None)
'''
information_options = '''\
+ -R, --resolve-resource RESNAME Show the full path of processor resource RESNAME
-C, --show-resource RESNAME Dump the content of processor resource RESNAME
-L, --list-resources List names of processor resources
-J, --dump-json Dump tool description as JSON
diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py
index 44bbd081b..1fc040925 100644
--- a/src/ocrd/resource_manager.py
+++ b/src/ocrd/resource_manager.py
@@ -13,12 +13,16 @@
from gdown.download import get_url_from_gdrive_confirmation
from yaml import safe_load, safe_dump
+# pylint: disable=wrong-import-position
+
# https://github.com/OCR-D/core/issues/867
# https://stackoverflow.com/questions/50900727/skip-converting-entities-while-loading-a-yaml-string-using-pyyaml
import yaml.constructor
yaml.constructor.SafeConstructor.yaml_constructors[u'tag:yaml.org,2002:timestamp'] = \
yaml.constructor.SafeConstructor.yaml_constructors[u'tag:yaml.org,2002:str']
+# pylint: enable=wrong-import-position
+
from ocrd_validators import OcrdResourceListValidator
from ocrd_utils import getLogger, directory_size, get_moduledir, EXT_TO_MIME, nth_url_segment, guess_media_type, config
from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json
@@ -248,7 +252,7 @@ def _download_impl(url, filename, progress_cb=None, size=None):
if "Content-Disposition" not in r.headers:
url = get_url_from_gdrive_confirmation(r.text)
except RuntimeError as e:
- log.warning("Cannot unwrap Google Drive URL: ", e)
+ log.warning("Cannot unwrap Google Drive URL: %s", e)
with open(filename, 'wb') as f:
with requests.get(url, stream=True) as r:
r.raise_for_status()
diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py
index ff856011b..4ef59252a 100644
--- a/src/ocrd/workspace.py
+++ b/src/ocrd/workspace.py
@@ -24,6 +24,7 @@
coordinates_of_segment,
adjust_canvas_to_rotation,
adjust_canvas_to_transposition,
+ scale_coordinates,
shift_coordinates,
rotate_coordinates,
transform_coordinates,
@@ -122,7 +123,10 @@ def reload_mets(self):
"""
Reload METS from the filesystem.
"""
- self.mets = OcrdMets(filename=self.mets_target)
+ if self.is_remote:
+ self.mets.reload()
+ else:
+ self.mets = OcrdMets(filename=self.mets_target)
@deprecated_alias(pageId="page_id")
@deprecated_alias(ID="file_id")
@@ -1150,9 +1154,9 @@ def _reflect(log, name, orientation, segment_image, segment_coords, segment_xywh
# Transpose in affine coordinate transform:
# (consistent with image transposition or AlternativeImage below)
transposition = {
- 90: Image.ROTATE_90,
- 180: Image.ROTATE_180,
- 270: Image.ROTATE_270
+ 90: Image.Transpose.ROTATE_90,
+ 180: Image.Transpose.ROTATE_180,
+ 270: Image.Transpose.ROTATE_270
}.get(orientation) # no default
segment_coords['transform'] = transpose_coordinates(
segment_coords['transform'], transposition,
@@ -1220,5 +1224,5 @@ def _scale(log, name, factor, segment_image, segment_coords, segment_xywh, **kwa
segment_image = segment_image.resize((int(segment_image.width * factor),
int(segment_image.height * factor)),
# slowest, but highest quality:
- Image.BICUBIC)
+ Image.Resampling.BICUBIC)
return segment_image, segment_coords, segment_xywh
diff --git a/src/ocrd_models/__init__.py b/src/ocrd_models/__init__.py
index a89ee1dec..330fefe97 100644
--- a/src/ocrd_models/__init__.py
+++ b/src/ocrd_models/__init__.py
@@ -5,5 +5,6 @@
from .ocrd_exif import OcrdExif
from .ocrd_file import OcrdFile, ClientSideOcrdFile
from .ocrd_mets import OcrdMets
+from .ocrd_page import OcrdPage
from .ocrd_xml_base import OcrdXmlDocument
from .report import ValidationReport
diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py
index d6da3e1cd..9eedf9fa3 100644
--- a/src/ocrd_models/ocrd_mets.py
+++ b/src/ocrd_models/ocrd_mets.py
@@ -198,7 +198,7 @@ def agents(self) -> List[OcrdAgent]:
"""
return [OcrdAgent(el_agent) for el_agent in self._tree.getroot().findall('mets:metsHdr/mets:agent', NS)]
- def add_agent(self, *args, **kwargs) -> OcrdAgent:
+ def add_agent(self, **kwargs) -> OcrdAgent:
"""
Add an :py:class:`ocrd_models.ocrd_agent.OcrdAgent` to the list of agents in the ``metsHdr``.
"""
@@ -213,7 +213,7 @@ def add_agent(self, *args, **kwargs) -> OcrdAgent:
el_agent_last.addnext(el_agent)
except StopIteration:
el_metsHdr.insert(0, el_agent)
- return OcrdAgent(el_agent, *args, **kwargs)
+ return OcrdAgent(el_agent, **kwargs)
@property
def file_groups(self) -> List[str]:
@@ -598,7 +598,16 @@ def get_physical_pages(self, for_fileIds : Optional[List[str]] = None, for_pageI
If return_divs is set, returns div memory objects instead of strings of ids
"""
if for_fileIds is None and for_pageIds is None:
+ if return_divs:
+ if self._cache_flag:
+ return list(self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID].values())
+
+ return [x for x in self._tree.getroot().xpath(
+ 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]',
+ namespaces=NS)]
+
return self.physical_pages
+
# log = getLogger('ocrd.models.ocrd_mets.get_physical_pages')
if for_pageIds is not None:
ret = []
diff --git a/src/ocrd_models/ocrd_page_generateds.py b/src/ocrd_models/ocrd_page_generateds.py
index 6fef4c863..f2b7c0551 100644
--- a/src/ocrd_models/ocrd_page_generateds.py
+++ b/src/ocrd_models/ocrd_page_generateds.py
@@ -2,30 +2,28 @@
# -*- coding: utf-8 -*-
#
-# Generated Wed Nov 3 12:30:32 2021 by generateDS.py version 2.35.20.
-# Python 3.6.9 (default, Jan 26 2021, 15:33:00) [GCC 8.4.0]
+# Generated Sat Sep 7 14:17:39 2024 by generateDS.py version 2.35.20.
+# Python 3.8.17+ (heads/3.8-dirty:1663f8ba84, Aug 15 2023, 18:13:01) [GCC 8.3.0]
#
# Command line options:
# ('-f', '')
# ('--root-element', 'PcGts')
-# ('-o', 'ocrd_models/ocrd_models/ocrd_page_generateds.py')
+# ('-o', 'src/ocrd_models/ocrd_page_generateds.py')
# ('--silence', '')
# ('--export', 'write etree')
# ('--disable-generatedssuper-lookup', '')
-# ('--user-methods', 'ocrd_models/ocrd_page_user_methods.py')
+# ('--user-methods', 'src/ocrd_page_user_methods.py')
#
# Command line arguments:
-# ocrd_validators/ocrd_validators/page.xsd
+# src/ocrd_validators/page.xsd
#
# Command line:
-# /home/kba/monorepo/ocrd_all/venv/bin/generateDS -f --root-element="PcGts" -o "ocrd_models/ocrd_models/ocrd_page_generateds.py" --silence --export="write etree" --disable-generatedssuper-lookup --user-methods="ocrd_models/ocrd_page_user_methods.py" ocrd_validators/ocrd_validators/page.xsd
+# /data/ocr-d/ocrd_all/venv38/bin/generateDS -f --root-element="PcGts" -o "src/ocrd_models/ocrd_page_generateds.py" --silence --export="write etree" --disable-generatedssuper-lookup --user-methods="src/ocrd_page_user_methods.py" src/ocrd_validators/page.xsd
#
# Current working directory (os.getcwd()):
# core
#
-# type: ignore
-
from itertools import zip_longest
import os
import sys
@@ -223,7 +221,7 @@ def gds_validate_integer_list(
try:
int(value)
except (TypeError, ValueError):
- raise_parse_error(node, 'Requires sequence of integer values')
+ raise_parse_error(node, 'Requires sequence of integer valuess')
return values
def gds_format_float(self, input_data, input_name=''):
return ('%.15f' % input_data).rstrip('0')
@@ -1230,9 +1228,10 @@ def __hash__(self):
return hash(self.id)
@property
def id(self):
+ from ocrd_utils import make_xml_id
if hasattr(self, 'pcGtsId'):
return self.pcGtsId or ''
- return self.imageFilename
+ return make_xml_id(self.imageFilename)
def get_AllAlternativeImagePaths(self, page=True, region=True, line=True, word=True, glyph=True):
"""
Get all the ``pc:AlternativeImage/@filename`` paths referenced in the PAGE-XML document.
@@ -3116,9 +3115,10 @@ def __hash__(self):
return hash(self.id)
@property
def id(self):
+ from ocrd_utils import make_xml_id
if hasattr(self, 'pcGtsId'):
return self.pcGtsId or ''
- return self.imageFilename
+ return make_xml_id(self.imageFilename)
# pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring
def _region_class(self, x): # pylint: disable=unused-argument
return x.__class__.__name__.replace('RegionType', '')
@@ -3314,6 +3314,39 @@ def get_AllTextLines(self, region_order='document', respect_textline_order=True)
ret += lines if lo in ['top-to-bottom', 'left-to-right'] else list(reversed(lines))
return ret
+ def get_ReadingOrderGroups(self) -> dict:
+ """
+ Aggregate recursive ReadingOrder into a dictionary, mapping each regionRef
+ (i.e. segment `@id`) to its referring group object (i.e. one of
+
+ \b
+ - :py:class:`.RegionRefType`
+ - :py:class:`.RegionRefIndexedType`
+ - :py:class:`.OrderedGroupType`
+ - :py:class:`.OrderedGroupIndexedType`
+ - :py:class:`.UnorderedGroupType`
+ - :py:class:`.UnorderedGroupIndexedType`
+ """
+ def get_groupdict(group):
+ regionrefs = list()
+ if isinstance(group, (OrderedGroupType, OrderedGroupIndexedType)):
+ regionrefs = (group.get_RegionRefIndexed() +
+ group.get_OrderedGroupIndexed() +
+ group.get_UnorderedGroupIndexed())
+ if isinstance(group, (UnorderedGroupType, UnorderedGroupIndexedType)):
+ regionrefs = (group.get_RegionRef() +
+ group.get_OrderedGroup() +
+ group.get_UnorderedGroup())
+ refdict = {}
+ for elem in regionrefs:
+ refdict[elem.get_regionRef()] = elem
+ if not isinstance(elem, (RegionRefType, RegionRefIndexedType)):
+ refdict = {**refdict, **get_groupdict(elem)}
+ return refdict
+ ro = self.get_ReadingOrder()
+ if ro is None:
+ return {}
+ return get_groupdict(ro.get_OrderedGroup() or ro.get_UnorderedGroup())
def set_orientation(self, orientation):
"""
Set deskewing angle to given `orientation` number.
diff --git a/src/ocrd_page_user_methods.py b/src/ocrd_page_user_methods.py
index 8a2332e6e..fe22dd89a 100644
--- a/src/ocrd_page_user_methods.py
+++ b/src/ocrd_page_user_methods.py
@@ -116,6 +116,7 @@ def _add_method(class_re, method_name, file_name=None):
_add_method(r'^(PageType)$', 'set_Border'),
_add_method(r'^(CoordsType)$', 'set_points'),
_add_method(r'^(PageType)$', 'get_AllTextLines'),
+ _add_method(r'^(PageType)$', 'get_ReadingOrderGroups'),
# for some reason, pagecontent.xsd does not declare @orientation at the abstract/base RegionType:
_add_method(r'^(PageType|AdvertRegionType|MusicRegionType|MapRegionType|ChemRegionType|MathsRegionType|SeparatorRegionType|ChartRegionType|TableRegionType|GraphicRegionType|LineDrawingRegionType|ImageRegionType|TextRegionType)$', 'set_orientation'),
)
diff --git a/src/ocrd_page_user_methods/get_ReadingOrderGroups.py b/src/ocrd_page_user_methods/get_ReadingOrderGroups.py
new file mode 100644
index 000000000..e7d6c02b7
--- /dev/null
+++ b/src/ocrd_page_user_methods/get_ReadingOrderGroups.py
@@ -0,0 +1,33 @@
+def get_ReadingOrderGroups(self) -> dict:
+ """
+ Aggregate recursive ReadingOrder into a dictionary, mapping each regionRef
+ (i.e. segment `@id`) to its referring group object (i.e. one of
+
+ \b
+ - :py:class:`.RegionRefType`
+ - :py:class:`.RegionRefIndexedType`
+ - :py:class:`.OrderedGroupType`
+ - :py:class:`.OrderedGroupIndexedType`
+ - :py:class:`.UnorderedGroupType`
+ - :py:class:`.UnorderedGroupIndexedType`
+ """
+ def get_groupdict(group):
+ regionrefs = list()
+ if isinstance(group, (OrderedGroupType, OrderedGroupIndexedType)):
+ regionrefs = (group.get_RegionRefIndexed() +
+ group.get_OrderedGroupIndexed() +
+ group.get_UnorderedGroupIndexed())
+ if isinstance(group, (UnorderedGroupType, UnorderedGroupIndexedType)):
+ regionrefs = (group.get_RegionRef() +
+ group.get_OrderedGroup() +
+ group.get_UnorderedGroup())
+ refdict = {}
+ for elem in regionrefs:
+ refdict[elem.get_regionRef()] = elem
+ if not isinstance(elem, (RegionRefType, RegionRefIndexedType)):
+ refdict = {**refdict, **get_groupdict(elem)}
+ return refdict
+ ro = self.get_ReadingOrder()
+ if ro is None:
+ return {}
+ return get_groupdict(ro.get_OrderedGroup() or ro.get_UnorderedGroup())
diff --git a/src/ocrd_utils/__init__.py b/src/ocrd_utils/__init__.py
index b5bbcae12..836f01dce 100644
--- a/src/ocrd_utils/__init__.py
+++ b/src/ocrd_utils/__init__.py
@@ -13,6 +13,7 @@
:py:meth:`ocrd.workspace.Workspace.image_from_segment`.)
* :py:func:`rotate_coordinates`,
+ :py:func:`scale_coordinates`,
:py:func:`shift_coordinates`,
:py:func:`transpose_coordinates`,
:py:func:`transform_coordinates`
@@ -148,6 +149,7 @@
polygon_mask,
rotate_coordinates,
rotate_image,
+ scale_coordinates,
shift_coordinates,
transform_coordinates,
transpose_coordinates,
diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py
index 063af930c..418245643 100644
--- a/src/ocrd_utils/config.py
+++ b/src/ocrd_utils/config.py
@@ -68,14 +68,26 @@ def has_default(self, name):
raise ValueError(f"Unregistered env variable {name}")
return self._variables[name].has_default
+ def reset_defaults(self):
+ for name in self._variables:
+ try:
+ # we cannot use hasattr, because that delegates to getattr,
+ # which we override and provide defaults for (which of course
+ # cannot be removed)
+ if self.__getattribute__(name):
+ delattr(self, name)
+ except AttributeError:
+ pass
+
def describe(self, name, *args, **kwargs):
if not name in self._variables:
raise ValueError(f"Unregistered env variable {name}")
return self._variables[name].describe(*args, **kwargs)
def __getattr__(self, name):
+ # will be called if name is not accessible (has not been added directly yet)
if not name in self._variables:
- raise ValueError(f"Unregistered env variable {name}")
+ raise AttributeError(f"Unregistered env variable {name}")
var_obj = self._variables[name]
try:
raw_value = self.raw_value(name)
diff --git a/src/ocrd_utils/image.py b/src/ocrd_utils/image.py
index 3bc14e661..6f2524608 100644
--- a/src/ocrd_utils/image.py
+++ b/src/ocrd_utils/image.py
@@ -65,10 +65,10 @@ def adjust_canvas_to_transposition(size, method):
Return a numpy array of the enlarged width and height.
"""
- if method in [Image.ROTATE_90,
- Image.ROTATE_270,
- Image.TRANSPOSE,
- Image.TRANSVERSE]:
+ if method in [Image.Transpose.ROTATE_90,
+ Image.Transpose.ROTATE_270,
+ Image.Transpose.TRANSPOSE,
+ Image.Transpose.TRANSVERSE]:
size = size[::-1]
return size
@@ -348,26 +348,26 @@ def transpose_coordinates(transform, method, orig=np.array([0, 0])):
calculate the affine coordinate transform corresponding to the composition
of both transformations, which is respectively:
- - ``PIL.Image.FLIP_LEFT_RIGHT``:
+ - ``PIL.Image.Transpose.FLIP_LEFT_RIGHT``:
entails translation to the center, followed by pure reflection
about the y-axis, and subsequent translation back
- - ``PIL.Image.FLIP_TOP_BOTTOM``:
+ - ``PIL.Image.Transpose.FLIP_TOP_BOTTOM``:
entails translation to the center, followed by pure reflection
about the x-axis, and subsequent translation back
- - ``PIL.Image.ROTATE_180``:
+ - ``PIL.Image.Transpose.ROTATE_180``:
entails translation to the center, followed by pure reflection
about the origin, and subsequent translation back
- - ``PIL.Image.ROTATE_90``:
+ - ``PIL.Image.Transpose.ROTATE_90``:
entails translation to the center, followed by pure rotation
by 90° counter-clockwise, and subsequent translation back
- - ``PIL.Image.ROTATE_270``:
+ - ``PIL.Image.Transpose.ROTATE_270``:
entails translation to the center, followed by pure rotation
by 270° counter-clockwise, and subsequent translation back
- - ``PIL.Image.TRANSPOSE``:
+ - ``PIL.Image.Transpose.TRANSPOSE``:
entails translation to the center, followed by pure rotation
by 90° counter-clockwise and pure reflection about the x-axis,
and subsequent translation back
- - ``PIL.Image.TRANSVERSE``:
+ - ``PIL.Image.Transpose.TRANSVERSE``:
entails translation to the center, followed by pure rotation
by 90° counter-clockwise and pure reflection about the y-axis,
and subsequent translation back
@@ -388,13 +388,13 @@ def transpose_coordinates(transform, method, orig=np.array([0, 0])):
[0, 0, 1]])
transform = shift_coordinates(transform, -orig)
operations = {
- Image.FLIP_LEFT_RIGHT: [refly],
- Image.FLIP_TOP_BOTTOM: [reflx],
- Image.ROTATE_180: [reflx, refly],
- Image.ROTATE_90: [rot90],
- Image.ROTATE_270: [rot90, reflx, refly],
- Image.TRANSPOSE: [rot90, reflx],
- Image.TRANSVERSE: [rot90, refly]
+ Image.Transpose.FLIP_LEFT_RIGHT: [refly],
+ Image.Transpose.FLIP_TOP_BOTTOM: [reflx],
+ Image.Transpose.ROTATE_180: [reflx, refly],
+ Image.Transpose.ROTATE_90: [rot90],
+ Image.Transpose.ROTATE_270: [rot90, reflx, refly],
+ Image.Transpose.TRANSPOSE: [rot90, reflx],
+ Image.Transpose.TRANSVERSE: [rot90, refly]
}.get(method) # no default
for operation in operations:
transform = np.dot(operation, transform)
@@ -411,29 +411,29 @@ def transpose_image(image, method):
Given a PIL.Image ``image`` and a transposition mode ``method``,
apply the respective operation:
- - ``PIL.Image.FLIP_LEFT_RIGHT``:
+ - ``PIL.Image.Transpose.FLIP_LEFT_RIGHT``:
all pixels get mirrored at half the width of the image
- - ``PIL.Image.FLIP_TOP_BOTTOM``:
+ - ``PIL.Image.Transpose.FLIP_TOP_BOTTOM``:
all pixels get mirrored at half the height of the image
- - ``PIL.Image.ROTATE_180``:
+ - ``PIL.Image.Transpose.ROTATE_180``:
all pixels get mirrored at both, the width and half the height
of the image,
i.e. the image gets rotated by 180° counter-clockwise
- - ``PIL.Image.ROTATE_90``:
+ - ``PIL.Image.Transpose.ROTATE_90``:
rows become columns (but counted from the right) and
columns become rows,
i.e. the image gets rotated by 90° counter-clockwise;
width becomes height and vice versa
- - ``PIL.Image.ROTATE_270``:
+ - ``PIL.Image.Transpose.ROTATE_270``:
rows become columns and
columns become rows (but counted from the bottom),
i.e. the image gets rotated by 270° counter-clockwise;
width becomes height and vice versa
- - ``PIL.Image.TRANSPOSE``:
+ - ``PIL.Image.Transpose.TRANSPOSE``:
rows become columns and vice versa,
i.e. all pixels get mirrored at the main diagonal;
width becomes height and vice versa
- - ``PIL.Image.TRANSVERSE``:
+ - ``PIL.Image.Transpose.TRANSVERSE``:
rows become columns (but counted from the right) and
columns become rows (but counted from the bottom),
i.e. all pixels get mirrored at the opposite diagonal;
diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py
index bb771fc0c..ac2b3416a 100644
--- a/src/ocrd_utils/logging.py
+++ b/src/ocrd_utils/logging.py
@@ -211,12 +211,14 @@ def disableLogging(silent=not config.OCRD_LOGGING_DEBUG):
_initialized_flag = False
# logging.basicConfig(level=logging.CRITICAL)
# logging.disable(logging.ERROR)
- # remove all handlers for the ocrd logger
- for logger_name in ROOT_OCRD_LOGGERS:
+ # remove all handlers for the 'ocrd.' and root logger
+ for logger_name in ROOT_OCRD_LOGGERS + ['']:
for handler in logging.getLogger(logger_name).handlers[:]:
logging.getLogger(logger_name).removeHandler(handler)
for logger_name in LOGGING_DEFAULTS:
logging.getLogger(logger_name).setLevel(logging.NOTSET)
+ # Python default log level is WARNING
+ logging.root.setLevel(logging.WARNING)
# Initializing stream handlers at module level
# would cause message output in all runtime contexts,
diff --git a/src/ocrd_utils/str.py b/src/ocrd_utils/str.py
index dea3715bf..b3d3ef496 100644
--- a/src/ocrd_utils/str.py
+++ b/src/ocrd_utils/str.py
@@ -105,10 +105,11 @@ def make_xml_id(idstr: str) -> str:
ret = idstr
if not REGEX_FILE_ID.fullmatch(ret):
ret = ret.replace(':', '_')
+ ret = ret.replace('/', '_')
ret = re.sub(r'^([^a-zA-Z_])', r'id_\1', ret)
ret = re.sub(r'[^\w.-]', r'', ret)
return ret
-
+
def nth_url_segment(url, n=-1):
"""
Return the last /-delimited segment of a URL-like string
diff --git a/tests/cli/test_validate.py b/tests/cli/test_validate.py
index 36ee3e599..cc58df654 100644
--- a/tests/cli/test_validate.py
+++ b/tests/cli/test_validate.py
@@ -57,24 +57,24 @@ def test_validate_ocrd_tool(self):
json_path.write_text(OCRD_TOOL)
# normal call
- code, _, _ = self.invoke_cli(validate_cli, ['tool-json', str(json_path)])
- self.assertEqual(code, 0)
+ code, out, err = self.invoke_cli(validate_cli, ['tool-json', str(json_path)])
+ self.assertEqual(code, 0, out + err)
# relative path
with pushd_popd(tempdir):
- code, _, _ = self.invoke_cli(validate_cli, ['tool-json', 'ocrd-tool.json'])
- self.assertEqual(code, 0)
+ code, out, err = self.invoke_cli(validate_cli, ['tool-json', 'ocrd-tool.json'])
+ self.assertEqual(code, 0, out + err)
# default path
with pushd_popd(tempdir):
- code, _, _ = self.invoke_cli(validate_cli, ['tool-json'])
- self.assertEqual(code, 0)
+ code, out, err = self.invoke_cli(validate_cli, ['tool-json'])
+ self.assertEqual(code, 0, out + err)
def test_validate_parameter(self):
with TemporaryDirectory() as tempdir:
json_path = Path(tempdir, 'ocrd-tool.json')
json_path.write_text(OCRD_TOOL)
with pushd_popd(tempdir):
- code, _, _ = self.invoke_cli(validate_cli, ['parameters', 'ocrd-tool.json', 'ocrd-xyz', dumps({"baz": "foo"})])
- self.assertEqual(code, 0)
+ code, out, err = self.invoke_cli(validate_cli, ['parameters', 'ocrd-tool.json', 'ocrd-xyz', dumps({"baz": "foo"})])
+ self.assertEqual(code, 0, out + err)
def test_validate_page(self):
page_path = assets.path_to('glyph-consistency/data/OCR-D-GT-PAGE/FAULTY_GLYPHS.xml')
@@ -84,19 +84,18 @@ def test_validate_page(self):
def test_validate_tasks(self):
# simple
- code, _, _ = self.invoke_cli(validate_cli, ['tasks',
+ code, out, err = self.invoke_cli(validate_cli, ['tasks',
"sample-processor-required-param -I FOO -O OUT1 -p '{\"param1\": true}'",
"sample-processor-required-param -I FOO -O OUT2 -p '{\"param1\": true}'",
])
- self.assertEqual(code, 0)
+ self.assertEqual(code, 0, out + err)
# with workspace
code, out, err = self.invoke_cli(validate_cli, ['tasks', '--workspace', assets.path_to('kant_aufklaerung_1784/data'),
"sample-processor-required-param -I OCR-D-IMG,OCR-D-GT-PAGE -O OUT1 -p '{\"param1\": true}'",
"sample-processor-required-param -I OCR-D-IMG,OCR-D-GT-PAGE -O OUT2 -p '{\"param1\": true}'",
])
- print('code=%s out=%s err=%s' % (code, out, err))
- self.assertEqual(code, 0)
+ self.assertEqual(code, 0, out + err)
if __name__ == '__main__':
diff --git a/tests/data/__init__.py b/tests/data/__init__.py
index 93a2ea49a..c7fcfb021 100644
--- a/tests/data/__init__.py
+++ b/tests/data/__init__.py
@@ -52,9 +52,9 @@ def process(self):
file_id = make_file_id(input_file, self.output_file_grp)
# print(input_file.ID, file_id)
self.workspace.add_file(
- ID=file_id,
+ file_id=file_id,
file_grp=self.output_file_grp,
- pageId=input_file.pageId,
+ page_id=input_file.pageId,
mimetype=input_file.mimetype,
local_filename=os.path.join(self.output_file_grp, file_id),
content='CONTENT')
diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py
index 739db7625..89742a507 100644
--- a/tests/model/test_ocrd_mets.py
+++ b/tests/model/test_ocrd_mets.py
@@ -248,7 +248,7 @@ def test_file_pageid(sbb_sample_01):
def test_agent(sbb_sample_01):
beforelen = len(sbb_sample_01.agents)
- sbb_sample_01.add_agent('foo bar v0.0.1', 'OTHER', 'OTHER', 'YETOTHERSTILL')
+ sbb_sample_01.add_agent(name='foo bar v0.0.1', _type='OTHER', othertype='OTHER', role='YETOTHERSTILL')
assert len(sbb_sample_01.agents) == beforelen + 1
def test_metshdr():
diff --git a/tests/model/test_ocrd_page.py b/tests/model/test_ocrd_page.py
index 7dc130809..97335775d 100644
--- a/tests/model/test_ocrd_page.py
+++ b/tests/model/test_ocrd_page.py
@@ -460,7 +460,7 @@ def test_id():
# TODO: is this *really* desired?
# I would expect for a single Page-Element the ID is like from the top-level-Pgts-Container, not like a fileName
- assert pcgts.get_Page().id == 'OCR-D-IMG/INPUT_0017.tif'
+ assert pcgts.get_Page().id == 'OCR-D-IMG_INPUT_0017.tif'
if __name__ == '__main__':
diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py
index 784f68fc3..f2261d0ff 100644
--- a/tests/processor/test_processor.py
+++ b/tests/processor/test_processor.py
@@ -6,8 +6,9 @@
from os import environ
from tests.base import CapturingTestCase as TestCase, assets, main, copy_of_directory # pylint: disable=import-error, no-name-in-module
from tests.data import DummyProcessor, DummyProcessorWithRequiredParameters, DummyProcessorWithOutput, IncompleteProcessor
+from tests.test_mets_server import fixture_start_mets_server
-from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging
+from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging, config
from ocrd.resolver import Resolver
from ocrd.processor.base import Processor, run_processor, run_cli
@@ -28,6 +29,10 @@ def setUp(self):
self.workspace = self.resolver.workspace_from_url('mets.xml')
self.addCleanup(stack.pop_all().close)
+ def tearDown(self):
+ super().tearDown()
+ config.reset_defaults()
+
def test_incomplete_processor(self):
proc = IncompleteProcessor(None)
with self.assertRaises(NotImplementedError):
@@ -125,8 +130,8 @@ def test_run_input(self):
def test_run_output0(self):
with pushd_popd(tempdir=True) as tempdir:
ws = self.resolver.workspace_from_nothing(directory=tempdir)
- ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001')
- ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0002')
+ ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001')
+ ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0002')
run_processor(DummyProcessorWithOutput, workspace=ws,
input_file_grp="GRP1",
output_file_grp="OCR-D-OUT")
@@ -135,10 +140,10 @@ def test_run_output0(self):
def test_run_output_overwrite(self):
with pushd_popd(tempdir=True) as tempdir:
ws = self.resolver.workspace_from_nothing(directory=tempdir)
- ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001')
- ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0002')
+ ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001')
+ ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0002')
ws.overwrite_mode = True
- ws.add_file('OCR-D-OUT', mimetype=MIMETYPE_PAGE, ID='OCR-D-OUT_phys_0001', pageId='phys_0001')
+ ws.add_file('OCR-D-OUT', mimetype=MIMETYPE_PAGE, file_id='OCR-D-OUT_phys_0001', page_id='phys_0001')
ws.overwrite_mode = False
with pytest.raises(Exception) as exc:
run_processor(DummyProcessorWithOutput, workspace=ws,
@@ -242,7 +247,29 @@ class ZipTestProcessor(Processor): pass
proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id)
assert [(one, two.ID) for one, two in proc.zip_input_files(require_first=False)] == [(None, 'foobar2')]
r = self.capture_out_err()
- assert 'ERROR ocrd.processor.base - found no page phys_0001 in file group GRP1' in r.err
+ assert 'ERROR ocrd.processor.base - Found no page phys_0001 in file group GRP1' in r.err
+
+def test_run_output_metsserver(start_mets_server):
+ mets_server_url, ws = start_mets_server
+ run_processor(DummyProcessorWithOutput, workspace=ws,
+ input_file_grp="OCR-D-IMG",
+ output_file_grp="OCR-D-OUT",
+ mets_server_url=mets_server_url)
+ assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG"))
+ ws.overwrite_mode = True
+ run_processor(DummyProcessorWithOutput, workspace=ws,
+ input_file_grp="OCR-D-IMG",
+ output_file_grp="OCR-D-OUT",
+ mets_server_url=mets_server_url)
+ assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG"))
+ ws.overwrite_mode = False
+ with pytest.raises(Exception) as exc:
+ run_processor(DummyProcessorWithOutput, workspace=ws,
+ input_file_grp="OCR-D-IMG",
+ output_file_grp="OCR-D-OUT",
+ mets_server_url=mets_server_url)
+ assert "already exists" in str(exc.value)
+
if __name__ == "__main__":
main(__file__)
diff --git a/tests/test_decorators.py b/tests/test_decorators.py
index 5ab288005..c36577020 100644
--- a/tests/test_decorators.py
+++ b/tests/test_decorators.py
@@ -15,7 +15,7 @@
ocrd_loglevel,
ocrd_cli_wrap_processor,
) # pylint: disable=protected-access
-from ocrd_utils import pushd_popd, VERSION as OCRD_VERSION, disableLogging, initLogging, get_logging_config_files
+from ocrd_utils import pushd_popd, VERSION as OCRD_VERSION, disableLogging, initLogging, get_logging_config_files, config
@click.command()
@ocrd_cli_options
@@ -45,6 +45,10 @@ def setUp(self):
super().setUp()
disableLogging()
+ def tearDown(self):
+ super().tearDown()
+ config.reset_defaults()
+
def test_minimal(self):
exit_code, out, err = self.invoke_cli(cli_with_ocrd_cli_options, ['-l', 'DEBUG'])
print(out, err)
@@ -64,6 +68,7 @@ def test_loglevel_override(self):
pytest.skip(f"ocrd_logging.conf found at {get_logging_config_files()}, skipping logging test")
import logging
disableLogging()
+ assert logging.getLogger('').getEffectiveLevel() == logging.WARNING
assert logging.getLogger('ocrd').getEffectiveLevel() == logging.WARNING
initLogging()
assert logging.getLogger('ocrd').getEffectiveLevel() == logging.INFO
diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py
index 58ff6e2a9..dc94d6c56 100644
--- a/tests/test_mets_server.py
+++ b/tests/test_mets_server.py
@@ -22,13 +22,16 @@
from requests.exceptions import ConnectionError
from ocrd import Resolver, OcrdMetsServer, Workspace
-from ocrd_utils import pushd_popd, MIMETYPE_PAGE
+from ocrd_utils import pushd_popd, MIMETYPE_PAGE, initLogging, setOverrideLogLevel
-WORKSPACE_DIR = '/tmp/ocrd-mets-server'
TRANSPORTS = ['/tmp/ocrd-mets-server.sock', 'http://127.0.0.1:12345']
+initLogging()
+setOverrideLogLevel(10)
+
@fixture(scope='function', name='start_mets_server', params=TRANSPORTS)
-def fixture_start_mets_server(request) -> Iterable[Tuple[str, Workspace]]:
+def fixture_start_mets_server(request, tmpdir) -> Iterable[Tuple[str, Workspace]]:
+ tmpdir = str(tmpdir)
def _start_mets_server(*args, **kwargs):
mets_server = OcrdMetsServer(*args, **kwargs)
mets_server.startup()
@@ -39,33 +42,35 @@ def _start_mets_server(*args, **kwargs):
if exists(mets_server_url):
remove(mets_server_url)
- if exists(WORKSPACE_DIR):
- rmtree(WORKSPACE_DIR, ignore_errors=True)
+ if exists(tmpdir):
+ rmtree(tmpdir, ignore_errors=True)
- copytree(assets.path_to('SBB0000F29300010000/data'), WORKSPACE_DIR)
- workspace = Workspace(Resolver(), WORKSPACE_DIR)
+ copytree(assets.path_to('SBB0000F29300010000/data'), tmpdir)
+ workspace = Workspace(Resolver(), tmpdir)
p = Process(target=_start_mets_server, kwargs={'workspace': workspace, 'url': request.param})
p.start()
sleep(1) # sleep to start up server
- yield mets_server_url, Workspace(resolver=Resolver(), directory=WORKSPACE_DIR, mets_server_url=mets_server_url)
+ workspace_server = Workspace(Resolver(), tmpdir, mets_server_url=mets_server_url)
+ yield mets_server_url, workspace_server
p.terminate()
- rmtree(WORKSPACE_DIR, ignore_errors=True)
+ rmtree(tmpdir, ignore_errors=True)
-def add_file_server(x):
- mets_server_url, i = x
- workspace_server = Workspace(resolver=Resolver(), directory=WORKSPACE_DIR, mets_server_url=mets_server_url)
+def add_file_server(x, force=False):
+ mets_server_url, directory, i = x
+ workspace_server = Workspace(Resolver(), directory, mets_server_url=mets_server_url)
workspace_server.add_file(
+ 'FOO',
local_filename=f'local_filename{i}',
mimetype=MIMETYPE_PAGE,
page_id=f'page{i}',
- file_grp='FOO',
file_id=f'FOO_page{i}_foo{i}',
# url=f'url{i}'
+ force=force
)
def add_agent_server(x):
- mets_server_url, i = x
- workspace_server = Workspace(resolver=Resolver(), directory=WORKSPACE_DIR, mets_server_url=mets_server_url)
+ mets_server_url, directory, i = x
+ workspace_server = Workspace(Resolver(), directory, mets_server_url=mets_server_url)
workspace_server.mets.add_agent(
name=f'proc{i}',
_type='baz',
@@ -82,7 +87,10 @@ def test_mets_server_add_file(start_mets_server):
# add NO_FILES files in parallel
with Pool() as pool:
- pool.map(add_file_server, zip(repeat(mets_server_url), range(NO_FILES)))
+ pool.map(add_file_server, zip(
+ repeat(mets_server_url),
+ repeat(workspace_server.directory),
+ range(NO_FILES)))
assert set(workspace_server.mets.file_groups) == set( [
'OCR-D-IMG',
@@ -107,7 +115,7 @@ def test_mets_server_add_file(start_mets_server):
assert len(workspace_server.mets.find_all_files(fileGrp='FOO')) == NO_FILES
# not yet synced
- workspace_file = Workspace(Resolver(), WORKSPACE_DIR)
+ workspace_file = Workspace(Resolver(), workspace_server.directory)
assert len(workspace_file.mets.find_all_files(fileGrp='FOO')) == 0
# sync
@@ -116,6 +124,19 @@ def test_mets_server_add_file(start_mets_server):
assert len(workspace_file.mets.find_all_files(fileGrp='FOO')) == NO_FILES
+def test_mets_server_add_file_overwrite(start_mets_server):
+ mets_server_url, workspace_server = start_mets_server
+
+ add_file_server((mets_server_url, workspace_server.directory, 5))
+
+ assert len(workspace_server.mets.find_all_files(fileGrp='FOO')) == 1
+
+ with raises(RuntimeError, match="already exists"):
+ add_file_server((mets_server_url, workspace_server.directory, 5))
+
+ add_file_server((mets_server_url, workspace_server.directory, 5), force=True)
+ assert len(workspace_server.mets.find_all_files(fileGrp='FOO')) == 1
+
def test_mets_server_add_agents(start_mets_server):
NO_AGENTS = 30
@@ -125,13 +146,16 @@ def test_mets_server_add_agents(start_mets_server):
# add NO_AGENTS agents in parallel
with Pool() as pool:
- pool.map(add_agent_server, zip(repeat(mets_server_url), list(range(NO_AGENTS))))
+ pool.map(add_agent_server, zip(
+ repeat(mets_server_url),
+ repeat(workspace_server.directory),
+ list(range(NO_AGENTS))))
assert len(workspace_server.mets.agents) == NO_AGENTS + no_agents_before
# XXX not a tuple
assert workspace_server.mets.agents[-1].notes[0][0] == {'{https://ocr-d.de}foo': 'bar'}
- workspace_file = Workspace(Resolver(), WORKSPACE_DIR)
+ workspace_file = Workspace(Resolver(), workspace_server.directory)
assert len(workspace_file.mets.agents) == no_agents_before
# sync
@@ -142,7 +166,7 @@ def test_mets_server_add_agents(start_mets_server):
def test_mets_server_str(start_mets_server):
mets_server_url, workspace_server = start_mets_server
- workspace_server = Workspace(Resolver(), WORKSPACE_DIR, mets_server_url=mets_server_url)
+ workspace_server = Workspace(Resolver(), workspace_server.directory, mets_server_url=mets_server_url)
f = next(workspace_server.find_files())
assert str(f) == ''
a = workspace_server.mets.agents[0]
@@ -182,7 +206,7 @@ def test_mets_server_socket_stop(start_mets_server):
assert True, 'No stop conditions to test for TCP server'
else:
assert Path(mets_server_url).exists()
- assert workspace_server.mets.workspace_path == WORKSPACE_DIR
+ assert workspace_server.mets.workspace_path == workspace_server.directory
workspace_server.mets.stop()
with raises(ConnectionError):
workspace_server.mets.file_groups
@@ -236,7 +260,7 @@ def test_reload(start_mets_server : Tuple[str, Workspace]):
assert len(workspace_server.mets.find_all_files()) == 35, '35 files total'
assert len(workspace_server_copy.mets.find_all_files()) == 35, '35 files total'
- workspace_server_copy.add_file('FOO', ID='foo', mimetype='foo/bar', local_filename='mets.xml', pageId='foo')
+ workspace_server_copy.add_file('FOO', file_id='foo', mimetype='foo/bar', local_filename='mets.xml', page_id='foo')
assert len(workspace_server.mets.find_all_files()) == 35, '35 files total'
assert len(workspace_server_copy.mets.find_all_files()) == 36, '36 files total'
diff --git a/tests/test_resolver.py b/tests/test_resolver.py
index 16dfd03d5..c2575b608 100644
--- a/tests/test_resolver.py
+++ b/tests/test_resolver.py
@@ -292,20 +292,21 @@ def test_resolve_mets_arguments():
https://github.com/OCR-D/core/issues/517
"""
resolver = Resolver()
- assert resolver.resolve_mets_arguments(None, None, None, None) == (str(Path.cwd()), str(Path.cwd() / 'mets.xml'), 'mets.xml', None)
- assert resolver.resolve_mets_arguments('/', None, 'mets.xml', None) == ('/', '/mets.xml', 'mets.xml', None)
- assert resolver.resolve_mets_arguments('/foo', '/foo/foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None)
- assert resolver.resolve_mets_arguments(None, '/foo/foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None)
- assert resolver.resolve_mets_arguments('/foo', 'foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None)
- assert resolver.resolve_mets_arguments('/foo', 'http://bar/foo.xml', None, None) == ('/foo', 'http://bar/foo.xml', 'foo.xml', None)
- with pytest.raises(ValueError, match="Use either --mets or --mets-basename, not both"):
- resolver.resolve_mets_arguments('/', '/foo/bar', 'foo.xml', None)
- with pytest.raises(ValueError, match="inconsistent with --directory"):
- resolver.resolve_mets_arguments('/foo', '/bar/foo.xml', None, None)
- with pytest.warns(DeprecationWarning):
- resolver.resolve_mets_arguments('/foo', None, 'not_mets.xml', None)
- with pytest.raises(ValueError, match=r"--mets is an http\(s\) URL but no --directory was given"):
- resolver.resolve_mets_arguments(None, 'http://bar/foo.xml', None, None)
+ with pytest.warns(DeprecationWarning, match='--mets-basename'):
+ assert resolver.resolve_mets_arguments(None, None, None, None) == (str(Path.cwd()), str(Path.cwd() / 'mets.xml'), 'mets.xml', None)
+ assert resolver.resolve_mets_arguments('/', None, 'mets.xml', None) == ('/', '/mets.xml', 'mets.xml', None)
+ assert resolver.resolve_mets_arguments('/foo', '/foo/foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None)
+ assert resolver.resolve_mets_arguments(None, '/foo/foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None)
+ assert resolver.resolve_mets_arguments('/foo', 'foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None)
+ assert resolver.resolve_mets_arguments('/foo', 'http://bar/foo.xml', None, None) == ('/foo', 'http://bar/foo.xml', 'foo.xml', None)
+ with pytest.raises(ValueError, match="Use either --mets or --mets-basename, not both"):
+ resolver.resolve_mets_arguments('/', '/foo/bar', 'foo.xml', None)
+ with pytest.raises(ValueError, match="inconsistent with --directory"):
+ resolver.resolve_mets_arguments('/foo', '/bar/foo.xml', None, None)
+ with pytest.warns(DeprecationWarning):
+ resolver.resolve_mets_arguments('/foo', None, 'not_mets.xml', None)
+ with pytest.raises(ValueError, match=r"--mets is an http\(s\) URL but no --directory was given"):
+ resolver.resolve_mets_arguments(None, 'http://bar/foo.xml', None, None)
if __name__ == '__main__':
main(__file__)
diff --git a/tests/test_workspace.py b/tests/test_workspace.py
index c8df9b444..75e9b6886 100644
--- a/tests/test_workspace.py
+++ b/tests/test_workspace.py
@@ -734,7 +734,7 @@ def _fixture_metsDocumentID(tmp_path):
def test_agent_before_metsDocumentID(workspace_metsDocumentID):
report = WorkspaceValidator.validate(Resolver(), mets_url=workspace_metsDocumentID.mets_target)
assert report.is_valid
- workspace_metsDocumentID.mets.add_agent('foo bar v0.0.1', 'OTHER', 'OTHER', 'OTHER')
+ workspace_metsDocumentID.mets.add_agent(name='foo bar v0.0.1', _type='OTHER', othertype='OTHER', role='OTHER')
workspace_metsDocumentID.save_mets()
report = WorkspaceValidator.validate(Resolver(), mets_url=workspace_metsDocumentID.mets_target)
print(report.errors)
diff --git a/tests/utils/test_config.py b/tests/utils/test_config.py
index 99595a864..a94eb5d3c 100644
--- a/tests/utils/test_config.py
+++ b/tests/utils/test_config.py
@@ -57,3 +57,11 @@ def test_OCRD_PROFILE():
with temp_env_var('OCRD_PROFILE', 'some other value'):
with raises(ValueError, match="'OCRD_PROFILE' set to invalid value 'some other value'"):
config.OCRD_PROFILE
+
+def test_defaults():
+ default = config.OCRD_MAX_PROCESSOR_CACHE
+ print(type(default))
+ config.OCRD_MAX_PROCESSOR_CACHE = 2
+ assert config.OCRD_MAX_PROCESSOR_CACHE == 2
+ config.reset_defaults()
+ assert config.OCRD_MAX_PROCESSOR_CACHE == default
diff --git a/tests/validator/test_json_validator.py b/tests/validator/test_json_validator.py
index 8a8387d4b..bd756879b 100644
--- a/tests/validator/test_json_validator.py
+++ b/tests/validator/test_json_validator.py
@@ -20,18 +20,18 @@ def setUp(self):
def test_validate_string(self):
report = JsonValidator.validate('{}', {})
- self.assertTrue(report.is_valid)
+ self.assertTrue(report.is_valid, str(report.to_xml()))
def test_defaults_set(self):
obj = {'bar': 2000}
report = self.defaults_validator._validate(obj)
- self.assertTrue(report.is_valid)
+ self.assertTrue(report.is_valid, str(report.to_xml()))
self.assertEqual(obj, {'foo': 3000, 'bar': 2000})
def test_properr(self):
obj = {'bar': 100, 'quux': {}}
report = self.defaults_validator._validate(obj)
- self.assertFalse(report.is_valid)
+ self.assertFalse(report.is_valid, str(report.to_xml()))
self.assertEqual(len(report.errors), 1)
diff --git a/tests/validator/test_ocrd_tool_validator.py b/tests/validator/test_ocrd_tool_validator.py
index 3ad40d864..70d40c2f2 100644
--- a/tests/validator/test_ocrd_tool_validator.py
+++ b/tests/validator/test_ocrd_tool_validator.py
@@ -29,7 +29,7 @@ def setUp(self):
def test_smoke(self):
report = OcrdToolValidator.validate(self.ocrd_tool)
- self.assertEqual(report.is_valid, True)
+ self.assertTrue(report.is_valid, str(report.to_xml()))
def test_additional_props(self):
self.ocrd_tool['not-allowed'] = 'YUP'
@@ -48,7 +48,7 @@ def test_file_param_ok(self):
ocrd_tool = json.loads(skeleton)
ocrd_tool['tools']['ocrd-xyz']['parameters'] = {"file-param": {"description": "...", "type": "string", "content-type": 'application/rdf+xml'}}
report = OcrdToolValidator.validate(ocrd_tool)
- self.assertEqual(report.is_valid, True)
+ self.assertTrue(report.is_valid, str(report.to_xml()))
# Not restricted anymore since spec 3.3.0
# def test_file_param_bad_content_types(self):
diff --git a/tests/validator/test_page_validator.py b/tests/validator/test_page_validator.py
index 79e92d90f..e6aaff152 100644
--- a/tests/validator/test_page_validator.py
+++ b/tests/validator/test_page_validator.py
@@ -16,9 +16,10 @@ def test_validate_err(self):
PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, page_textequiv_strategy='best')
# test with deprecated name
with self.assertRaisesRegex(Exception, 'page_textequiv_strategy best not implemented'):
- PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, strategy='best')
+ with self.assertWarnsRegex(DeprecationWarning, r'use page_textequiv_strategy'):
+ PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, strategy='best')
with self.assertRaisesRegex(Exception, 'page_textequiv_consistency level superstrictest not implemented'):
- PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, page_textequiv_consistency='superstrictest', strategy='first')
+ PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, page_textequiv_consistency='superstrictest', page_textequiv_strategy='first')
def test_validate_filename(self):
report = PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME)
@@ -44,7 +45,7 @@ def test_validate_lax(self):
report = PageValidator.validate(ocrd_page=ocrd_page)
self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 26, '26 textequiv consistency errors - strict')
- report = PageValidator.validate(ocrd_page=ocrd_page, strictness='lax')
+ report = PageValidator.validate(ocrd_page=ocrd_page, page_textequiv_consistency='lax')
self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 1, '1 textequiv consistency errors - lax')
def test_validate_multi_textequiv_first(self):
@@ -89,7 +90,7 @@ def test_fix(self):
ocrd_page = parse(FAULTY_GLYPH_PAGE_FILENAME, silence=True)
report = PageValidator.validate(ocrd_page=ocrd_page)
self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 17, '17 textequiv consistency errors')
- PageValidator.validate(ocrd_page=ocrd_page, strictness='fix')
+ PageValidator.validate(ocrd_page=ocrd_page, page_textequiv_consistency='fix')
report = PageValidator.validate(ocrd_page=ocrd_page)
self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 0, 'no more textequiv consistency errors')
diff --git a/tests/validator/test_parameter_validator.py b/tests/validator/test_parameter_validator.py
index f0d9d41d2..297a14906 100644
--- a/tests/validator/test_parameter_validator.py
+++ b/tests/validator/test_parameter_validator.py
@@ -42,7 +42,7 @@ def test_default_assignment(self):
})
obj = {'baz': '23'}
report = validator.validate(obj)
- self.assertTrue(report.is_valid)
+ self.assertTrue(report.is_valid, str(report.to_xml()))
self.assertEqual(obj, {'baz': '23', "num-param": 1})
def test_min_max():
diff --git a/tests/validator/test_resource_list_validator.py b/tests/validator/test_resource_list_validator.py
index eb95d9b1e..cc63c30ea 100644
--- a/tests/validator/test_resource_list_validator.py
+++ b/tests/validator/test_resource_list_validator.py
@@ -22,8 +22,7 @@ def reslist():
def test_resource_list_validator(reslist):
report = OcrdResourceListValidator.validate(reslist)
- print(report.errors)
- assert report.is_valid == True
+ assert report.is_valid, str(report.to_xml())
if __name__ == '__main__':
main(__file__)
diff --git a/tests/validator/test_xsd_validator.py b/tests/validator/test_xsd_validator.py
index d0150338d..50b3851ff 100644
--- a/tests/validator/test_xsd_validator.py
+++ b/tests/validator/test_xsd_validator.py
@@ -37,22 +37,22 @@ def test_mets_empty(self):
def test_validate_simple_protected_str(self):
val = XsdValidator(XSD_METS_URL)
report = val._validate(self.ws.mets.to_xml())
- self.assertTrue(report.is_valid)
+ self.assertTrue(report.is_valid, str(report.to_xml()))
def test_validate_simple_protected_doc(self):
val = XsdValidator(XSD_METS_URL)
report = val._validate(self.ws.mets._tree)
- self.assertTrue(report.is_valid)
+ self.assertTrue(report.is_valid, str(report.to_xml()))
def test_validate_simple_static_doc(self):
report = XsdValidator.validate(XSD_METS_URL, self.ws.mets._tree)
- self.assertTrue(report.is_valid)
+ self.assertTrue(report.is_valid, str(report.to_xml()))
class TestXsdPageValidator(TestCase):
def test_validate_page_simple_static_doc(self):
report = XsdPageValidator.validate(simple_page)
- self.assertTrue(report.is_valid)
+ self.assertTrue(report.is_valid, str(report.to_xml()))
if __name__ == '__main__':
main(__file__)