From ec5258d0ce3d47767d528df69285d85ff6d4c29d Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 26 Jul 2018 18:21:55 +0200 Subject: [PATCH 1/3] replace xmllint callout with lxml solution, fix #72 --- .gitignore | 1 + Makefile | 3 +-- README.rst | 1 - ocrd/utils.py | 14 ++++++-------- test/test_utils.py | 14 +++++++++++--- 5 files changed, 19 insertions(+), 14 deletions(-) diff --git a/.gitignore b/.gitignore index e8dcdde7c..c04045229 100644 --- a/.gitignore +++ b/.gitignore @@ -123,3 +123,4 @@ test/assets/00000001.jpg test/assets/test.ocrd.zip /foo sanders* +ws1 diff --git a/Makefile b/Makefile index 3bc202829..34610e8dc 100644 --- a/Makefile +++ b/Makefile @@ -41,8 +41,7 @@ deps-ubuntu: sudo apt install -y \ python3 \ python3-pip \ - libimage-exiftool-perl \ - libxml2-utils + libimage-exiftool-perl # Install python deps via pip deps-pip: diff --git a/README.rst b/README.rst index 2297bff62..68ae3894b 100644 --- a/README.rst +++ b/README.rst @@ -22,7 +22,6 @@ To bootstrap the tool, you'll need installed (Ubuntu packages): * Python (``python``) * pip (``python-pip``) * exiftool (``libimage-exiftool-perl``) -* libxml2-utils for xmllint (``libxml2-utils``) To install system-wide: diff --git a/ocrd/utils.py b/ocrd/utils.py index 081f2c285..59cf5a30b 100644 --- a/ocrd/utils.py +++ b/ocrd/utils.py @@ -12,7 +12,6 @@ xywh is what tesserocr expects/produces. """ -import subprocess import logging import re import sys @@ -72,15 +71,14 @@ def polygon_from_points(points): polygon.append([float(x_y[0]), float(x_y[1])]) return polygon -# https://stackoverflow.com/a/10133365/201318 def xmllint_format(xml): - proc = subprocess.Popen( - ['xmllint', '--format', '/dev/stdin'], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, + from lxml import etree as ET + parser = ET.XMLParser(resolve_entities=False, strip_cdata=False, remove_blank_text=True) + document = ET.fromstring(xml, parser) + return '%s\n%s' % ( + '', + ET.tostring(document, pretty_print=True) ) - output, _ = proc.communicate(xml) - return output def is_string(val): # pylint: disable=undefined-variable diff --git a/test/test_utils.py b/test/test_utils.py index 78c903e8c..97e03e606 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1,14 +1,15 @@ -from test.base import TestCase, main +from test.base import TestCase, main, assets from ocrd.utils import ( points_from_xywh, is_string, concat_padded, points_from_x0y0x1y1, xywh_from_points, - polygon_from_points + polygon_from_points, + xmllint_format ) -class TestResolver(TestCase): +class TestUtils(TestCase): # def runTest(self): @@ -44,5 +45,12 @@ def test_is_string(self): self.assertTrue(is_string('x')) self.assertTrue(is_string(u'x')) + # def test_xmllint(self): + # with open(assets.path_to('page-with-glyphs.xml'), 'rb') as f: + # xml_as_str = f.read() + # pretty_xml = xmllint_format(xml_as_str) + # self.assertEqual(xml_as_str, pretty_xml) + + if __name__ == '__main__': main() From a9704aaacadcc69cdc5dcca8cbfcb7b4d1553de3 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 26 Jul 2018 18:31:41 +0200 Subject: [PATCH 2/3] :memo: changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7826e57dd..f20263801 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,10 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +Changed: + + * Remove dependency on `xmllint` command line tool, #72, #151 + ## [0.7.0] - 2018-07-25 Changed: From e7cf5e5025f7a97540a3fa07cee736e928f67665 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 26 Jul 2018 18:46:33 +0200 Subject: [PATCH 3/3] fix xmllint_format by encoding as utf-8 and keeping fingers crossed that it will work in 27 34 and 36 --- ocrd/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd/utils.py b/ocrd/utils.py index 59cf5a30b..5754afdeb 100644 --- a/ocrd/utils.py +++ b/ocrd/utils.py @@ -75,10 +75,10 @@ def xmllint_format(xml): from lxml import etree as ET parser = ET.XMLParser(resolve_entities=False, strip_cdata=False, remove_blank_text=True) document = ET.fromstring(xml, parser) - return '%s\n%s' % ( + return ('%s\n%s' % ( '', ET.tostring(document, pretty_print=True) - ) + )).encode('utf-8') def is_string(val): # pylint: disable=undefined-variable