diff --git a/.gitignore b/.gitignore index e8dcdde7c..c04045229 100644 --- a/.gitignore +++ b/.gitignore @@ -123,3 +123,4 @@ test/assets/00000001.jpg test/assets/test.ocrd.zip /foo sanders* +ws1 diff --git a/CHANGELOG.md b/CHANGELOG.md index 7826e57dd..f20263801 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,10 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +Changed: + + * Remove dependency on `xmllint` command line tool, #72, #151 + ## [0.7.0] - 2018-07-25 Changed: diff --git a/Makefile b/Makefile index 3bc202829..34610e8dc 100644 --- a/Makefile +++ b/Makefile @@ -41,8 +41,7 @@ deps-ubuntu: sudo apt install -y \ python3 \ python3-pip \ - libimage-exiftool-perl \ - libxml2-utils + libimage-exiftool-perl # Install python deps via pip deps-pip: diff --git a/README.rst b/README.rst index 2297bff62..68ae3894b 100644 --- a/README.rst +++ b/README.rst @@ -22,7 +22,6 @@ To bootstrap the tool, you'll need installed (Ubuntu packages): * Python (``python``) * pip (``python-pip``) * exiftool (``libimage-exiftool-perl``) -* libxml2-utils for xmllint (``libxml2-utils``) To install system-wide: diff --git a/ocrd/utils.py b/ocrd/utils.py index 081f2c285..5754afdeb 100644 --- a/ocrd/utils.py +++ b/ocrd/utils.py @@ -12,7 +12,6 @@ xywh is what tesserocr expects/produces. 
def xmllint_format(xml):
    """
    Pretty-print an XML document (in-process stand-in for ``xmllint --format``).

    Args:
        xml (bytes): the serialized XML document to reformat.

    Returns:
        bytes: a UTF-8 encoded XML declaration line followed by the
        re-indented serialization of the document's root element.

    Raises:
        lxml.etree.XMLSyntaxError: if ``xml`` is not well-formed.
    """
    from lxml import etree as ET
    # remove_blank_text drops ignorable whitespace so that pretty_print can
    # re-indent the tree; entities stay unresolved and CDATA sections are kept.
    parser = ET.XMLParser(resolve_entities=False, strip_cdata=False, remove_blank_text=True)
    document = ET.fromstring(xml, parser)
    # tostring() returns bytes. Interpolating those into a text template with
    # '%s' would embed the "b'...'" repr on Python 3, so the pieces are joined
    # as bytes instead. The declaration is written explicitly (and lxml's own
    # is suppressed) so the header is fixed and matches the UTF-8 payload.
    pretty = ET.tostring(document, pretty_print=True, encoding='UTF-8', xml_declaration=False)
    return b'<?xml version="1.0" encoding="UTF-8"?>\n' + pretty