Skip to content

Commit

Permalink
Merge pull request #151 from kba/replace-xmllint
Browse files (browse the repository at this point in the history)
replace xmllint callout with lxml solution, fix #72
  • Loading branch information
kba authored Jul 26, 2018
2 parents fdc6051 + e7cf5e5 commit a8c14f8
Show file tree
Hide file tree
Showing 6 changed files with 24 additions and 15 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -123,3 +123,4 @@ test/assets/00000001.jpg
test/assets/test.ocrd.zip
/foo
sanders*
ws1
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ Versioned according to [Semantic Versioning](http://semver.org/).

## Unreleased

Changed:

* Remove dependency on `xmllint` command line tool, #72, #151

## [0.7.0] - 2018-07-25

Changed:
Expand Down
3 changes: 1 addition & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,7 @@ deps-ubuntu:
sudo apt install -y \
python3 \
python3-pip \
libimage-exiftool-perl \
libxml2-utils
libimage-exiftool-perl

# Install python deps via pip
deps-pip:
Expand Down
1 change: 0 additions & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ To bootstrap the tool, you'll need installed (Ubuntu packages):
* Python (``python``)
* pip (``python-pip``)
* exiftool (``libimage-exiftool-perl``)
* libxml2-utils for xmllint (``libxml2-utils``)

To install system-wide:

Expand Down
16 changes: 7 additions & 9 deletions ocrd/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
xywh is what tesserocr expects/produces.
"""
import subprocess
import logging
import re
import sys
Expand Down Expand Up @@ -72,15 +71,14 @@ def polygon_from_points(points):
polygon.append([float(x_y[0]), float(x_y[1])])
return polygon

# https://stackoverflow.com/a/10133365/201318
def xmllint_format(xml):
proc = subprocess.Popen(
['xmllint', '--format', '/dev/stdin'],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
)
output, _ = proc.communicate(xml)
return output
from lxml import etree as ET
parser = ET.XMLParser(resolve_entities=False, strip_cdata=False, remove_blank_text=True)
document = ET.fromstring(xml, parser)
return ('%s\n%s' % (
'<?xml version="1.0" encoding="UTF-8"?>',
ET.tostring(document, pretty_print=True)
)).encode('utf-8')

def is_string(val):
# pylint: disable=undefined-variable
Expand Down
14 changes: 11 additions & 3 deletions test/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
from test.base import TestCase, main
from test.base import TestCase, main, assets
from ocrd.utils import (
points_from_xywh,
is_string,
concat_padded,
points_from_x0y0x1y1,
xywh_from_points,
polygon_from_points
polygon_from_points,
xmllint_format
)

class TestResolver(TestCase):
class TestUtils(TestCase):

# def runTest(self):

Expand Down Expand Up @@ -44,5 +45,12 @@ def test_is_string(self):
self.assertTrue(is_string('x'))
self.assertTrue(is_string(u'x'))

# def test_xmllint(self):
# with open(assets.path_to('page-with-glyphs.xml'), 'rb') as f:
# xml_as_str = f.read()
# pretty_xml = xmllint_format(xml_as_str)
# self.assertEqual(xml_as_str, pretty_xml)


if __name__ == '__main__':
main()

0 comments on commit a8c14f8

Please sign in to comment.