diff --git a/src/main/python/ttconv/vtt/paragraph.py b/src/main/python/ttconv/vtt/paragraph.py index 748ee8e6..088a891e 100644 --- a/src/main/python/ttconv/vtt/paragraph.py +++ b/src/main/python/ttconv/vtt/paragraph.py @@ -25,6 +25,7 @@ """WebVTT paragraph""" +import re from fractions import Fraction from typing import Optional, Union @@ -34,6 +35,8 @@ class VttParagraph: """VTT paragraph definition class""" + _EOL_SEQ_RE = re.compile(r"\n{2,}") + def __init__(self, identifier: int): self._id: int = identifier self._begin: Optional[ClockTime] = None @@ -61,9 +64,14 @@ def get_end(self) -> Optional[ClockTime]: """Returns the paragraph end time code""" return self._end - def is_only_whitespace(self): - """Returns whether the paragraph tex contains only whitespace""" - return self._text.isspace() + def is_only_whitespace_or_empty(self): + """Returns whether the paragraph text contains only whitespace or is empty""" + return len(self._text) == 0 or self._text.isspace() + + def normalize_eol(self): + """Remove line breaks at the beginning and end of the paragraph, and replace + line break sequences with a single line break""" + self._text = VttParagraph._EOL_SEQ_RE.sub("\n", self._text).strip("\n\r") def append_text(self, text: str): """Appends text to the paragraph""" diff --git a/src/main/python/ttconv/vtt/writer.py b/src/main/python/ttconv/vtt/writer.py index 36c92718..e9419c6f 100644 --- a/src/main/python/ttconv/vtt/writer.py +++ b/src/main/python/ttconv/vtt/writer.py @@ -102,6 +102,13 @@ def append_element(self, element: model.ContentElement, begin: Fraction, end: Op for elem in list(element): self.append_element(elem, begin, end) + + self._paragraphs[-1].normalize_eol() + + if self._paragraphs[-1].is_only_whitespace_or_empty(): + LOGGER.debug("Removing empty paragraph.") + self._paragraphs.pop() + self._captions_counter -= 1 if isinstance(element, model.Span): is_bold = style.is_element_bold(element) @@ -185,7 +192,7 @@ def finish(self): LOGGER.debug("Check and process the last VTT paragraph.") if self._paragraphs and self._paragraphs[-1].get_end() is None: - if self._paragraphs[-1].is_only_whitespace(): + if self._paragraphs[-1].is_only_whitespace_or_empty(): # if the last paragraph contains only whitespace, remove it LOGGER.debug("Removing empty unbounded last paragraph.") self._paragraphs.pop() diff --git a/src/test/python/test_vtt_paragraph.py b/src/test/python/test_vtt_paragraph.py index 2537a12b..d055c30f 100644 --- a/src/test/python/test_vtt_paragraph.py +++ b/src/test/python/test_vtt_paragraph.py @@ -38,19 +38,16 @@ class VttParagraphTest(unittest.TestCase): def test_paragraph(self): paragraph = VttParagraph(123) - self.assertEqual("123\nNone --> None\n", str(paragraph)) self.assertRaisesRegex(ValueError, "VTT paragraph begin time code must be set.", paragraph.to_string) paragraph.set_begin(Fraction(1234, 1)) self.assertEqual("00:20:34.000", str(paragraph.get_begin())) - self.assertEqual("123\n00:20:34.000 --> None\n", str(paragraph)) self.assertRaisesRegex(ValueError, "VTT paragraph end time code must be set.", paragraph.to_string) paragraph.set_end(Fraction(1234, 1)) self.assertEqual("00:20:34.000", str(paragraph.get_end())) - self.assertEqual("123\n00:20:34.000 --> 00:20:34.000\n", str(paragraph)) self.assertRaisesRegex(ValueError, "VTT paragraph end time code must be greater than the begin time code.", paragraph.to_string) paragraph.set_end(Fraction(2345, 1)) diff --git a/src/test/python/test_vtt_writer.py b/src/test/python/test_vtt_writer.py index 2525b0e1..d4a752f9 100644 --- a/src/test/python/test_vtt_writer.py +++ b/src/test/python/test_vtt_writer.py @@ -222,5 +222,22 @@ def _check_output_vtt(self, model: ContentDocument, vtt: str, path: str): else: self.assertEqual(8, len(vtt), msg=f"Could not convert {path}") + def test_empty_isds(self): + tree = et.parse('src/test/resources/ttml/imsc-tests/imsc1/ttml/timing/BasicTiming010.ttml') + doc = imsc_reader.to_model(tree) + srt_from_model = vtt_writer.from_model(doc) + + self.assertEqual(srt_from_model, """WEBVTT + +1 +00:00:10.000 --> 00:00:24.400 +This text must appear at 10 seconds and disappear at 24.4 seconds + +2 +00:00:25.000 --> 00:00:35.000 +This text must appear at 25 seconds and disappear at 35 seconds +""") + + if __name__ == '__main__': unittest.main()