Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

VTT writer: port SRT writer improvements (#343) #349

Merged
merged 1 commit into from
Nov 8, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions src/main/python/ttconv/vtt/paragraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

"""WebVTT paragraph"""

import re
from fractions import Fraction
from typing import Optional, Union

Expand All @@ -34,6 +35,8 @@
class VttParagraph:
"""VTT paragraph definition class"""

_EOL_SEQ_RE = re.compile(r"\n{2,}")

def __init__(self, identifier: int):
self._id: int = identifier
self._begin: Optional[ClockTime] = None
Expand Down Expand Up @@ -61,9 +64,14 @@ def get_end(self) -> Optional[ClockTime]:
"""Returns the paragraph end time code"""
return self._end

def is_only_whitespace(self):
"""Returns whether the paragraph text contains only whitespace.

Note that `str.isspace()` returns False for an empty string, so an empty
paragraph is not reported as whitespace-only by this method."""
return self._text.isspace()
def is_only_whitespace_or_empty(self):
    """Tell whether the paragraph carries no visible text.

    Returns True when the paragraph text is the empty string or is made up
    solely of whitespace characters, and False otherwise.
    """
    content = self._text

    # `str.isspace()` is False for "", so the empty case is checked first.
    if len(content) == 0:
        return True

    return content.isspace()

def normalize_eol(self):
    """Collapse and trim the line breaks of the paragraph text.

    Each run of two or more consecutive line feeds is replaced with a single
    line feed, then any leading and trailing line feed or carriage return
    characters are stripped. The paragraph text is updated in place.
    """
    # First pass: squeeze repeated "\n" down to one.
    collapsed = VttParagraph._EOL_SEQ_RE.sub("\n", self._text)

    # Second pass: drop line breaks hanging off either end.
    self._text = collapsed.strip("\n\r")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about also normalizing whitespace per #350?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@palemieux I think #350 is not critical, as xml:preserve is (at least from observation) rarely used, if at all. Its use is also not recommended because the result cannot be predicted and depends on the selection of fonts. So I think it is safe to defer that issue and prioritize other issues higher. If the review of this PR is OK, it is ready to be merged from my point of view.


def append_text(self, text: str):
"""Appends text to the paragraph"""
Expand Down
9 changes: 8 additions & 1 deletion src/main/python/ttconv/vtt/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,13 @@ def append_element(self, element: model.ContentElement, begin: Fraction, end: Op

for elem in list(element):
self.append_element(elem, begin, end)

self._paragraphs[-1].normalize_eol()

if self._paragraphs[-1].is_only_whitespace_or_empty():
LOGGER.debug("Removing empty paragraph.")
self._paragraphs.pop()
self._captions_counter -= 1

if isinstance(element, model.Span):
is_bold = style.is_element_bold(element)
Expand Down Expand Up @@ -185,7 +192,7 @@ def finish(self):
LOGGER.debug("Check and process the last VTT paragraph.")

if self._paragraphs and self._paragraphs[-1].get_end() is None:
if self._paragraphs[-1].is_only_whitespace():
if self._paragraphs[-1].is_only_whitespace_or_empty():
# if the last paragraph is empty or contains only whitespace, remove it
LOGGER.debug("Removing empty unbounded last paragraph.")
self._paragraphs.pop()
Expand Down
3 changes: 0 additions & 3 deletions src/test/python/test_vtt_paragraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,19 +38,16 @@ class VttParagraphTest(unittest.TestCase):
def test_paragraph(self):
paragraph = VttParagraph(123)

self.assertEqual("123\nNone --> None\n", str(paragraph))
self.assertRaisesRegex(ValueError, "VTT paragraph begin time code must be set.", paragraph.to_string)

paragraph.set_begin(Fraction(1234, 1))
self.assertEqual("00:20:34.000", str(paragraph.get_begin()))

self.assertEqual("123\n00:20:34.000 --> None\n", str(paragraph))
self.assertRaisesRegex(ValueError, "VTT paragraph end time code must be set.", paragraph.to_string)

paragraph.set_end(Fraction(1234, 1))
self.assertEqual("00:20:34.000", str(paragraph.get_end()))

self.assertEqual("123\n00:20:34.000 --> 00:20:34.000\n", str(paragraph))
self.assertRaisesRegex(ValueError, "VTT paragraph end time code must be greater than the begin time code.", paragraph.to_string)

paragraph.set_end(Fraction(2345, 1))
Expand Down
17 changes: 17 additions & 0 deletions src/test/python/test_vtt_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,5 +222,22 @@ def _check_output_vtt(self, model: ContentDocument, vtt: str, path: str):
else:
self.assertEqual(8, len(vtt), msg=f"Could not convert {path}")

def test_empty_isds(self):
    """Convert BasicTiming010.ttml to WebVTT and compare against the exact
    expected document, which contains two numbered cues."""
    expected_vtt = """WEBVTT

1
00:00:10.000 --> 00:00:24.400
This text must appear at 10 seconds and disappear at 24.4 seconds

2
00:00:25.000 --> 00:00:35.000
This text must appear at 25 seconds and disappear at 35 seconds
"""

    ttml_tree = et.parse('src/test/resources/ttml/imsc-tests/imsc1/ttml/timing/BasicTiming010.ttml')
    vtt_from_model = vtt_writer.from_model(imsc_reader.to_model(ttml_tree))

    self.assertEqual(vtt_from_model, expected_vtt)


if __name__ == '__main__':
unittest.main()