Skip to content

Commit

Permalink
fixing issue #169
Browse files Browse the repository at this point in the history
  • Loading branch information
Mathieu Bernard committed Jun 28, 2024
1 parent d9f9ed2 commit bae418c
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 8 deletions.
2 changes: 1 addition & 1 deletion phonemizer/backend/espeak/espeak.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def _phonemize_postprocess(self, phonemized, punctuation_marks, separator: Separ
text = phonemized[0]
switches = phonemized[1]

self._words_mismatch.count_phonemized(text)
self._words_mismatch.count_phonemized(text, separator)
self._lang_switch.warning(switches)

phonemized = super()._phonemize_postprocess(text, punctuation_marks, separator, strip)
Expand Down
19 changes: 14 additions & 5 deletions phonemizer/backend/espeak/words_mismatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@
from logging import Logger
from typing import List, Tuple

from typing_extensions import TypeAlias, Literal
from typing_extensions import TypeAlias, Literal, Union

from phonemizer.separator import Separator


WordMismatch: TypeAlias = Literal["warn", "ignore"]

Expand Down Expand Up @@ -58,10 +61,16 @@ def __init__(self, logger: Logger):
self._count_phn = []

@classmethod
def _count_words(
        cls,
        text: List[str],
        wordsep: Union[str, re.Pattern] = _RE_SPACES) -> List[int]:
    """Return the number of words contained in each line of `text`

    Each line is stripped of surrounding whitespace, split on `wordsep`,
    and the non-empty pieces are counted.

    Parameters
    ----------
    text : the lines to count words in
    wordsep : the word delimiter. A compiled pattern is used as is. A
        plain string is matched literally, so regex metacharacters it
        contains (such as '|') carry no special meaning.

    Returns
    -------
    One word count per line of `text`, in the same order.

    """
    if not isinstance(wordsep, re.Pattern):
        # escape the string so a separator like '|' is not read as a regex
        # alternation, and compile it once for all the lines
        wordsep = re.compile(re.escape(wordsep))

    # empty pieces (leading/trailing/consecutive separators) are not words
    return [
        sum(1 for word in wordsep.split(line.strip()) if word)
        for line in text]

def _mismatched_lines(self) -> List[Tuple[int, int, int]]:
Expand Down Expand Up @@ -93,9 +102,9 @@ def count_text(self, text: List[str]):
"""Stores the number of words in each input line"""
self._count_txt = self._count_words(text)

def count_phonemized(self, text: List[str], separator: Separator):
    """Stores the number of words in each output line

    The words of each phonemized line are delimited by `separator.word`,
    which is forwarded to `_count_words` as the word delimiter. The
    resulting counts are stored in `self._count_phn`.

    """
    self._count_phn = self._count_words(text, separator.word)

@abc.abstractmethod
def process(self, text: List[str]) -> List[str]:
Expand Down
22 changes: 20 additions & 2 deletions test/test_espeak_word_mismatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,11 @@

import pytest

from phonemizer.backend.espeak.words_mismatch import Ignore
import re

from phonemizer import phonemize
from phonemizer.backend.espeak.words_mismatch import Ignore
from phonemizer.separator import Separator, default_separator


@pytest.fixture
Expand All @@ -16,7 +19,8 @@ def text():

def test_count_words():
# pylint: disable=protected-access
count_words = Ignore._count_words
count_words = lambda phn: Ignore._count_words(
phn, wordsep=default_separator.word)
assert count_words(['']) == [0]
assert count_words(['a']) == [1]
assert count_words(['aaa']) == [1]
Expand Down Expand Up @@ -59,3 +63,17 @@ def test_mismatch(caplog, text, mode):
'words count mismatch on line 3 (expected 4 words but get 3)'
in messages)
assert 'words count mismatch on 67.0% of the lines (2/3)' in messages


# from https://github.com/bootphon/phonemizer/issues/169
def test_custom_separator(caplog):
    """Phonemizing with a custom word separator logs no message"""
    custom = Separator(word='|', phone=' ')
    phn = phonemize(
        'try',
        backend='espeak',
        language='en-us',
        separator=custom,
        words_mismatch='warn')
    assert phn == 't ɹ aɪ |'

    # with words_mismatch='warn' a miscounted line would be logged, so the
    # captured log must stay empty
    messages = [record[2] for record in caplog.record_tuples]
    assert not messages

0 comments on commit bae418c

Please sign in to comment.