fix: Split only on valid utf-8 boundary (#369)

* fix: Split only on valid utf-8 boundary * refact * make CI happy Signed-off-by: rany <rany2@riseup.net> --------- Signed-off-by: rany <rany2@riseup.net> Co-authored-by: rany <rany2@riseup.net>
2025-04-15 20:13:28 +08:00
parent ecf50b916e
commit 7aabaaf480
1 changed files with 127 additions and 31 deletions
--- a/src/edge_tts/communicate.py
+++ b/src/edge_tts/communicate.py
@@ -99,20 +99,114 @@ def connect_id() -> str:
    return str(uuid.uuid4()).replace("-", "")
 def _find_last_newline_or_space_within_limit(text: bytes, limit: int) -> int:
    """
    Finds the index of the rightmost preferred split character (newline or space)
    within the initial `limit` bytes of the text.
    This helps find a natural word or sentence boundary for splitting, prioritizing
    newlines over spaces.
    Args:
        text (bytes): The byte string to search within.
        limit (int): The maximum index (exclusive) to search up to.
    Returns:
        int: The index of the last found newline or space within the limit,
             or -1 if neither is found in that range.
    """
    # Prioritize finding a newline character
    split_at = text.rfind(b"\n", 0, limit)
    # If no newline is found, search for a space
    if split_at < 0:
        split_at = text.rfind(b" ", 0, limit)
    return split_at
 def _find_safe_utf8_split_point(text_segment: bytes) -> int:
    """
    Finds the rightmost possible byte index such that the
    segment `text_segment[:index]` is a valid UTF-8 sequence.
    This prevents splitting in the middle of a multi-byte UTF-8 character.
    Args:
        text_segment (bytes): The byte segment being considered for splitting.
    Returns:
        int: The index of the safe split point. Returns 0 if no valid split
             point is found (e.g., if the first byte is part of a multi-byte
             sequence longer than the limit allows).
    """
    split_at = len(text_segment)
    while split_at > 0:
        try:
            text_segment[:split_at].decode("utf-8")
            # Found the largest valid UTF-8 sequence
            return split_at
        except UnicodeDecodeError:
            # The byte at split_at-1 is part of an incomplete multi-byte char, try earlier
            split_at -= 1
    return split_at
 def _adjust_split_point_for_xml_entity(text: bytes, split_at: int) -> int:
    """
    Adjusts a proposed split point backward to prevent splitting inside an XML entity.
    For example, if `text` is `b"this &amp; that"` and `split_at` falls between
    `&` and `;`, this function moves `split_at` to the index before `&`.
    Args:
        text (bytes): The text segment being considered.
        split_at (int): The proposed split point index, determined by whitespace
                        or UTF-8 safety.
    Returns:
        int: The adjusted split point index. It will be moved to the '&'
             if an unterminated entity is detected right before the original `split_at`.
             Otherwise, the original `split_at` is returned.
    """
    while split_at > 0 and b"&" in text[:split_at]:
        ampersand_index = text.rindex(b"&", 0, split_at)
        # Check if a semicolon exists between the ampersand and the split point
        if text.find(b";", ampersand_index, split_at) != -1:
            # Found a terminated entity (like &amp;), safe to break at original split_at
            break
        # Ampersand is not terminated before split_at, move split_at to it
        split_at = ampersand_index
    return split_at
 def split_text_by_byte_length(
    text: Union[str, bytes], byte_length: int
 ) -> Generator[bytes, None, None]:
    """
-    Splits a string into a list of strings of a given byte length
+    Splits text into chunks, each not exceeding a maximum byte length.
-    while attempting to keep words together. This function assumes
+
-    text will be inside of an XML tag.
+    This function prioritizes splitting at natural boundaries (newlines, spaces)
    while ensuring that:
    1. No chunk exceeds `byte_length` bytes.
    2. Chunks do not end with an incomplete UTF-8 multi-byte character.
    3. Chunks do not split XML entities (like `&amp;`) in the middle.
    Args:
-        text (str or bytes): The string to be split. If bytes, it must be UTF-8 encoded.
+        text (str or bytes): The input text. If str, it's encoded to UTF-8.
-        byte_length (int): The maximum byte length of each string in the list.
+        byte_length (int): The maximum allowed byte length for any yielded chunk.
                           Must be positive.
-    Yield:
+    Yields:
-        bytes: The next string in the list.
+        bytes: Text chunks (UTF-8 encoded, stripped of leading/trailing whitespace)
               that conform to the byte length and integrity constraints.
    Raises:
        TypeError: If `text` is not str or bytes.
        ValueError: If `byte_length` is not positive, or if a split point
                    cannot be determined (e.g., due to extremely small byte_length
                    relative to character/entity sizes).
    """
    if isinstance(text, str):
        text = text.encode("utf-8")
@@ -123,35 +217,37 @@ def split_text_by_byte_length(
        raise ValueError("byte_length must be greater than 0")
    while len(text) > byte_length:
-        # Find the last space in the string
+        # Find the initial split point based on whitespace or UTF-8 boundary
-        split_at = text.rfind(b" ", 0, byte_length)
+        split_at = _find_last_newline_or_space_within_limit(text, byte_length)
-        # If no space found, split_at is byte_length
+        if split_at < 0:
-        split_at = split_at if split_at != -1 else byte_length
+            ## No newline or space found, so we need to find a safe UTF-8 split point
            split_at = _find_safe_utf8_split_point(text)
-        # Verify all & are terminated with a ;
+        # Adjust the split point to avoid cutting in the middle of an xml entity, such as '&amp;'
-        while b"&" in text[:split_at]:
+        split_at = _adjust_split_point_for_xml_entity(text, split_at)
            ampersand_index = text.rindex(b"&", 0, split_at)
            if text.find(b";", ampersand_index, split_at) != -1:
                break
-            split_at = ampersand_index - 1
+        if split_at < 0:
-            if split_at < 0:
+            # This should not happen if byte_length is reasonable,
-                raise ValueError("Maximum byte length is too small or invalid text")
+            # but guards against edge cases.
-            if split_at == 0:
+            raise ValueError(
-                break
+                "Maximum byte length is too small or "
                "invalid text structure near '&' or invalid UTF-8"
            )
-        # Append the string to the list
+        # Yield the chunk
-        new_text = text[:split_at].strip()
+        chunk = text[:split_at].strip()
-        if new_text:
+        if chunk:
-            yield new_text
+            yield chunk
        if split_at == 0:
            split_at = 1
        text = text[split_at:]
-    new_text = text.strip()
+        # Prepare for the next iteration
-    if new_text:
+        # If split_at became 0 after adjustment, advance by 1 to avoid infinite loop
-        yield new_text
+        text = text[split_at if split_at > 0 else 1 :]
    # Yield the remaining part
    remaining_chunk = text.strip()
    if remaining_chunk:
        yield remaining_chunk
 def mkssml(tc: TTSConfig, escaped_text: Union[str, bytes]) -> str: