From 9ae46c8360303417489d2c1071f29972cd8ab171 Mon Sep 17 00:00:00 2001
From: SWivid
Date: Fri, 28 Nov 2025 13:08:07 +0000
Subject: [PATCH] Replace jieba pkg with rjieba - a jieba-rs Python binding

---
 pyproject.toml                                        | 4 ++--
 src/f5_tts/model/utils.py                             | 8 ++------
 src/f5_tts/runtime/triton_trtllm/Dockerfile.server    | 2 +-
 .../triton_trtllm/model_repo_f5_tts/f5_tts/1/model.py | 4 ++--
 src/f5_tts/train/datasets/prepare_emilia.py           | 2 +-
 src/f5_tts/train/datasets/prepare_wenetspeech4tts.py  | 2 +-
 6 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 7e7ed45..a602db3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "f5-tts"
-version = "1.1.9"
+version = "1.1.10"
 description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
 readme = "README.md"
 license = {text = "MIT License"}
@@ -22,13 +22,13 @@ dependencies = [
     "ema_pytorch>=0.5.2",
     "gradio>=5.0.0",
     "hydra-core>=1.3.0",
-    "jieba",
     "librosa",
     "matplotlib",
     "numpy<=1.26.4; python_version<='3.10'",
     "pydantic<=2.10.6",
     "pydub",
     "pypinyin",
+    "rjieba",
     "safetensors",
     "soundfile",
     "tomli",
diff --git a/src/f5_tts/model/utils.py b/src/f5_tts/model/utils.py
index cd5b3a0..ff34956 100644
--- a/src/f5_tts/model/utils.py
+++ b/src/f5_tts/model/utils.py
@@ -7,7 +7,7 @@ import random
 from collections import defaultdict
 from importlib.resources import files
 
-import jieba
+import rjieba
 import torch
 from pypinyin import Style, lazy_pinyin
 from torch.nn.utils.rnn import pad_sequence
@@ -146,10 +146,6 @@ def get_tokenizer(dataset_name, tokenizer: str = "pinyin"):
 
 
 def convert_char_to_pinyin(text_list, polyphone=True):
-    if jieba.dt.initialized is False:
-        jieba.default_logger.setLevel(50)  # CRITICAL
-        jieba.initialize()
-
     final_text_list = []
     custom_trans = str.maketrans(
         {";": ",", "“": '"', "”": '"', "‘": "'", "’": "'"}
@@ -163,7 +159,7 @@
     for text in text_list:
         char_list = []
         text = text.translate(custom_trans)
-        for seg in jieba.cut(text):
+        for seg in rjieba.cut(text):
             seg_byte_len = len(bytes(seg, "UTF-8"))
             if seg_byte_len == len(seg):  # if pure alphabets and symbols
                 if char_list and seg_byte_len > 1 and char_list[-1] not in " :'\"":
diff --git a/src/f5_tts/runtime/triton_trtllm/Dockerfile.server b/src/f5_tts/runtime/triton_trtllm/Dockerfile.server
index 861e266..dd176a5 100644
--- a/src/f5_tts/runtime/triton_trtllm/Dockerfile.server
+++ b/src/f5_tts/runtime/triton_trtllm/Dockerfile.server
@@ -1,3 +1,3 @@
 FROM nvcr.io/nvidia/tritonserver:24.12-py3
-RUN pip install tritonclient[grpc] tensorrt-llm==0.16.0 torchaudio==2.5.1 jieba pypinyin librosa vocos
+RUN pip install tritonclient[grpc] tensorrt-llm==0.16.0 torchaudio==2.5.1 rjieba pypinyin librosa vocos
 WORKDIR /workspace
\ No newline at end of file
diff --git a/src/f5_tts/runtime/triton_trtllm/model_repo_f5_tts/f5_tts/1/model.py b/src/f5_tts/runtime/triton_trtllm/model_repo_f5_tts/f5_tts/1/model.py
index b1115a3..0001937 100644
--- a/src/f5_tts/runtime/triton_trtllm/model_repo_f5_tts/f5_tts/1/model.py
+++ b/src/f5_tts/runtime/triton_trtllm/model_repo_f5_tts/f5_tts/1/model.py
@@ -26,7 +26,7 @@
 import json
 import os
 
-import jieba
+import rjieba
 import torch
 import torchaudio
 import triton_python_backend_utils as pb_utils
@@ -66,7 +66,7 @@ def convert_char_to_pinyin(reference_target_texts_list, polyphone=True):
     for text in reference_target_texts_list:
         char_list = []
         text = text.translate(custom_trans)
-        for seg in jieba.cut(text):
+        for seg in rjieba.cut(text):
             seg_byte_len = len(bytes(seg, "UTF-8"))
             if seg_byte_len == len(seg):  # if pure alphabets and symbols
                 if char_list and seg_byte_len > 1 and char_list[-1] not in " :'\"":
diff --git a/src/f5_tts/train/datasets/prepare_emilia.py b/src/f5_tts/train/datasets/prepare_emilia.py
index 7c6b805..1f92fb8 100644
--- a/src/f5_tts/train/datasets/prepare_emilia.py
+++ b/src/f5_tts/train/datasets/prepare_emilia.py
@@ -225,5 +225,5 @@ if __name__ == "__main__":
 
     # bad zh asr cnt 230435 (samples)
     # bad eh asr cnt 37217 (samples)
-    # vocab size may be slightly different due to jieba tokenizer and pypinyin (e.g. way of polyphoneme)
+    # vocab size may be slightly different due to rjieba tokenizer and pypinyin (e.g. way of polyphoneme)
     # please be careful if using pretrained model, make sure the vocab.txt is same
diff --git a/src/f5_tts/train/datasets/prepare_wenetspeech4tts.py b/src/f5_tts/train/datasets/prepare_wenetspeech4tts.py
index 6498421..a598966 100644
--- a/src/f5_tts/train/datasets/prepare_wenetspeech4tts.py
+++ b/src/f5_tts/train/datasets/prepare_wenetspeech4tts.py
@@ -122,5 +122,5 @@ if __name__ == "__main__":
 
     # - - 1459 (polyphone)
     # char vocab size 5264 5219 5042
-    # vocab size may be slightly different due to jieba tokenizer and pypinyin (e.g. way of polyphoneme)
+    # vocab size may be slightly different due to rjieba tokenizer and pypinyin (e.g. way of polyphoneme)
     # please be careful if using pretrained model, make sure the vocab.txt is same
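
Note on the swapped-in tokenizer (not part of the patch itself): rjieba, the
jieba-rs binding, is expected to return a list of segment strings from cut()
and needs no explicit initialization, which is why the jieba.dt.initialized
guard in convert_char_to_pinyin could simply be deleted rather than ported.
A minimal sketch of the call this patch relies on, assuming rjieba is
installed from PyPI; exact segment boundaries may differ slightly from the
pure-Python jieba, which is what the vocab-size caveats above warn about:

    import rjieba

    # The Rust-backed tokenizer loads its dictionary on first use; there is
    # no equivalent of jieba.initialize() or jieba.default_logger to silence.
    # rjieba.cut returns segment strings, so the existing
    # `for seg in rjieba.cut(text)` loops work unchanged.
    segments = rjieba.cut("这是一段混合文本, with some English.")
    print(segments)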